author | hrs <hrs@FreeBSD.org> | 2011-06-16 12:24:02 +0000
---|---|---
committer | hrs <hrs@FreeBSD.org> | 2011-06-16 12:24:02 +0000
commit | 7afd303ca9e45353e1af220a988ddac723fde355 |
tree | 356c3cc3903b4ba969f103c2c6ccc21fc748bdaa /sys |
parent | b2a6f5600367e61c439c79db2317d4619960e2ce |
parent | cc8fba8d7c933e32931cfc80150a2ed465e70825 |
Merge from HEAD@222977.
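Among the CAM changes merged below (sys/cam/scsi/scsi_all.c and scsi_all.h), the SAS-specific scsi_get_sas_addr() is replaced by a generic VPD device-identification walker, scsi_get_devid(), together with predicate helpers and a scsi_devid_match() comparator. The snippet that follows is a usage sketch only, assuming nothing beyond the interfaces added in the diff; the callers and the id_page/page_len buffers are hypothetical.

```c
/*
 * Usage sketch only; assumes the interfaces added by this merge.
 * The VPD device-identification pages and their lengths are supplied
 * by hypothetical callers.
 */
#include <sys/types.h>
#include <cam/scsi/scsi_all.h>

/* Find the identifier of a SAS target-port NAA descriptor, or NULL. */
static uint8_t *
find_sas_target_id(struct scsi_vpd_device_id *id_page, uint32_t page_len)
{
	return (scsi_get_devid(id_page, page_len, scsi_devid_is_sas_target));
}

/* Report whether two devices share any device-ID descriptor. */
static int
same_device(struct scsi_vpd_device_id *a, uint32_t a_len,
    struct scsi_vpd_device_id *b, uint32_t b_len)
{
	/* scsi_devid_match() takes raw descriptor lists, headers stripped. */
	return (scsi_devid_match((uint8_t *)a->desc_list,
	    a_len - SVPD_DEVICE_ID_HDR_LEN,
	    (uint8_t *)b->desc_list,
	    b_len - SVPD_DEVICE_ID_HDR_LEN) == 0);
}
```

This is the same pairing xptdevicematch() uses in the cam_xpt.c hunk below when a DEV_MATCH_DEVID pattern is supplied.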
Diffstat (limited to 'sys')
81 files changed, 4697 insertions, 1500 deletions
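Much of this merge plumbs GEOM attribute queries into CAM: the XPT_GDEV_ADVINFO CCB becomes XPT_DEV_ADVINFO, a new xpt_getattr() helper is added, and ada(4)/da(4) gain d_getattr methods, all visible in the diffs below. As a minimal sketch, not part of the merge, here is how a hypothetical periph disk driver ("foo") would hook into that path; it assumes only the interfaces shown in the diff.

```c
/*
 * Minimal sketch (not part of this merge) of a hypothetical CAM periph
 * disk driver forwarding BIO_GETATTR requests such as "GEOM::ident" and
 * "GEOM::physpath" to the transport via the new xpt_getattr() helper.
 * All "foo" names are illustrative.
 */
#include <sys/param.h>
#include <sys/bio.h>
#include <geom/geom_disk.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt.h>

static int
foogetattr(struct bio *bp)
{
	struct cam_periph *periph;
	int ret;

	/* The disk must still be wired to a periph instance. */
	if (bp->bio_disk == NULL || bp->bio_disk->d_drv1 == NULL)
		return (ENXIO);
	periph = (struct cam_periph *)bp->bio_disk->d_drv1;
	if (periph->path == NULL)
		return (ENXIO);

	/* xpt_getattr() issues a synchronous XPT_DEV_ADVINFO CCB. */
	ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
	    periph->path);
	if (ret == 0)
		bp->bio_completed = bp->bio_length;
	return (ret);
}

/* Wired up in the driver's register routine, e.g.:
 *	softc->disk->d_getattr = foogetattr;
 */
```

As an aside, the merged adagetattr() below guards with `bp->bio_disk->d_drv1` while dagetattr() uses `d_drv1 == NULL`; the da(4) form is the intended NULL check.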
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index c43cd33..4bfcfa3 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -295,6 +295,7 @@ options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) +device xhci # XHCI PCI->USB interface (USB 3.0) device usb # USB Bus (required) #device udbp # USB Double Bulk Pipe devices (needs netgraph) device uhid # "Human Interface Devices" @@ -337,3 +338,11 @@ device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver device dcons_crom # Configuration ROM for dcons + +# Sound support +device sound # Generic sound driver (required) +device snd_es137x # Ensoniq AudioPCI ES137x +device snd_hda # Intel High Definition Audio +device snd_ich # Intel, NVidia and other ICH AC'97 Audio +device snd_uaudio # USB Audio +device snd_via8233 # VIA VT8233x Audio diff --git a/sys/boot/forth/loader.rc b/sys/boot/forth/loader.rc index 6443f3f..0f9d37e 100644 --- a/sys/boot/forth/loader.rc +++ b/sys/boot/forth/loader.rc @@ -10,8 +10,5 @@ start \ Tests for password -- executes autoboot first if a password was defined check-password -\ Load in the boot menu -include /boot/beastie.4th +\ Unless set otherwise, autoboot is automatic at this point -\ Start the boot menu -beastie-start diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile index 06ff863..65df86f 100644 --- a/sys/boot/i386/zfsboot/Makefile +++ b/sys/boot/i386/zfsboot/Makefile @@ -15,7 +15,7 @@ ORG1= 0x7c00 ORG2= 0x2000 CFLAGS= -DBOOTPROG=\"zfsboot\" \ - -Os -g \ + -Os \ -fno-guess-branch-probability \ -fomit-frame-pointer \ -fno-unit-at-a-time \ diff --git a/sys/cam/ata/ata_all.c b/sys/cam/ata/ata_all.c index 3737e8f..560eef4 100644 --- a/sys/cam/ata/ata_all.c +++ b/sys/cam/ata/ata_all.c @@ -270,6 +270,7 @@ ata_print_ident(struct ata_params *ident_data) sizeof(revision)); printf("<%s %s> %s-%d", product, revision, + (ident_data->config == ATA_PROTO_CFA) ? "CFA" : (ident_data->config & ATA_PROTO_ATAPI) ? 
"ATAPI" : "ATA", ata_version(ident_data->version_major)); if (ident_data->satacapabilities && ident_data->satacapabilities != 0xffff) { diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index 8e93302..f1a9433 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -812,6 +812,25 @@ adasysctlinit(void *context, int pending) cam_periph_release(periph); } +static int +adagetattr(struct bio *bp) +{ + int ret = -1; + struct cam_periph *periph; + + if (bp->bio_disk == NULL || bp->bio_disk->d_drv1) + return ENXIO; + periph = (struct cam_periph *)bp->bio_disk->d_drv1; + if (periph->path == NULL) + return ENXIO; + + ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, + periph->path); + if (ret == 0) + bp->bio_completed = bp->bio_length; + return ret; +} + static cam_status adaregister(struct cam_periph *periph, void *arg) { @@ -917,6 +936,7 @@ adaregister(struct cam_periph *periph, void *arg) softc->disk->d_open = adaopen; softc->disk->d_close = adaclose; softc->disk->d_strategy = adastrategy; + softc->disk->d_getattr = adagetattr; softc->disk->d_dump = adadump; softc->disk->d_name = "ada"; softc->disk->d_drv1 = periph; @@ -938,8 +958,6 @@ adaregister(struct cam_periph *periph, void *arg) ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))) softc->disk->d_flags |= DISKFLAG_CANDELETE; - strlcpy(softc->disk->d_ident, cgd->serial_num, - MIN(sizeof(softc->disk->d_ident), cgd->serial_num_len + 1)); strlcpy(softc->disk->d_descr, cgd->ident_data.model, MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model))); softc->disk->d_hba_vendor = cpi.hba_vendor; diff --git a/sys/cam/ata/ata_xpt.c b/sys/cam/ata/ata_xpt.c index 9e236a4..d02b36f 100644 --- a/sys/cam/ata/ata_xpt.c +++ b/sys/cam/ata/ata_xpt.c @@ -1583,12 +1583,14 @@ ata_device_transport(struct cam_path *path) cts.proto_specific.valid = 0; if (ident_buf) { if (path->device->transport == XPORT_ATA) { - cts.xport_specific.ata.atapi = + cts.xport_specific.ata.atapi = + (ident_buf->config == ATA_PROTO_CFA) ? 0 : ((ident_buf->config & ATA_PROTO_MASK) == ATA_PROTO_ATAPI_16) ? 16 : ((ident_buf->config & ATA_PROTO_MASK) == ATA_PROTO_ATAPI_12) ? 12 : 0; cts.xport_specific.ata.valid = CTS_ATA_VALID_ATAPI; } else { - cts.xport_specific.sata.atapi = + cts.xport_specific.sata.atapi = + (ident_buf->config == ATA_PROTO_CFA) ? 0 : ((ident_buf->config & ATA_PROTO_MASK) == ATA_PROTO_ATAPI_16) ? 16 : ((ident_buf->config & ATA_PROTO_MASK) == ATA_PROTO_ATAPI_12) ? 12 : 0; cts.xport_specific.sata.valid = CTS_SATA_VALID_ATAPI; @@ -1638,7 +1640,9 @@ ata_action(union ccb *start_ccb) uint16_t p = device->ident_data.config & ATA_PROTO_MASK; - maxlen = (p == ATA_PROTO_ATAPI_16) ? 16 : + maxlen = + (device->ident_data.config == ATA_PROTO_CFA) ? 0 : + (p == ATA_PROTO_ATAPI_16) ? 16 : (p == ATA_PROTO_ATAPI_12) ? 12 : 0; } if (start_ccb->csio.cdb_len > maxlen) { diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h index 981a5ed..ed2a890 100644 --- a/sys/cam/cam_ccb.h +++ b/sys/cam/cam_ccb.h @@ -144,8 +144,8 @@ typedef enum { /* Device statistics (error counts, etc.) 
*/ XPT_FREEZE_QUEUE = 0x0d, /* Freeze device queue */ - XPT_GDEV_ADVINFO = 0x0e, - /* Advanced device information */ + XPT_DEV_ADVINFO = 0x0e, + /* Get/Set Device advanced information */ /* SCSI Control Functions: 0x10->0x1F */ XPT_ABORT = 0x10, /* Abort the specified CCB */ @@ -391,15 +391,24 @@ typedef enum { DEV_MATCH_TARGET = 0x002, DEV_MATCH_LUN = 0x004, DEV_MATCH_INQUIRY = 0x008, + DEV_MATCH_DEVID = 0x010, DEV_MATCH_ANY = 0x00f } dev_pattern_flags; +struct device_id_match_pattern { + uint8_t id_len; + uint8_t id[256]; +}; + struct device_match_pattern { - path_id_t path_id; - target_id_t target_id; - lun_id_t target_lun; - struct scsi_static_inquiry_pattern inq_pat; - dev_pattern_flags flags; + path_id_t path_id; + target_id_t target_id; + lun_id_t target_lun; + dev_pattern_flags flags; + union { + struct scsi_static_inquiry_pattern inq_pat; + struct device_id_match_pattern devid_pat; + } data; }; typedef enum { @@ -745,6 +754,7 @@ struct ccb_relsim { * Definitions for the asynchronous callback CCB fields. */ typedef enum { + AC_ADVINFO_CHANGED = 0x2000,/* Advance info might have changes */ AC_CONTRACT = 0x1000,/* A contractual callback */ AC_GETDEV_CHANGED = 0x800,/* Getdev info might have changed */ AC_INQ_CHANGED = 0x400,/* Inquiry info might have changed */ @@ -1094,19 +1104,20 @@ struct ccb_eng_exec { /* This structure must match SCSIIO size */ #define XPT_CCB_INVALID -1 /* for signaling a bad CCB to free */ /* - * CCB for getting advanced device information. This operates in a fashion + * CCB for working with advanced device information. This operates in a fashion * similar to XPT_GDEV_TYPE. Specify the target in ccb_h, the buffer * type requested, and provide a buffer size/buffer to write to. If the - * buffer is too small, the handler will set GDEVAI_FLAG_MORE. + * buffer is too small, provsiz will be larger than bufsiz. */ -struct ccb_getdev_advinfo { +struct ccb_dev_advinfo { struct ccb_hdr ccb_h; uint32_t flags; -#define CGDAI_FLAG_TRANSPORT 0x1 -#define CGDAI_FLAG_PROTO 0x2 +#define CDAI_FLAG_STORE 0x1 /* If set, action becomes store */ uint32_t buftype; /* IN: Type of data being requested */ /* NB: buftype is interpreted on a per-transport basis */ -#define CGDAI_TYPE_SCSI_DEVID 1 +#define CDAI_TYPE_SCSI_DEVID 1 +#define CDAI_TYPE_SERIAL_NUM 2 +#define CDAI_TYPE_PHYS_PATH 3 off_t bufsiz; /* IN: Size of external buffer */ #define CAM_SCSI_DEVID_MAXLEN 65536 /* length in buffer is an uint16_t */ off_t provsiz; /* OUT: Size required/used */ @@ -1151,7 +1162,7 @@ union ccb { struct ccb_rescan crcn; struct ccb_debug cdbg; struct ccb_ataio ataio; - struct ccb_getdev_advinfo cgdai; + struct ccb_dev_advinfo cdai; }; __BEGIN_DECLS diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index dd51bca..f630772 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <sys/devicestat.h> #include <sys/bus.h> +#include <sys/sbuf.h> #include <vm/vm.h> #include <vm/vm_extern.h> @@ -303,6 +304,38 @@ cam_periph_find(struct cam_path *path, char *name) return(NULL); } +/* + * Find a peripheral structure with the specified path, target, lun, + * and (optionally) type. If the name is NULL, this function will return + * the first peripheral driver that matches the specified path. 
+ */ +int +cam_periph_list(struct cam_path *path, struct sbuf *sb) +{ + struct periph_driver **p_drv; + struct cam_periph *periph; + int count; + + count = 0; + xpt_lock_buses(); + for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { + + TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) { + if (xpt_path_comp(periph->path, path) != 0) + continue; + + if (sbuf_len(sb) != 0) + sbuf_cat(sb, ","); + + sbuf_printf(sb, "%s%d", periph->periph_name, + periph->unit_number); + count++; + } + } + xpt_unlock_buses(); + return (count); +} + cam_status cam_periph_acquire(struct cam_periph *periph) { @@ -654,12 +687,12 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) dirs[1] = CAM_DIR_IN; numbufs = 2; break; - case XPT_GDEV_ADVINFO: - if (ccb->cgdai.bufsiz == 0) + case XPT_DEV_ADVINFO: + if (ccb->cdai.bufsiz == 0) return (0); - data_ptrs[0] = (uint8_t **)&ccb->cgdai.buf; - lengths[0] = ccb->cgdai.bufsiz; + data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; + lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; @@ -813,9 +846,9 @@ cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) data_ptrs[0] = &ccb->smpio.smp_request; data_ptrs[1] = &ccb->smpio.smp_response; break; - case XPT_GDEV_ADVINFO: + case XPT_DEV_ADVINFO: numbufs = min(mapinfo->num_bufs_used, 1); - data_ptrs[0] = (uint8_t **)&ccb->cgdai.buf; + data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; break; default: /* allow ourselves to be swapped once again */ diff --git a/sys/cam/cam_periph.h b/sys/cam/cam_periph.h index 33e9f75..58bfd7b 100644 --- a/sys/cam/cam_periph.h +++ b/sys/cam/cam_periph.h @@ -142,6 +142,7 @@ cam_status cam_periph_alloc(periph_ctor_t *periph_ctor, char *name, cam_periph_type type, struct cam_path *, ac_callback_t *, ac_code, void *arg); struct cam_periph *cam_periph_find(struct cam_path *path, char *name); +int cam_periph_list(struct cam_path *, struct sbuf *); cam_status cam_periph_acquire(struct cam_periph *periph); void cam_periph_release(struct cam_periph *periph); void cam_periph_release_locked(struct cam_periph *periph); @@ -200,5 +201,12 @@ cam_periph_owned(struct cam_periph *periph) return (mtx_owned(periph->sim->mtx)); } +static __inline int +cam_periph_sleep(struct cam_periph *periph, void *chan, int priority, + const char *wmesg, int timo) +{ + return (msleep(chan, periph->sim->mtx, priority, wmesg, timo)); +} + #endif /* _KERNEL */ #endif /* _CAM_CAM_PERIPH_H */ diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c index 1ce205d..f234076 100644 --- a/sys/cam/cam_xpt.c +++ b/sys/cam/cam_xpt.c @@ -287,9 +287,6 @@ static xpt_targetfunc_t xptdeftargetfunc; static xpt_devicefunc_t xptdefdevicefunc; static xpt_periphfunc_t xptdefperiphfunc; static void xpt_finishconfig_task(void *context, int pending); -static int xpt_for_all_busses(xpt_busfunc_t *tr_func, void *arg); -static int xpt_for_all_devices(xpt_devicefunc_t *tr_func, - void *arg); static void xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus, struct cam_et *target, @@ -1105,6 +1102,44 @@ xpt_announce_periph(struct cam_periph *periph, char *announce_string) periph->unit_number, announce_string); } +int +xpt_getattr(char *buf, size_t len, const char *attr, struct cam_path *path) +{ + int ret = -1; + struct ccb_dev_advinfo cdai; + + memset(&cdai, 0, sizeof(cdai)); + xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL); + cdai.ccb_h.func_code = XPT_DEV_ADVINFO; + cdai.bufsiz = len; + + if (!strcmp(attr, "GEOM::ident")) + cdai.buftype = CDAI_TYPE_SERIAL_NUM; + else if (!strcmp(attr, "GEOM::physpath")) + 
cdai.buftype = CDAI_TYPE_PHYS_PATH; + else + goto out; + + cdai.buf = malloc(cdai.bufsiz, M_CAMXPT, M_NOWAIT|M_ZERO); + if (cdai.buf == NULL) { + ret = ENOMEM; + goto out; + } + xpt_action((union ccb *)&cdai); /* can only be synchronous */ + if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) + cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); + if (cdai.provsiz == 0) + goto out; + ret = 0; + if (strlcpy(buf, cdai.buf, len) >= len) + ret = EFAULT; + +out: + if (cdai.buf != NULL) + free(cdai.buf, M_CAMXPT); + return ret; +} + static dev_match_ret xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_eb *bus) @@ -1241,6 +1276,7 @@ xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns, for (i = 0; i < num_patterns; i++) { struct device_match_pattern *cur_pattern; + struct scsi_vpd_device_id *device_id_page; /* * If the pattern in question isn't for a device node, we @@ -1255,22 +1291,17 @@ xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns, cur_pattern = &patterns[i].pattern.device_pattern; + /* Error out if mutually exclusive options are specified. */ + if ((cur_pattern->flags & (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID)) + == (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID)) + return(DM_RET_ERROR); + /* * If they want to match any device node, we give them any * device node. */ - if (cur_pattern->flags == DEV_MATCH_ANY) { - /* set the copy flag */ - retval |= DM_RET_COPY; - - - /* - * If we've already decided on an action, go ahead - * and return. - */ - if ((retval & DM_RET_ACTION_MASK) != DM_RET_NONE) - return(retval); - } + if (cur_pattern->flags == DEV_MATCH_ANY) + goto copy_dev_node; /* * Not sure why someone would do this... @@ -1292,11 +1323,22 @@ xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns, if (((cur_pattern->flags & DEV_MATCH_INQUIRY) != 0) && (cam_quirkmatch((caddr_t)&device->inq_data, - (caddr_t)&cur_pattern->inq_pat, - 1, sizeof(cur_pattern->inq_pat), + (caddr_t)&cur_pattern->data.inq_pat, + 1, sizeof(cur_pattern->data.inq_pat), scsi_static_inquiry_match) == NULL)) continue; + device_id_page = (struct scsi_vpd_device_id *)device->device_id; + if (((cur_pattern->flags & DEV_MATCH_DEVID) != 0) + && (device->device_id_len < SVPD_DEVICE_ID_HDR_LEN + || scsi_devid_match((uint8_t *)device_id_page->desc_list, + device->device_id_len + - SVPD_DEVICE_ID_HDR_LEN, + cur_pattern->data.devid_pat.id, + cur_pattern->data.devid_pat.id_len) != 0)) + continue; + +copy_dev_node: /* * If we get to this point, the user definitely wants * information on this device. 
So tell the caller to copy @@ -2889,6 +2931,8 @@ xpt_action_default(union ccb *start_ccb) case XPT_TERM_IO: case XPT_ENG_INQ: /* XXX Implement */ + printf("%s: CCB type %#x not supported\n", __func__, + start_ccb->ccb_h.func_code); start_ccb->ccb_h.status = CAM_PROVIDE_FAIL; if (start_ccb->ccb_h.func_code & XPT_FC_DEV_QUEUED) { xpt_done(start_ccb); @@ -3528,16 +3572,12 @@ xpt_path_string(struct cam_path *path, char *str, size_t str_len) path_id_t xpt_path_path_id(struct cam_path *path) { - mtx_assert(path->bus->sim->mtx, MA_OWNED); - return(path->bus->path_id); } target_id_t xpt_path_target_id(struct cam_path *path) { - mtx_assert(path->bus->sim->mtx, MA_OWNED); - if (path->target != NULL) return (path->target->target_id); else @@ -3547,8 +3587,6 @@ xpt_path_target_id(struct cam_path *path) lun_id_t xpt_path_lun_id(struct cam_path *path) { - mtx_assert(path->bus->sim->mtx, MA_OWNED); - if (path->device != NULL) return (path->device->lun_id); else @@ -4242,7 +4280,8 @@ xpt_alloc_target(struct cam_eb *bus, target_id_t target_id) { struct cam_et *target; - target = (struct cam_et *)malloc(sizeof(*target), M_CAMXPT, M_NOWAIT); + target = (struct cam_et *)malloc(sizeof(*target), M_CAMXPT, + M_NOWAIT|M_ZERO); if (target != NULL) { struct cam_et *cur_target; @@ -4330,7 +4369,7 @@ xpt_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id) device = NULL; } else { device = (struct cam_ed *)malloc(sizeof(*device), - M_CAMXPT, M_NOWAIT); + M_CAMXPT, M_NOWAIT|M_ZERO); } if (device != NULL) { @@ -4676,27 +4715,29 @@ xpt_register_async(int event, ac_callback_t *cbfunc, void *cbarg, csa.callback_arg = cbarg; xpt_action((union ccb *)&csa); status = csa.ccb_h.status; + if (xptpath) { xpt_free_path(path); mtx_unlock(&xsoftc.xpt_lock); + } - if ((status == CAM_REQ_CMP) && - (csa.event_enable & AC_FOUND_DEVICE)) { - /* - * Get this peripheral up to date with all - * the currently existing devices. - */ - xpt_for_all_devices(xptsetasyncfunc, &csa); - } - if ((status == CAM_REQ_CMP) && - (csa.event_enable & AC_PATH_REGISTERED)) { - /* - * Get this peripheral up to date with all - * the currently existing busses. - */ - xpt_for_all_busses(xptsetasyncbusfunc, &csa); - } + if ((status == CAM_REQ_CMP) && + (csa.event_enable & AC_FOUND_DEVICE)) { + /* + * Get this peripheral up to date with all + * the currently existing devices. + */ + xpt_for_all_devices(xptsetasyncfunc, &csa); } + if ((status == CAM_REQ_CMP) && + (csa.event_enable & AC_PATH_REGISTERED)) { + /* + * Get this peripheral up to date with all + * the currently existing busses. 
+ */ + xpt_for_all_busses(xptsetasyncbusfunc, &csa); + } + return (status); } @@ -4852,8 +4893,10 @@ camisr_runqueue(void *V_queue) if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0 && (--dev->tag_delay_count == 0)) xpt_start_tags(ccb_h->path); - if (!device_is_send_queued(dev)) - xpt_schedule_dev_sendq(ccb_h->path->bus, dev); + if (!device_is_send_queued(dev)) { + runq = xpt_schedule_dev_sendq(ccb_h->path->bus, + dev); + } } if (ccb_h->status & CAM_RELEASE_SIMQ) { diff --git a/sys/cam/cam_xpt.h b/sys/cam/cam_xpt.h index 9355be4..f7d9b42 100644 --- a/sys/cam/cam_xpt.h +++ b/sys/cam/cam_xpt.h @@ -103,6 +103,8 @@ cam_status xpt_create_path_unlocked(struct cam_path **new_path_ptr, struct cam_periph *perph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id); +int xpt_getattr(char *buf, size_t len, const char *attr, + struct cam_path *path); void xpt_free_path(struct cam_path *path); int xpt_path_comp(struct cam_path *path1, struct cam_path *path2); diff --git a/sys/cam/cam_xpt_internal.h b/sys/cam/cam_xpt_internal.h index f485e37..b6e8f66 100644 --- a/sys/cam/cam_xpt_internal.h +++ b/sys/cam/cam_xpt_internal.h @@ -97,6 +97,8 @@ struct cam_ed { uint8_t supported_vpds_len; uint32_t device_id_len; uint8_t *device_id; + uint8_t physpath_len; + uint8_t *physpath; /* physical path string form */ struct ata_params ident_data; u_int8_t inq_flags; /* * Current settings for inquiry flags. diff --git a/sys/cam/scsi/scsi_all.c b/sys/cam/scsi/scsi_all.c index 7ededa1..7361c42 100644 --- a/sys/cam/scsi/scsi_all.c +++ b/sys/cam/scsi/scsi_all.c @@ -3552,32 +3552,63 @@ scsi_calc_syncparam(u_int period) return (period/400); } -uint8_t * -scsi_get_sas_addr(struct scsi_vpd_device_id *id, uint32_t len) +int +scsi_devid_is_naa_ieee_reg(uint8_t *bufp) { - uint8_t *bufp, *buf_end; struct scsi_vpd_id_descriptor *descr; struct scsi_vpd_id_naa_basic *naa; - bufp = buf_end = (uint8_t *)id; - bufp += SVPD_DEVICE_ID_HDR_LEN; - buf_end += len; - while (bufp < buf_end) { - descr = (struct scsi_vpd_id_descriptor *)bufp; - bufp += SVPD_DEVICE_ID_DESC_HDR_LEN; - /* Right now, we only care about SAS NAA IEEE Reg addrs */ - if (((descr->id_type & SVPD_ID_PIV) != 0) - && (descr->proto_codeset >> SVPD_ID_PROTO_SHIFT) == - SCSI_PROTO_SAS - && (descr->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_NAA){ - naa = (struct scsi_vpd_id_naa_basic *)bufp; - if ((naa->naa >> 4) == SVPD_ID_NAA_IEEE_REG) - return bufp; - } - bufp += descr->length; + descr = (struct scsi_vpd_id_descriptor *)bufp; + naa = (struct scsi_vpd_id_naa_basic *)descr->identifier; + if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA) + return 0; + if (descr->length < sizeof(struct scsi_vpd_id_naa_ieee_reg)) + return 0; + if ((naa->naa >> SVPD_ID_NAA_NAA_SHIFT) != SVPD_ID_NAA_IEEE_REG) + return 0; + return 1; +} + +int +scsi_devid_is_sas_target(uint8_t *bufp) +{ + struct scsi_vpd_id_descriptor *descr; + + descr = (struct scsi_vpd_id_descriptor *)bufp; + if (!scsi_devid_is_naa_ieee_reg(bufp)) + return 0; + if ((descr->id_type & SVPD_ID_PIV) == 0) /* proto field reserved */ + return 0; + if ((descr->proto_codeset >> SVPD_ID_PROTO_SHIFT) != SCSI_PROTO_SAS) + return 0; + return 1; +} + +uint8_t * +scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t page_len, + scsi_devid_checkfn_t ck_fn) +{ + struct scsi_vpd_id_descriptor *desc; + uint8_t *page_end; + uint8_t *desc_buf_end; + + page_end = (uint8_t *)id + page_len; + if (page_end < id->desc_list) + return (NULL); + + desc_buf_end = MIN(id->desc_list + scsi_2btoul(id->length), page_end); + + for (desc = (struct 
scsi_vpd_id_descriptor *)id->desc_list; + desc->identifier <= desc_buf_end + && desc->identifier + desc->length <= desc_buf_end; + desc = (struct scsi_vpd_id_descriptor *)(desc->identifier + + desc->length)) { + + if (ck_fn == NULL || ck_fn((uint8_t *)desc) != 0) + return (desc->identifier); } - return NULL; + return (NULL); } void @@ -4174,6 +4205,77 @@ scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, timeout); } +void +scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries, + void (*cbfcnp)(struct cam_periph *, union ccb*), + uint8_t tag_action, int pcv, uint8_t page_code, + uint8_t *data_ptr, uint16_t allocation_length, + uint8_t sense_len, uint32_t timeout) +{ + struct scsi_receive_diag *scsi_cmd; + + scsi_cmd = (struct scsi_receive_diag *)&csio->cdb_io.cdb_bytes; + memset(scsi_cmd, 0, sizeof(*scsi_cmd)); + scsi_cmd->opcode = RECEIVE_DIAGNOSTIC; + if (pcv) { + scsi_cmd->byte2 |= SRD_PCV; + scsi_cmd->page_code = page_code; + } + scsi_ulto2b(allocation_length, scsi_cmd->length); + + cam_fill_csio(csio, + retries, + cbfcnp, + /*flags*/CAM_DIR_IN, + tag_action, + data_ptr, + allocation_length, + sense_len, + sizeof(*scsi_cmd), + timeout); +} + +void +scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries, + void (*cbfcnp)(struct cam_periph *, union ccb *), + uint8_t tag_action, int unit_offline, int device_offline, + int self_test, int page_format, int self_test_code, + uint8_t *data_ptr, uint16_t param_list_length, + uint8_t sense_len, uint32_t timeout) +{ + struct scsi_send_diag *scsi_cmd; + + scsi_cmd = (struct scsi_send_diag *)&csio->cdb_io.cdb_bytes; + memset(scsi_cmd, 0, sizeof(*scsi_cmd)); + scsi_cmd->opcode = SEND_DIAGNOSTIC; + + /* + * The default self-test mode control and specific test + * control are mutually exclusive. + */ + if (self_test) + self_test_code = SSD_SELF_TEST_CODE_NONE; + + scsi_cmd->byte2 = ((self_test_code << SSD_SELF_TEST_CODE_SHIFT) + & SSD_SELF_TEST_CODE_MASK) + | (unit_offline ? SSD_UNITOFFL : 0) + | (device_offline ? SSD_DEVOFFL : 0) + | (self_test ? SSD_SELFTEST : 0) + | (page_format ? SSD_PF : 0); + scsi_ulto2b(param_list_length, scsi_cmd->length); + + cam_fill_csio(csio, + retries, + cbfcnp, + /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE, + tag_action, + data_ptr, + param_list_length, + sense_len, + sizeof(*scsi_cmd), + timeout); +} + void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), @@ -4206,7 +4308,6 @@ scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, sense_len, sizeof(*scsi_cmd), timeout); - } @@ -4264,6 +4365,66 @@ scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry) return (-1); } +/** + * Compare two buffers of vpd device descriptors for a match. + * + * \param lhs Pointer to first buffer of descriptors to compare. + * \param lhs_len The length of the first buffer. + * \param rhs Pointer to second buffer of descriptors to compare. + * \param rhs_len The length of the second buffer. + * + * \return 0 on a match, -1 otherwise. + * + * Treat rhs and lhs as arrays of vpd device id descriptors. Walk lhs matching + * agains each element in rhs until all data are exhausted or we have found + * a match. 
+ */ +int +scsi_devid_match(uint8_t *lhs, size_t lhs_len, uint8_t *rhs, size_t rhs_len) +{ + struct scsi_vpd_id_descriptor *lhs_id; + struct scsi_vpd_id_descriptor *lhs_last; + struct scsi_vpd_id_descriptor *rhs_last; + uint8_t *lhs_end; + uint8_t *rhs_end; + + lhs_end = lhs + lhs_len; + rhs_end = rhs + rhs_len; + + /* + * rhs_last and lhs_last are the last posible position of a valid + * descriptor assuming it had a zero length identifier. We use + * these variables to insure we can safely dereference the length + * field in our loop termination tests. + */ + lhs_last = (struct scsi_vpd_id_descriptor *) + (lhs_end - __offsetof(struct scsi_vpd_id_descriptor, identifier)); + rhs_last = (struct scsi_vpd_id_descriptor *) + (rhs_end - __offsetof(struct scsi_vpd_id_descriptor, identifier)); + + lhs_id = (struct scsi_vpd_id_descriptor *)lhs; + while (lhs_id <= lhs_last + && (lhs_id->identifier + lhs_id->length) <= lhs_end) { + struct scsi_vpd_id_descriptor *rhs_id; + + rhs_id = (struct scsi_vpd_id_descriptor *)rhs; + while (rhs_id <= rhs_last + && (rhs_id->identifier + rhs_id->length) <= rhs_end) { + + if (rhs_id->length == lhs_id->length + && memcmp(rhs_id->identifier, lhs_id->identifier, + rhs_id->length) == 0) + return (0); + + rhs_id = (struct scsi_vpd_id_descriptor *) + (rhs_id->identifier + rhs_id->length); + } + lhs_id = (struct scsi_vpd_id_descriptor *) + (lhs_id->identifier + lhs_id->length); + } + return (-1); +} + #ifdef _KERNEL static void init_scsi_delay(void) diff --git a/sys/cam/scsi/scsi_all.h b/sys/cam/scsi/scsi_all.h index 0a7a58f..93b11d5 100644 --- a/sys/cam/scsi/scsi_all.h +++ b/sys/cam/scsi/scsi_all.h @@ -115,6 +115,7 @@ struct scsi_request_sense { u_int8_t opcode; u_int8_t byte2; +#define SRS_DESC 0x01 u_int8_t unused[2]; u_int8_t length; u_int8_t control; @@ -128,17 +129,33 @@ struct scsi_test_unit_ready u_int8_t control; }; -struct scsi_send_diag -{ - u_int8_t opcode; - u_int8_t byte2; -#define SSD_UOL 0x01 -#define SSD_DOL 0x02 -#define SSD_SELFTEST 0x04 -#define SSD_PF 0x10 - u_int8_t unused[1]; - u_int8_t paramlen[2]; - u_int8_t control; +struct scsi_receive_diag { + uint8_t opcode; + uint8_t byte2; +#define SRD_PCV 0x01 + uint8_t page_code; + uint8_t length[2]; + uint8_t control; +}; + +struct scsi_send_diag { + uint8_t opcode; + uint8_t byte2; +#define SSD_UNITOFFL 0x01 +#define SSD_DEVOFFL 0x02 +#define SSD_SELFTEST 0x04 +#define SSD_PF 0x10 +#define SSD_SELF_TEST_CODE_MASK 0xE0 +#define SSD_SELF_TEST_CODE_SHIFT 5 +#define SSD_SELF_TEST_CODE_NONE 0x00 +#define SSD_SELF_TEST_CODE_BG_SHORT 0x01 +#define SSD_SELF_TEST_CODE_BG_EXTENDED 0x02 +#define SSD_SELF_TEST_CODE_BG_ABORT 0x04 +#define SSD_SELF_TEST_CODE_FG_SHORT 0x05 +#define SSD_SELF_TEST_CODE_FG_EXTENDED 0x06 + uint8_t reserved; + uint8_t length[2]; + uint8_t control; }; struct scsi_sense @@ -894,11 +911,12 @@ struct scsi_vpd_id_naa_basic uint8_t naa : 4; uint8_t naa_desig : 4; */ +#define SVPD_ID_NAA_NAA_SHIFT 4 #define SVPD_ID_NAA_IEEE_EXT 0x02 #define SVPD_ID_NAA_LOCAL_REG 0x03 #define SVPD_ID_NAA_IEEE_REG 0x05 #define SVPD_ID_NAA_IEEE_REG_EXT 0x06 - uint8_t naa_data[0]; + uint8_t naa_data[]; }; struct scsi_vpd_id_naa_ieee_extended_id @@ -1322,7 +1340,12 @@ void scsi_print_inquiry(struct scsi_inquiry_data *inq_data); u_int scsi_calc_syncsrate(u_int period_factor); u_int scsi_calc_syncparam(u_int period); -uint8_t * scsi_get_sas_addr(struct scsi_vpd_device_id *id, uint32_t len); + +typedef int (*scsi_devid_checkfn_t)(uint8_t *); +int scsi_devid_is_naa_ieee_reg(uint8_t *bufp); +int 
scsi_devid_is_sas_target(uint8_t *bufp); +uint8_t * scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t len, + scsi_devid_checkfn_t ck_fn); void scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, @@ -1439,6 +1462,22 @@ void scsi_synchronize_cache(struct ccb_scsiio *csio, u_int32_t begin_lba, u_int16_t lb_count, u_int8_t sense_len, u_int32_t timeout); +void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries, + void (*cbfcnp)(struct cam_periph *, + union ccb*), + uint8_t tag_action, int pcv, + uint8_t page_code, uint8_t *data_ptr, + uint16_t allocation_length, + uint8_t sense_len, uint32_t timeout); + +void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries, + void (*cbfcnp)(struct cam_periph *, union ccb *), + uint8_t tag_action, int unit_offline, + int device_offline, int self_test, int page_format, + int self_test_code, uint8_t *data_ptr, + uint16_t param_list_length, uint8_t sense_len, + uint32_t timeout); + void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int readop, u_int8_t byte2, @@ -1455,6 +1494,8 @@ void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, int scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry); int scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry); +int scsi_devid_match(uint8_t *rhs, size_t rhs_len, + uint8_t *lhs, size_t lhs_len); static __inline void scsi_extract_sense(struct scsi_sense_data *sense, int *error_code, int *sense_key, diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 9729878..a436318 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/cons.h> +#include <geom/geom.h> #include <geom/geom_disk.h> #endif /* _KERNEL */ @@ -727,7 +728,8 @@ daclose(struct disk *dp) softc = (struct da_softc *)periph->softc; - if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) { + if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 + && (softc->flags & DA_FLAG_PACK_INVALID) == 0) { union ccb *ccb; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); @@ -932,6 +934,25 @@ dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t leng return (0); } +static int +dagetattr(struct bio *bp) +{ + int ret = -1; + struct cam_periph *periph; + + if (bp->bio_disk == NULL || bp->bio_disk->d_drv1 == NULL) + return ENXIO; + periph = (struct cam_periph *)bp->bio_disk->d_drv1; + if (periph->path == NULL) + return ENXIO; + + ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, + periph->path); + if (ret == 0) + bp->bio_completed = bp->bio_length; + return ret; +} + static void dainit(void) { @@ -977,7 +998,8 @@ daoninvalidate(struct cam_periph *periph) bioq_flush(&softc->bio_queue, NULL, ENXIO); disk_gone(softc->disk); - xpt_print(periph->path, "lost device\n"); + xpt_print(periph->path, "lost device - %d outstanding\n", + softc->outstanding_cmds); } static void @@ -1044,6 +1066,20 @@ daasync(void *callback_arg, u_int32_t code, && status != CAM_REQ_INPROG) printf("daasync: Unable to attach to new device " "due to status 0x%x\n", status); + return; + } + case AC_ADVINFO_CHANGED: + { + uintptr_t buftype; + + buftype = (uintptr_t)arg; + if (buftype == CDAI_TYPE_PHYS_PATH) { + struct da_softc *softc; + + softc = periph->softc; + disk_attr_changed(softc->disk, "GEOM::physpath", + M_NOWAIT); + } break; } case AC_SENT_BDR: @@ -1060,12 
+1096,12 @@ daasync(void *callback_arg, u_int32_t code, softc->flags |= DA_FLAG_RETRY_UA; LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le) ccbh->ccb_state |= DA_CCB_RETRY_UA; - /* FALLTHROUGH*/ + break; } default: - cam_periph_async(periph, code, path, arg); break; } + cam_periph_async(periph, code, path, arg); } static void @@ -1231,17 +1267,6 @@ daregister(struct cam_periph *periph, void *arg) TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph); /* - * Add async callbacks for bus reset and - * bus device reset calls. I don't bother - * checking if this fails as, in most cases, - * the system will function just fine without - * them and the only alternative would be to - * not attach the device on failure. - */ - xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE, - daasync, periph, periph->path); - - /* * Take an exclusive refcount on the periph while dastart is called * to finish the probe. The reference will be dropped in dadone at * the end of probe. @@ -1301,6 +1326,7 @@ daregister(struct cam_periph *periph, void *arg) softc->disk->d_close = daclose; softc->disk->d_strategy = dastrategy; softc->disk->d_dump = dadump; + softc->disk->d_getattr = dagetattr; softc->disk->d_name = "da"; softc->disk->d_drv1 = periph; if (cpi.maxio == 0) @@ -1313,8 +1339,6 @@ daregister(struct cam_periph *periph, void *arg) softc->disk->d_flags = 0; if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE; - strlcpy(softc->disk->d_ident, cgd->serial_num, - MIN(sizeof(softc->disk->d_ident), cgd->serial_num_len + 1)); cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor, sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr)); strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr)); @@ -1328,6 +1352,25 @@ daregister(struct cam_periph *periph, void *arg) disk_create(softc->disk, DISK_VERSION); mtx_lock(periph->sim->mtx); + /* + * Add async callbacks for events of interest. + * I don't bother checking if this fails as, + * in most cases, the system will function just + * fine without them and the only alternative + * would be to not attach the device on failure. + */ + xpt_register_async(AC_SENT_BDR | AC_BUS_RESET + | AC_LOST_DEVICE | AC_ADVINFO_CHANGED, + daasync, periph, periph->path); + + /* + * Emit an attribute changed notification just in case + * physical path information arrived before our async + * event handler was registered, but after anyone attaching + * to our disk device polled it. + */ + disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); + xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); @@ -1558,7 +1601,7 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { int error; int sf; - + if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0) sf = SF_RETRY_UA; else @@ -1573,8 +1616,17 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) return; } if (error != 0) { + int queued_error; + + /* + * return all queued I/O with EIO, so that + * the client can retry these I/Os in the + * proper order should it attempt to recover. + */ + queued_error = EIO; - if (error == ENXIO) { + if (error == ENXIO + && (softc->flags & DA_FLAG_PACK_INVALID)== 0) { /* * Catastrophic error. Mark our pack as * invalid. 
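The da(4) hunks above publish the controller-reported physical path as a GEOM::physpath disk attribute and announce updates with disk_attr_changed() when an AC_ADVINFO_CHANGED event arrives. As a hedged illustration of the consumer side, not taken from this merge, a GEOM class could poll the attribute as sketched here; g_io_getattr() is the long-standing GEOM attribute request helper, and the consumer is assumed to be attached and opened for reading.

```c
#include <sys/param.h>
#include <sys/systm.h>
#include <geom/geom.h>

/*
 * Hedged sketch: fetch the "GEOM::physpath" attribute of an attached
 * provider from a GEOM consumer.  An empty string in buf means the
 * provider knows no physical path.
 */
static int
example_get_physpath(struct g_consumer *cp, char *buf, int buflen)
{
	int error, len;

	len = buflen;
	/* g_io_getattr() issues a synchronous BIO_GETATTR request. */
	error = g_io_getattr("GEOM::physpath", cp, &len, buf);
	return (error);
}
```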
@@ -1586,14 +1638,10 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) xpt_print(periph->path, "Invalidating pack\n"); softc->flags |= DA_FLAG_PACK_INVALID; + queued_error = ENXIO; } - - /* - * return all queued I/O with EIO, so that - * the client can retry these I/Os in the - * proper order should it attempt to recover. - */ - bioq_flush(&softc->bio_queue, NULL, EIO); + bioq_flush(&softc->bio_queue, NULL, + queued_error); bp->bio_error = error; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; @@ -1626,6 +1674,11 @@ dadone(struct cam_periph *periph, union ccb *done_ccb) if (softc->outstanding_cmds == 0) softc->flags |= DA_FLAG_WENT_IDLE; + if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) { + xpt_print(periph->path, "oustanding %d\n", + softc->outstanding_cmds); + } + biodone(bp); break; } diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c index e7ecb35..a124468 100644 --- a/sys/cam/scsi/scsi_pass.c +++ b/sys/cam/scsi/scsi_pass.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/errno.h> #include <sys/devicestat.h> #include <sys/proc.h> +#include <sys/taskqueue.h> #include <cam/cam.h> #include <cam/cam_ccb.h> @@ -70,12 +71,14 @@ typedef enum { #define ccb_bp ppriv_ptr1 struct pass_softc { - pass_state state; - pass_flags flags; - u_int8_t pd_type; - union ccb saved_ccb; - struct devstat *device_stats; - struct cdev *dev; + pass_state state; + pass_flags flags; + u_int8_t pd_type; + union ccb saved_ccb; + struct devstat *device_stats; + struct cdev *dev; + struct cdev *alias_dev; + struct task add_physpath_task; }; @@ -88,6 +91,7 @@ static periph_ctor_t passregister; static periph_oninv_t passoninvalidate; static periph_dtor_t passcleanup; static periph_start_t passstart; +static void pass_add_physpath(void *context, int pending); static void passasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void passdone(struct cam_periph *periph, @@ -168,17 +172,45 @@ passcleanup(struct cam_periph *periph) if (bootverbose) xpt_print(periph->path, "removing device entry\n"); devstat_remove_entry(softc->device_stats); + cam_periph_unlock(periph); + taskqueue_drain(taskqueue_thread, &softc->add_physpath_task); + /* * passcleanup() is indirectly a d_close method via passclose, * so using destroy_dev(9) directly can result in deadlock. */ destroy_dev_sched(softc->dev); cam_periph_lock(periph); + free(softc, M_DEVBUF); } static void +pass_add_physpath(void *context, int pending) +{ + struct cam_periph *periph; + struct pass_softc *softc; + char *physpath; + + /* + * If we have one, create a devfs alias for our + * physical path. 
+ */ + periph = context; + softc = periph->softc; + physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK); + if (xpt_getattr(physpath, MAXPATHLEN, + "GEOM::physpath", periph->path) == 0 + && strlen(physpath) != 0) { + + make_dev_physpath_alias(MAKEDEV_WAITOK, &softc->alias_dev, + softc->dev, softc->alias_dev, physpath); + } + free(physpath, M_DEVBUF); +} + +static void passasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { @@ -219,6 +251,20 @@ passasync(void *callback_arg, u_int32_t code, break; } + case AC_ADVINFO_CHANGED: + { + uintptr_t buftype; + + buftype = (uintptr_t)arg; + if (buftype == CDAI_TYPE_PHYS_PATH) { + struct pass_softc *softc; + + softc = (struct pass_softc *)periph->softc; + taskqueue_enqueue(taskqueue_thread, + &softc->add_physpath_task); + } + break; + } default: cam_periph_async(periph, code, path, arg); break; @@ -292,11 +338,22 @@ passregister(struct cam_periph *periph, void *arg) mtx_lock(periph->sim->mtx); softc->dev->si_drv1 = periph; + TASK_INIT(&softc->add_physpath_task, /*priority*/0, + pass_add_physpath, periph); + + /* + * See if physical path information is already available. + */ + taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task); + /* - * Add an async callback so that we get - * notified if this device goes away. + * Add an async callback so that we get notified if + * this device goes away or its physical path + * (stored in the advanced info data of the EDT) has + * changed. */ - xpt_register_async(AC_LOST_DEVICE, passasync, periph, periph->path); + xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED, + passasync, periph, periph->path); if (bootverbose) xpt_announce_periph(periph, NULL); @@ -548,8 +605,8 @@ passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb) && ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE)) || (ccb->ccb_h.func_code == XPT_DEV_MATCH) || (ccb->ccb_h.func_code == XPT_SMP_IO) - || ((ccb->ccb_h.func_code == XPT_GDEV_ADVINFO) - && (ccb->cgdai.bufsiz > 0)))) { + || ((ccb->ccb_h.func_code == XPT_DEV_ADVINFO) + && (ccb->cdai.bufsiz > 0)))) { bzero(&mapinfo, sizeof(mapinfo)); diff --git a/sys/cam/scsi/scsi_xpt.c b/sys/cam/scsi/scsi_xpt.c index 2a38128..1b507ca 100644 --- a/sys/cam/scsi/scsi_xpt.c +++ b/sys/cam/scsi/scsi_xpt.c @@ -542,6 +542,7 @@ static const int scsi_quirk_table_size = static cam_status proberegister(struct cam_periph *periph, void *arg); static void probeschedule(struct cam_periph *probe_periph); +static int device_has_vpd(struct cam_ed *device, uint8_t page_id); static void probestart(struct cam_periph *periph, union ccb *start_ccb); static void proberequestdefaultnegotiation(struct cam_periph *periph); static int proberequestbackoff(struct cam_periph *periph, @@ -1460,7 +1461,7 @@ probedone(struct cam_periph *periph, union ccb *done_ccb) path->device->device_id = (uint8_t *)devid; } } else if (cam_periph_error(done_ccb, 0, - SF_RETRY_UA|SF_NO_PRINT, + SF_RETRY_UA, &softc->saved_ccb) == ERESTART) { return; } else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { @@ -1506,9 +1507,9 @@ probe_device_check: (u_int8_t *)malloc((serial_buf->length + 1), M_CAMXPT, M_NOWAIT); if (path->device->serial_num != NULL) { - bcopy(serial_buf->serial_num, - path->device->serial_num, - serial_buf->length); + memcpy(path->device->serial_num, + serial_buf->serial_num, + serial_buf->length); path->device->serial_num_len = serial_buf->length; path->device->serial_num[serial_buf->length] @@ -2433,28 +2434,77 @@ scsi_devise_transport(struct cam_path *path) } static void 
-scsi_getdev_advinfo(union ccb *start_ccb) +scsi_dev_advinfo(union ccb *start_ccb) { struct cam_ed *device; - struct ccb_getdev_advinfo *cgdai; + struct ccb_dev_advinfo *cdai; off_t amt; device = start_ccb->ccb_h.path->device; - cgdai = &start_ccb->cgdai; - switch(cgdai->buftype) { - case CGDAI_TYPE_SCSI_DEVID: - cgdai->provsiz = device->device_id_len; + cdai = &start_ccb->cdai; + switch(cdai->buftype) { + case CDAI_TYPE_SCSI_DEVID: + if (cdai->flags & CDAI_FLAG_STORE) + break; + cdai->provsiz = device->device_id_len; if (device->device_id_len == 0) break; amt = device->device_id_len; - if (cgdai->provsiz > cgdai->bufsiz) - amt = cgdai->bufsiz; - bcopy(device->device_id, cgdai->buf, amt); + if (cdai->provsiz > cdai->bufsiz) + amt = cdai->bufsiz; + memcpy(cdai->buf, device->device_id, amt); + break; + case CDAI_TYPE_SERIAL_NUM: + if (cdai->flags & CDAI_FLAG_STORE) + break; + cdai->provsiz = device->serial_num_len; + if (device->serial_num_len == 0) + break; + amt = device->serial_num_len; + if (cdai->provsiz > cdai->bufsiz) + amt = cdai->bufsiz; + memcpy(cdai->buf, device->serial_num, amt); + break; + case CDAI_TYPE_PHYS_PATH: + if (cdai->flags & CDAI_FLAG_STORE) { + if (device->physpath != NULL) + free(device->physpath, M_CAMXPT); + device->physpath_len = cdai->bufsiz; + /* Clear existing buffer if zero length */ + if (cdai->bufsiz == 0) + break; + device->physpath = malloc(cdai->bufsiz, M_CAMXPT, M_NOWAIT); + if (device->physpath == NULL) { + start_ccb->ccb_h.status = CAM_REQ_ABORTED; + return; + } + memcpy(device->physpath, cdai->buf, cdai->bufsiz); + } else { + cdai->provsiz = device->physpath_len; + if (device->physpath_len == 0) + break; + amt = device->physpath_len; + if (cdai->provsiz > cdai->bufsiz) + amt = cdai->bufsiz; + memcpy(cdai->buf, device->physpath, amt); + } break; default: break; } start_ccb->ccb_h.status = CAM_REQ_CMP; + + if (cdai->flags & CDAI_FLAG_STORE) { + int owned; + + owned = mtx_owned(start_ccb->ccb_h.path->bus->sim->mtx); + if (owned == 0) + mtx_lock(start_ccb->ccb_h.path->bus->sim->mtx); + xpt_async(AC_ADVINFO_CHANGED, start_ccb->ccb_h.path, + (void *)(uintptr_t)cdai->buftype); + if (owned == 0) + mtx_unlock(start_ccb->ccb_h.path->bus->sim->mtx); + } } static void @@ -2486,9 +2536,9 @@ scsi_action(union ccb *start_ccb) (*(sim->sim_action))(sim, start_ccb); break; } - case XPT_GDEV_ADVINFO: + case XPT_DEV_ADVINFO: { - scsi_getdev_advinfo(start_ccb); + scsi_dev_advinfo(start_ccb); break; } default: diff --git a/sys/conf/Makefile.arm b/sys/conf/Makefile.arm index 756945d..d099256 100644 --- a/sys/conf/Makefile.arm +++ b/sys/conf/Makefile.arm @@ -108,7 +108,8 @@ ${KERNEL_KO}.tramp: ${KERNEL_KO} $S/$M/$M/inckern.S $S/$M/$M/elf_trampoline.c gzip -9 ${KERNEL_KO}.tmp eval $$(stat -s ${KERNEL_KO}.tmp.gz) && \ echo "#define KERNCOMPSIZE $$st_size" >>opt_kernname.h - ${CC} -O2 -DKZIP -I. -I$S -c $S/kern/inflate.c -o inflate-tramp.o + ${CC} -O2 -ffreestanding -DKZIP -I. -I$S -c $S/kern/inflate.c -o \ + inflate-tramp.o ${CC} -O -nostdlib -I. 
-I$S -Xlinker -T -Xlinker ldscript.$M.tramp \ -DKZIP tmphack.S $S/$M/$M/elf_trampoline.c inflate-tramp.o \ $S/$M/$M/inckern.S ${FILES_CPU_FUNC} -o ${KERNEL_KO}.gz.tramp diff --git a/sys/dev/ata/chipsets/ata-intel.c b/sys/dev/ata/chipsets/ata-intel.c index 3b514db..e128051 100644 --- a/sys/dev/ata/chipsets/ata-intel.c +++ b/sys/dev/ata/chipsets/ata-intel.c @@ -288,7 +288,9 @@ ata_intel_chipinit(device_t dev) ATA_OUTL(ctlr->r_res2, 0x0C, ATA_INL(ctlr->r_res2, 0x0C) | 0xf); } - } else { + /* Skip BAR(5) on ICH8M Apples, system locks up on access. */ + } else if (ctlr->chip->chipid != ATA_I82801HBM_S1 || + pci_get_subvendor(dev) != 0x106b) { ctlr->r_type2 = SYS_RES_IOPORT; ctlr->r_rid2 = PCIR_BAR(5); ctlr->r_res2 = bus_alloc_resource_any(dev, ctlr->r_type2, diff --git a/sys/dev/ath/if_ath_ahb.c b/sys/dev/ath/if_ath_ahb.c index 33b8b92..a2bca05 100644 --- a/sys/dev/ath/if_ath_ahb.c +++ b/sys/dev/ath/if_ath_ahb.c @@ -123,7 +123,7 @@ ath_ahb_attach(device_t dev) device_printf(sc->sc_dev, "eeprom @ %p\n", (void *) eepromaddr); psc->sc_eeprom = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, (uintptr_t) eepromaddr, (uintptr_t) eepromaddr + (uintptr_t) ((ATH_EEPROM_DATA_SIZE * 2) - 1), 0, RF_ACTIVE); - if (psc->sc_sr == NULL) { + if (psc->sc_eeprom == NULL) { device_printf(dev, "cannot map eeprom space\n"); goto bad0; } @@ -139,6 +139,10 @@ ath_ahb_attach(device_t dev) /* Copy the EEPROM data out */ sc->sc_eepromdata = malloc(ATH_EEPROM_DATA_SIZE * 2, M_TEMP, M_NOWAIT | M_ZERO); + if (sc->sc_eepromdata == NULL) { + device_printf(dev, "cannot allocate memory for eeprom data\n"); + goto bad1; + } device_printf(sc->sc_dev, "eeprom data @ %p\n", (void *) rman_get_bushandle(psc->sc_eeprom)); /* XXX why doesn't this work? -adrian */ #if 0 diff --git a/sys/dev/puc/puc.c b/sys/dev/puc/puc.c index b6fa3c5..9bb3ceb 100644 --- a/sys/dev/puc/puc.c +++ b/sys/dev/puc/puc.c @@ -726,3 +726,41 @@ puc_bus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) } return (0); } + +int +puc_bus_print_child(device_t dev, device_t child) +{ + struct puc_port *port; + int retval; + + port = device_get_ivars(child); + retval = 0; + + retval += bus_print_child_header(dev, child); + retval += printf(" at port %d", port->p_nr); + retval += bus_print_child_footer(dev, child); + + return (retval); +} + +int +puc_bus_child_location_str(device_t dev, device_t child, char *buf, + size_t buflen) +{ + struct puc_port *port; + + port = device_get_ivars(child); + snprintf(buf, buflen, "port=%d", port->p_nr); + return (0); +} + +int +puc_bus_child_pnpinfo_str(device_t dev, device_t child, char *buf, + size_t buflen) +{ + struct puc_port *port; + + port = device_get_ivars(child); + snprintf(buf, buflen, "type=%d", port->p_type); + return (0); +} diff --git a/sys/dev/puc/puc_bfe.h b/sys/dev/puc/puc_bfe.h index f6d69c4..c67fab5 100644 --- a/sys/dev/puc/puc_bfe.h +++ b/sys/dev/puc/puc_bfe.h @@ -82,9 +82,12 @@ int puc_bfe_attach(device_t); int puc_bfe_detach(device_t); int puc_bfe_probe(device_t, const struct puc_cfg *); +int puc_bus_child_location_str(device_t, device_t, char *, size_t); +int puc_bus_child_pnpinfo_str(device_t, device_t, char *, size_t); struct resource *puc_bus_alloc_resource(device_t, device_t, int, int *, u_long, u_long, u_long, u_int); int puc_bus_get_resource(device_t, device_t, int, int, u_long *, u_long *); +int puc_bus_print_child(device_t, device_t); int puc_bus_read_ivar(device_t, device_t, int, uintptr_t *); int puc_bus_release_resource(device_t, device_t, int, int, struct resource *); int 
puc_bus_setup_intr(device_t, device_t, struct resource *, int, diff --git a/sys/dev/puc/puc_pccard.c b/sys/dev/puc/puc_pccard.c index 2cb9513..63d5787 100644 --- a/sys/dev/puc/puc_pccard.c +++ b/sys/dev/puc/puc_pccard.c @@ -82,7 +82,9 @@ static device_method_t puc_pccard_methods[] = { DEVMETHOD(bus_read_ivar, puc_bus_read_ivar), DEVMETHOD(bus_setup_intr, puc_bus_setup_intr), DEVMETHOD(bus_teardown_intr, puc_bus_teardown_intr), - DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_print_child, puc_bus_print_child), + DEVMETHOD(bus_child_pnpinfo_str, puc_bus_child_pnpinfo_str), + DEVMETHOD(bus_child_location_str, puc_bus_child_location_str), DEVMETHOD(bus_driver_added, bus_generic_driver_added), { 0, 0 } }; diff --git a/sys/dev/puc/puc_pci.c b/sys/dev/puc/puc_pci.c index 9a05b66..8c14717 100644 --- a/sys/dev/puc/puc_pci.c +++ b/sys/dev/puc/puc_pci.c @@ -132,7 +132,9 @@ static device_method_t puc_pci_methods[] = { DEVMETHOD(bus_read_ivar, puc_bus_read_ivar), DEVMETHOD(bus_setup_intr, puc_bus_setup_intr), DEVMETHOD(bus_teardown_intr, puc_bus_teardown_intr), - DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_print_child, puc_bus_print_child), + DEVMETHOD(bus_child_pnpinfo_str, puc_bus_child_pnpinfo_str), + DEVMETHOD(bus_child_location_str, puc_bus_child_location_str), DEVMETHOD(bus_driver_added, bus_generic_driver_added), { 0, 0 } }; diff --git a/sys/dev/safe/safe.c b/sys/dev/safe/safe.c index ac97098..18ef5e5 100644 --- a/sys/dev/safe/safe.c +++ b/sys/dev/safe/safe.c @@ -1580,9 +1580,12 @@ safe_callback(struct safe_softc *sc, struct safe_ringentry *re) * SHA-1 ICV's are byte-swapped; fix 'em up * before copy them to their destination. */ - bswap32(re->re_sastate.sa_saved_indigest[0]); - bswap32(re->re_sastate.sa_saved_indigest[1]); - bswap32(re->re_sastate.sa_saved_indigest[2]); + re->re_sastate.sa_saved_indigest[0] = + bswap32(re->re_sastate.sa_saved_indigest[0]); + re->re_sastate.sa_saved_indigest[1] = + bswap32(re->re_sastate.sa_saved_indigest[1]); + re->re_sastate.sa_saved_indigest[2] = + bswap32(re->re_sastate.sa_saved_indigest[2]); } crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, diff --git a/sys/dev/sound/pci/hda/hdac.c b/sys/dev/sound/pci/hda/hdac.c index 5bec624..bb0f385 100644 --- a/sys/dev/sound/pci/hda/hdac.c +++ b/sys/dev/sound/pci/hda/hdac.c @@ -754,7 +754,17 @@ static const struct { #define HDA_CODEC_CX20561 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5051) #define HDA_CODEC_CX20582 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5066) #define HDA_CODEC_CX20583 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5067) +#define HDA_CODEC_CX20584 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5068) #define HDA_CODEC_CX20585 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5069) +#define HDA_CODEC_CX20590 HDA_CODEC_CONSTRUCT(CONEXANT, 0x506e) +#define HDA_CODEC_CX20631 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5097) +#define HDA_CODEC_CX20632 HDA_CODEC_CONSTRUCT(CONEXANT, 0x5098) +#define HDA_CODEC_CX20641 HDA_CODEC_CONSTRUCT(CONEXANT, 0x50a1) +#define HDA_CODEC_CX20642 HDA_CODEC_CONSTRUCT(CONEXANT, 0x50a2) +#define HDA_CODEC_CX20651 HDA_CODEC_CONSTRUCT(CONEXANT, 0x50ab) +#define HDA_CODEC_CX20652 HDA_CODEC_CONSTRUCT(CONEXANT, 0x50ac) +#define HDA_CODEC_CX20664 HDA_CODEC_CONSTRUCT(CONEXANT, 0x50b8) +#define HDA_CODEC_CX20665 HDA_CODEC_CONSTRUCT(CONEXANT, 0x50b9) #define HDA_CODEC_CXXXXX HDA_CODEC_CONSTRUCT(CONEXANT, 0xffff) /* VIA */ @@ -939,7 +949,17 @@ static const struct { { HDA_CODEC_CX20561, "Conexant CX20561 (Hermosa)" }, { HDA_CODEC_CX20582, "Conexant CX20582 (Pebble)" }, { HDA_CODEC_CX20583, "Conexant CX20583 
(Pebble HSF)" }, + { HDA_CODEC_CX20584, "Conexant CX20584" }, { HDA_CODEC_CX20585, "Conexant CX20585" }, + { HDA_CODEC_CX20590, "Conexant CX20590" }, + { HDA_CODEC_CX20631, "Conexant CX20631" }, + { HDA_CODEC_CX20632, "Conexant CX20632" }, + { HDA_CODEC_CX20641, "Conexant CX20641" }, + { HDA_CODEC_CX20642, "Conexant CX20642" }, + { HDA_CODEC_CX20651, "Conexant CX20651" }, + { HDA_CODEC_CX20652, "Conexant CX20652" }, + { HDA_CODEC_CX20664, "Conexant CX20664" }, + { HDA_CODEC_CX20665, "Conexant CX20665" }, { HDA_CODEC_VT1708_8, "VIA VT1708_8" }, { HDA_CODEC_VT1708_9, "VIA VT1708_9" }, { HDA_CODEC_VT1708_A, "VIA VT1708_A" }, @@ -4126,7 +4146,10 @@ hdac_attach(device_t dev) uint16_t vendor; uint8_t v; - device_printf(dev, "HDA Driver Revision: %s\n", HDA_DRV_TEST_REV); + HDA_BOOTVERBOSE( + device_printf(dev, "HDA Driver Revision: %s\n", + HDA_DRV_TEST_REV); + ); model = (uint32_t)pci_get_device(dev) << 16; model |= (uint32_t)pci_get_vendor(dev) & 0x0000ffff; @@ -4921,6 +4944,25 @@ hdac_vendor_patch_parse(struct hdac_devinfo *devinfo) if (w != NULL) w->connsenable[0] = 0; break; + case HDA_CODEC_CX20582: + case HDA_CODEC_CX20583: + case HDA_CODEC_CX20584: + case HDA_CODEC_CX20585: + case HDA_CODEC_CX20590: + /* + * These codecs have extra connectivity on record side + * too reach for the present parser. + */ + w = hdac_widget_get(devinfo, 20); + if (w != NULL) + w->connsenable[1] = 0; + w = hdac_widget_get(devinfo, 21); + if (w != NULL) + w->connsenable[1] = 0; + w = hdac_widget_get(devinfo, 22); + if (w != NULL) + w->connsenable[0] = 0; + break; } } diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c index 8b412cf..e52c342 100644 --- a/sys/dev/xen/blkback/blkback.c +++ b/sys/dev/xen/blkback/blkback.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009-2010 Spectra Logic Corporation + * Copyright (c) 2009-2011 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -61,6 +61,8 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/vnode.h> #include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/bitstring.h> #include <geom/geom.h> @@ -153,9 +155,19 @@ MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); #define XBB_MAX_RING_PAGES \ BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ * XBB_MAX_REQUESTS) +/** + * The maximum number of ring pages that we can allow per request list. + * We limit this to the maximum number of segments per request, because + * that is already a reasonable number of segments to aggregate. This + * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, + * because that would leave situations where we can't dispatch even one + * large request. + */ +#define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST /*--------------------------- Forward Declarations ---------------------------*/ struct xbb_softc; +struct xbb_xen_req; static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); @@ -163,16 +175,15 @@ static int xbb_shutdown(struct xbb_softc *xbb); static int xbb_detach(device_t dev); /*------------------------------ Data Structures -----------------------------*/ -/** - * \brief Object tracking an in-flight I/O from a Xen VBD consumer. - */ -struct xbb_xen_req { - /** - * Linked list links used to aggregate idle request in the - * request free pool (xbb->request_free_slist). 
- */ - SLIST_ENTRY(xbb_xen_req) links; +STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); + +typedef enum { + XBB_REQLIST_NONE = 0x00, + XBB_REQLIST_MAPPED = 0x01 +} xbb_reqlist_flags; + +struct xbb_xen_reqlist { /** * Back reference to the parent block back instance for this * request. Used during bio_done handling. @@ -180,17 +191,71 @@ struct xbb_xen_req { struct xbb_softc *xbb; /** - * The remote domain's identifier for this I/O request. + * BLKIF_OP code for this request. + */ + int operation; + + /** + * Set to BLKIF_RSP_* to indicate request status. + * + * This field allows an error status to be recorded even if the + * delivery of this status must be deferred. Deferred reporting + * is necessary, for example, when an error is detected during + * completion processing of one bio when other bios for this + * request are still outstanding. + */ + int status; + + /** + * Number of 512 byte sectors not transferred. + */ + int residual_512b_sectors; + + /** + * Starting sector number of the first request in the list. + */ + off_t starting_sector_number; + + /** + * If we're going to coalesce, the next contiguous sector would be + * this one. + */ + off_t next_contig_sector; + + /** + * Number of child requests in the list. + */ + int num_children; + + /** + * Number of I/O requests dispatched to the backend. + */ + int pendcnt; + + /** + * Total number of segments for requests in the list. + */ + int nr_segments; + + /** + * Flags for this particular request list. */ - uint64_t id; + xbb_reqlist_flags flags; /** * Kernel virtual address space reserved for this request - * structure and used to map the remote domain's pages for + * list structure and used to map the remote domain's pages for * this I/O, into our domain's address space. */ uint8_t *kva; + /** + * Base, psuedo-physical address, corresponding to the start + * of this request's kva region. + */ + uint64_t gnt_base; + + #ifdef XBB_USE_BOUNCE_BUFFERS /** * Pre-allocated domain local memory used to proxy remote @@ -200,53 +265,91 @@ struct xbb_xen_req { #endif /** - * Base, psuedo-physical address, corresponding to the start - * of this request's kva region. + * Array of grant handles (one per page) used to map this request. */ - uint64_t gnt_base; + grant_handle_t *gnt_handles; + + /** + * Device statistics request ordering type (ordered or simple). + */ + devstat_tag_type ds_tag_type; + + /** + * Device statistics request type (read, write, no_data). + */ + devstat_trans_flags ds_trans_type; + + /** + * The start time for this request. + */ + struct bintime ds_t0; + + /** + * Linked list of contiguous requests with the same operation type. + */ + struct xbb_xen_req_list contig_req_list; + + /** + * Linked list links used to aggregate idle requests in the + * request list free pool (xbb->reqlist_free_stailq) and pending + * requests waiting for execution (xbb->reqlist_pending_stailq). + */ + STAILQ_ENTRY(xbb_xen_reqlist) links; +}; + +STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); + +/** + * \brief Object tracking an in-flight I/O from a Xen VBD consumer. + */ +struct xbb_xen_req { + /** + * Linked list links used to aggregate requests into a reqlist + * and to store them in the request free pool. + */ + STAILQ_ENTRY(xbb_xen_req) links; + + /** + * The remote domain's identifier for this I/O request. + */ + uint64_t id; /** * The number of pages currently mapped for this request. */ - int nr_pages; + int nr_pages; /** * The number of 512 byte sectors comprising this requests. 
*/ - int nr_512b_sectors; + int nr_512b_sectors; /** * The number of struct bio requests still outstanding for this * request on the backend device. This field is only used for * device (rather than file) backed I/O. */ - int pendcnt; + int pendcnt; /** * BLKIF_OP code for this request. */ - int operation; + int operation; /** - * BLKIF_RSP status code for this request. - * - * This field allows an error status to be recorded even if the - * delivery of this status must be deferred. Deferred reporting - * is necessary, for example, when an error is detected during - * completion processing of one bio when other bios for this - * request are still outstanding. + * Storage used for non-native ring requests. */ - int status; + blkif_request_t ring_req_storage; /** - * Device statistics request ordering type (ordered or simple). + * Pointer to the Xen request in the ring. */ - devstat_tag_type ds_tag_type; + blkif_request_t *ring_req; /** - * Device statistics request type (read, write, no_data). + * Consumer index for this request. */ - devstat_trans_flags ds_trans_type; + RING_IDX req_ring_idx; /** * The start time for this request. @@ -254,9 +357,9 @@ struct xbb_xen_req { struct bintime ds_t0; /** - * Array of grant handles (one per page) used to map this request. + * Pointer back to our parent request list. */ - grant_handle_t *gnt_handles; + struct xbb_xen_reqlist *reqlist; }; SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); @@ -321,7 +424,10 @@ typedef enum XBBF_RESOURCE_SHORTAGE = 0x04, /** Connection teardown in progress. */ - XBBF_SHUTDOWN = 0x08 + XBBF_SHUTDOWN = 0x08, + + /** A thread is already performing shutdown processing. */ + XBBF_IN_SHUTDOWN = 0x10 } xbb_flag_t; /** Backend device type. */ @@ -399,7 +505,7 @@ struct xbb_file_data { * Only a single file based request is outstanding per-xbb instance, * so we only need one of these. */ - struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; #ifdef XBB_USE_BOUNCE_BUFFERS /** @@ -411,7 +517,7 @@ struct xbb_file_data { * bounce-out the read data. This array serves as the temporary * storage for this saved data. */ - struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * \brief Array of memoized bounce buffer kva offsets used @@ -422,7 +528,7 @@ struct xbb_file_data { * the request sg elements is unavoidable. We memoize the computed * bounce address here to reduce the cost of the second walk. */ - void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST]; + void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; #endif /* XBB_USE_BOUNCE_BUFFERS */ }; @@ -437,9 +543,9 @@ union xbb_backend_data { /** * Function signature of backend specific I/O handlers. */ -typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req, - struct xbb_xen_req *req, int nseg, - int operation, int flags); +typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, + struct xbb_xen_reqlist *reqlist, int operation, + int flags); /** * Per-instance configuration data. @@ -467,14 +573,23 @@ struct xbb_softc { xbb_dispatch_t dispatch_io; /** The number of requests outstanding on the backend device/file. */ - u_int active_request_count; + int active_request_count; /** Free pool of request tracking structures. */ - struct xbb_xen_req_slist request_free_slist; + struct xbb_xen_req_list request_free_stailq; /** Array, sized at connection time, of request tracking structures. */ struct xbb_xen_req *requests; + /** Free pool of request list structures. 
*/ + struct xbb_xen_reqlist_list reqlist_free_stailq; + + /** List of pending request lists awaiting execution. */ + struct xbb_xen_reqlist_list reqlist_pending_stailq; + + /** Array, sized at connection time, of request list structures. */ + struct xbb_xen_reqlist *request_lists; + /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. @@ -487,6 +602,15 @@ struct xbb_softc { /** The size of the global kva pool. */ int kva_size; + /** The size of the KVA area used for request lists. */ + int reqlist_kva_size; + + /** The number of pages of KVA used for request lists */ + int reqlist_kva_pages; + + /** Bitmap of free KVA pages */ + bitstr_t *kva_free; + /** * \brief Cached value of the front-end's domain id. * @@ -508,12 +632,12 @@ struct xbb_softc { int abi; /** - * \brief The maximum number of requests allowed to be in - * flight at a time. + * \brief The maximum number of requests and request lists allowed + * to be in flight at a time. * * This value is negotiated via the XenStore. */ - uint32_t max_requests; + u_int max_requests; /** * \brief The maximum number of segments (1 page per segment) @@ -521,7 +645,15 @@ struct xbb_softc { * * This value is negotiated via the XenStore. */ - uint32_t max_request_segments; + u_int max_request_segments; + + /** + * \brief Maximum number of segments per request list. + * + * This value is derived from and will generally be larger than + * max_request_segments. + */ + u_int max_reqlist_segments; /** * The maximum size of any request to this back-end @@ -529,7 +661,13 @@ struct xbb_softc { * * This value is negotiated via the XenStore. */ - uint32_t max_request_size; + u_int max_request_size; + + /** + * The maximum size of any request list. This is derived directly + * from max_reqlist_segments. + */ + u_int max_reqlist_size; /** Various configuration and state bit flags. */ xbb_flag_t flags; @@ -574,6 +712,7 @@ struct xbb_softc { struct vnode *vn; union xbb_backend_data backend; + /** The native sector size of the backend. */ u_int sector_size; @@ -598,7 +737,14 @@ struct xbb_softc { * * Ring processing is serialized so we only need one of these. */ - struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; + + /** + * Temporary grant table map used in xbb_dispatch_io(). When + * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the + * stack could cause a stack overflow. + */ + struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; /** Mutex protecting per-instance data. */ struct mtx lock; @@ -614,8 +760,51 @@ struct xbb_softc { int pseudo_phys_res_id; #endif - /** I/O statistics. */ + /** + * I/O statistics from BlockBack dispatch down. These are + * coalesced requests, and we start them right before execution. + */ struct devstat *xbb_stats; + + /** + * I/O statistics coming into BlockBack. These are the requests as + * we get them from BlockFront. They are started as soon as we + * receive a request, and completed when the I/O is complete. 
+ */ + struct devstat *xbb_stats_in; + + /** Disable sending flush to the backend */ + int disable_flush; + + /** Send a real flush for every N flush requests */ + int flush_interval; + + /** Count of flush requests in the interval */ + int flush_count; + + /** Don't coalesce requests if this is set */ + int no_coalesce_reqs; + + /** Number of requests we have received */ + uint64_t reqs_received; + + /** Number of requests we have completed*/ + uint64_t reqs_completed; + + /** How many forced dispatches (i.e. without coalescing) have happend */ + uint64_t forced_dispatch; + + /** How many normal dispatches have happend */ + uint64_t normal_dispatch; + + /** How many total dispatches have happend */ + uint64_t total_dispatch; + + /** How many times we have run out of KVA */ + uint64_t kva_shortages; + + /** How many times we have run out of request structures */ + uint64_t request_shortages; }; /*---------------------------- Request Processing ----------------------------*/ @@ -633,21 +822,14 @@ xbb_get_req(struct xbb_softc *xbb) struct xbb_xen_req *req; req = NULL; - mtx_lock(&xbb->lock); - /* - * Do not allow new requests to be allocated while we - * are shutting down. - */ - if ((xbb->flags & XBBF_SHUTDOWN) == 0) { - if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) { - SLIST_REMOVE_HEAD(&xbb->request_free_slist, links); - xbb->active_request_count++; - } else { - xbb->flags |= XBBF_RESOURCE_SHORTAGE; - } + mtx_assert(&xbb->lock, MA_OWNED); + + if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { + STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); + xbb->active_request_count++; } - mtx_unlock(&xbb->lock); + return (req); } @@ -660,34 +842,40 @@ xbb_get_req(struct xbb_softc *xbb) static inline void xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { - int wake_thread; + mtx_assert(&xbb->lock, MA_OWNED); - mtx_lock(&xbb->lock); - wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE; - xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; - SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); + STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); xbb->active_request_count--; - if ((xbb->flags & XBBF_SHUTDOWN) != 0) { - /* - * Shutdown is in progress. See if we can - * progress further now that one more request - * has completed and been returned to the - * free pool. - */ - xbb_shutdown(xbb); - } - mtx_unlock(&xbb->lock); + KASSERT(xbb->active_request_count >= 0, + ("xbb_release_req: negative active count")); +} - if (wake_thread != 0) - taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); +/** + * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. + * + * \param xbb Per-instance xbb configuration structure. + * \param req_list The list of requests to free. + * \param nreqs The number of items in the list. + */ +static inline void +xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, + int nreqs) +{ + mtx_assert(&xbb->lock, MA_OWNED); + + STAILQ_CONCAT(&xbb->request_free_stailq, req_list); + xbb->active_request_count -= nreqs; + + KASSERT(xbb->active_request_count >= 0, + ("xbb_release_reqs: negative active count")); } /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's kva region. * - * \param req The request structure whose kva region will be accessed. + * \param reqlist The request structure whose kva region will be accessed. * \param pagenr The page index used to compute the kva offset. 
* \param sector The 512b sector index used to compute the page relative * kva offset. @@ -695,9 +883,9 @@ xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) * \return The computed global KVA offset. */ static inline uint8_t * -xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector) +xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { - return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9)); + return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } #ifdef XBB_USE_BOUNCE_BUFFERS @@ -705,7 +893,7 @@ xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector) * Given a page index and 512b sector offset within that page, * calculate an offset into a request's local bounce memory region. * - * \param req The request structure whose bounce region will be accessed. + * \param reqlist The request structure whose bounce region will be accessed. * \param pagenr The page index used to compute the bounce offset. * \param sector The 512b sector index used to compute the page relative * bounce offset. @@ -713,9 +901,9 @@ xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector) * \return The computed global bounce buffer address. */ static inline uint8_t * -xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector) +xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { - return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); + return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } #endif @@ -724,7 +912,7 @@ xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector) * calculate an offset into the request's memory region that the * underlying backend device/file should use for I/O. * - * \param req The request structure whose I/O region will be accessed. + * \param reqlist The request structure whose I/O region will be accessed. * \param pagenr The page index used to compute the I/O offset. * \param sector The 512b sector index used to compute the page relative * I/O offset. @@ -736,12 +924,12 @@ xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector) * this request. */ static inline uint8_t * -xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector) +xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { #ifdef XBB_USE_BOUNCE_BUFFERS - return (xbb_req_bounce_addr(req, pagenr, sector)); + return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); #else - return (xbb_req_vaddr(req, pagenr, sector)); + return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); #endif } @@ -750,7 +938,7 @@ xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector) * an offset into the local psuedo-physical address space used to map a * front-end's request data into a request. * - * \param req The request structure whose pseudo-physical region + * \param reqlist The request list structure whose pseudo-physical region * will be accessed. * \param pagenr The page index used to compute the pseudo-physical offset. * \param sector The 512b sector index used to compute the page relative @@ -763,10 +951,126 @@ xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector) * this request. 
*/ static inline uintptr_t -xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector) +xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) +{ + struct xbb_softc *xbb; + + xbb = reqlist->xbb; + + return ((uintptr_t)(xbb->gnt_base_addr + + (uintptr_t)(reqlist->kva - xbb->kva) + + (PAGE_SIZE * pagenr) + (sector << 9))); +} + +/** + * Get Kernel Virtual Address space for mapping requests. + * + * \param xbb Per-instance xbb configuration structure. + * \param nr_pages Number of pages needed. + * \param check_only If set, check for free KVA but don't allocate it. + * \param have_lock If set, xbb lock is already held. + * + * \return On success, a pointer to the allocated KVA region. Otherwise NULL. + * + * Note: This should be unnecessary once we have either chaining or + * scatter/gather support for struct bio. At that point we'll be able to + * put multiple addresses and lengths in one bio/bio chain and won't need + * to map everything into one virtual segment. + */ +static uint8_t * +xbb_get_kva(struct xbb_softc *xbb, int nr_pages) +{ + intptr_t first_clear, num_clear; + uint8_t *free_kva; + int i; + + KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); + + first_clear = 0; + free_kva = NULL; + + mtx_lock(&xbb->lock); + + /* + * Look for the first available page. If there are none, we're done. + */ + bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); + + if (first_clear == -1) + goto bailout; + + /* + * Starting at the first available page, look for consecutive free + * pages that will satisfy the user's request. + */ + for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { + /* + * If this is true, the page is used, so we have to reset + * the number of clear pages and the first clear page + * (since it pointed to a region with an insufficient number + * of clear pages). + */ + if (bit_test(xbb->kva_free, i)) { + num_clear = 0; + first_clear = -1; + continue; + } + + if (first_clear == -1) + first_clear = i; + + /* + * If this is true, we've found a large enough free region + * to satisfy the request. + */ + if (++num_clear == nr_pages) { + + bit_nset(xbb->kva_free, first_clear, + first_clear + nr_pages - 1); + + free_kva = xbb->kva + + (uint8_t *)(first_clear * PAGE_SIZE); + + KASSERT(free_kva >= (uint8_t *)xbb->kva && + free_kva + (nr_pages * PAGE_SIZE) <= + (uint8_t *)xbb->ring_config.va, + ("Free KVA %p len %d out of range, " + "kva = %#jx, ring VA = %#jx\n", free_kva, + nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, + (uintmax_t)xbb->ring_config.va)); + break; + } + } + +bailout: + + if (free_kva == NULL) { + xbb->flags |= XBBF_RESOURCE_SHORTAGE; + xbb->kva_shortages++; + } + + mtx_unlock(&xbb->lock); + + return (free_kva); +} + +/** + * Free allocated KVA. + * + * \param xbb Per-instance xbb configuration structure. + * \param kva_ptr Pointer to allocated KVA region. + * \param nr_pages Number of pages in the KVA region. + */ +static void +xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) { - return ((uintptr_t)(req->gnt_base - + (PAGE_SIZE * pagenr) + (sector << 9))); + intptr_t start_page; + + mtx_assert(&xbb->lock, MA_OWNED); + + start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; + bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); + } /** @@ -775,23 +1079,23 @@ xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector) * \param req The request structure to unmap. 
*/ static void -xbb_unmap_req(struct xbb_xen_req *req) +xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) { - struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; u_int i; u_int invcount; int error; invcount = 0; - for (i = 0; i < req->nr_pages; i++) { + for (i = 0; i < reqlist->nr_segments; i++) { - if (req->gnt_handles[i] == GRANT_REF_INVALID) + if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) continue; - unmap[invcount].host_addr = xbb_req_gntaddr(req, i, 0); + unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); unmap[invcount].dev_bus_addr = 0; - unmap[invcount].handle = req->gnt_handles[i]; - req->gnt_handles[i] = GRANT_REF_INVALID; + unmap[invcount].handle = reqlist->gnt_handles[i]; + reqlist->gnt_handles[i] = GRANT_REF_INVALID; invcount++; } @@ -801,6 +1105,175 @@ xbb_unmap_req(struct xbb_xen_req *req) } /** + * Allocate an internal transaction tracking structure from the free pool. + * + * \param xbb Per-instance xbb configuration structure. + * + * \return On success, a pointer to the allocated xbb_xen_reqlist structure. + * Otherwise NULL. + */ +static inline struct xbb_xen_reqlist * +xbb_get_reqlist(struct xbb_softc *xbb) +{ + struct xbb_xen_reqlist *reqlist; + + reqlist = NULL; + + mtx_assert(&xbb->lock, MA_OWNED); + + if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { + + STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); + reqlist->flags = XBB_REQLIST_NONE; + reqlist->kva = NULL; + reqlist->status = BLKIF_RSP_OKAY; + reqlist->residual_512b_sectors = 0; + reqlist->num_children = 0; + reqlist->nr_segments = 0; + STAILQ_INIT(&reqlist->contig_req_list); + } + + return (reqlist); +} + +/** + * Return an allocated transaction tracking structure to the free pool. + * + * \param xbb Per-instance xbb configuration structure. + * \param req The request list structure to free. + * \param wakeup If set, wakeup the work thread if freeing this reqlist + * during a resource shortage condition. + */ +static inline void +xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, + int wakeup) +{ + + mtx_lock(&xbb->lock); + + if (wakeup) { + wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; + xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; + } + + if (reqlist->kva != NULL) + xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); + + xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); + + STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); + + if ((xbb->flags & XBBF_SHUTDOWN) != 0) { + /* + * Shutdown is in progress. See if we can + * progress further now that one more request + * has completed and been returned to the + * free pool. + */ + xbb_shutdown(xbb); + } + + mtx_unlock(&xbb->lock); + + if (wakeup != 0) + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); +} + +/** + * Request resources and do basic request setup. + * + * \param xbb Per-instance xbb configuration structure. + * \param reqlist Pointer to reqlist pointer. + * \param ring_req Pointer to a block ring request. + * \param ring_index The ring index of this request. + * + * \return 0 for success, non-zero for failure. 
+ */ +static int +xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, + blkif_request_t *ring_req, RING_IDX ring_idx) +{ + struct xbb_xen_reqlist *nreqlist; + struct xbb_xen_req *nreq; + + nreqlist = NULL; + nreq = NULL; + + mtx_lock(&xbb->lock); + + /* + * We don't allow new resources to be allocated if we're in the + * process of shutting down. + */ + if ((xbb->flags & XBBF_SHUTDOWN) != 0) { + mtx_unlock(&xbb->lock); + return (1); + } + + /* + * Allocate a reqlist if the caller doesn't have one already. + */ + if (*reqlist == NULL) { + nreqlist = xbb_get_reqlist(xbb); + if (nreqlist == NULL) + goto bailout_error; + } + + /* We always allocate a request. */ + nreq = xbb_get_req(xbb); + if (nreq == NULL) + goto bailout_error; + + mtx_unlock(&xbb->lock); + + if (*reqlist == NULL) { + *reqlist = nreqlist; + nreqlist->operation = ring_req->operation; + nreqlist->starting_sector_number = ring_req->sector_number; + STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, + links); + } + + nreq->reqlist = *reqlist; + nreq->req_ring_idx = ring_idx; + + if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { + bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); + nreq->ring_req = &nreq->ring_req_storage; + } else { + nreq->ring_req = ring_req; + } + + binuptime(&nreq->ds_t0); + devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); + STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); + (*reqlist)->num_children++; + (*reqlist)->nr_segments += ring_req->nr_segments; + + return (0); + +bailout_error: + + /* + * We're out of resources, so set the shortage flag. The next time + * a request is released, we'll try waking up the work thread to + * see if we can allocate more resources. + */ + xbb->flags |= XBBF_RESOURCE_SHORTAGE; + xbb->request_shortages++; + + if (nreq != NULL) + xbb_release_req(xbb, nreq); + + mtx_unlock(&xbb->lock); + + if (nreqlist != NULL) + xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); + + return (1); +} + +/** * Create and transmit a response to a blkif request. * * \param xbb Per-instance xbb configuration structure. @@ -862,6 +1335,8 @@ xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) more_to_do = 1; } + xbb->reqs_completed++; + mtx_unlock(&xbb->lock); if (more_to_do) @@ -872,6 +1347,70 @@ xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) } /** + * Complete a request list. + * + * \param xbb Per-instance xbb configuration structure. + * \param reqlist Allocated internal request list structure. + */ +static void +xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) +{ + struct xbb_xen_req *nreq; + off_t sectors_sent; + + sectors_sent = 0; + + if (reqlist->flags & XBB_REQLIST_MAPPED) + xbb_unmap_reqlist(reqlist); + + /* + * All I/O is done, send the response. A lock should not be + * necessary here because the request list is complete, and + * therefore this is the only context accessing this request + * right now. The functions we call do their own locking if + * necessary. + */ + STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { + off_t cur_sectors_sent; + + xbb_send_response(xbb, nreq, reqlist->status); + + /* We don't report bytes sent if there is an error. 
*/ + if (reqlist->status == BLKIF_RSP_OKAY) + cur_sectors_sent = nreq->nr_512b_sectors; + else + cur_sectors_sent = 0; + + sectors_sent += cur_sectors_sent; + + devstat_end_transaction(xbb->xbb_stats_in, + /*bytes*/cur_sectors_sent << 9, + reqlist->ds_tag_type, + reqlist->ds_trans_type, + /*now*/NULL, + /*then*/&nreq->ds_t0); + } + + /* + * Take out any sectors not sent. If we wind up negative (which + * might happen if an error is reported as well as a residual), just + * report 0 sectors sent. + */ + sectors_sent -= reqlist->residual_512b_sectors; + if (sectors_sent < 0) + sectors_sent = 0; + + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/ sectors_sent << 9, + reqlist->ds_tag_type, + reqlist->ds_trans_type, + /*now*/NULL, + /*then*/&reqlist->ds_t0); + + xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); +} + +/** * Completion handler for buffer I/O requests issued by the device * backend driver. * @@ -881,18 +1420,34 @@ xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) static void xbb_bio_done(struct bio *bio) { - struct xbb_softc *xbb; - struct xbb_xen_req *req; + struct xbb_softc *xbb; + struct xbb_xen_reqlist *reqlist; + + reqlist = bio->bio_caller1; + xbb = reqlist->xbb; - req = bio->bio_caller1; - xbb = req->xbb; + reqlist->residual_512b_sectors += bio->bio_resid >> 9; - /* Only include transferred I/O in stats. */ - req->nr_512b_sectors -= bio->bio_resid >> 9; + /* + * This is a bit imprecise. With aggregated I/O a single + * request list can contain multiple front-end requests and + * a multiple bios may point to a single request. By carefully + * walking the request list, we could map residuals and errors + * back to the original front-end request, but the interface + * isn't sufficiently rich for us to properly report the error. + * So, we just treat the entire request list as having failed if an + * error occurs on any part. And, if an error occurs, we treat + * the amount of data transferred as 0. + * + * For residuals, we report it on the overall aggregated device, + * but not on the individual requests, since we don't currently + * do the work to determine which front-end request to which the + * residual applies. + */ if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", bio->bio_error, xbb->dev_name); - req->status = BLKIF_RSP_ERROR; + reqlist->status = BLKIF_RSP_ERROR; if (bio->bio_error == ENXIO && xenbus_get_state(xbb->dev) == XenbusStateConnected) { @@ -911,23 +1466,18 @@ xbb_bio_done(struct bio *bio) vm_offset_t kva_offset; kva_offset = (vm_offset_t)bio->bio_data - - (vm_offset_t)req->bounce; - memcpy((uint8_t *)req->kva + kva_offset, + - (vm_offset_t)reqlist->bounce; + memcpy((uint8_t *)reqlist->kva + kva_offset, bio->bio_data, bio->bio_bcount); } #endif /* XBB_USE_BOUNCE_BUFFERS */ - if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) { - xbb_unmap_req(req); - xbb_send_response(xbb, req, req->status); - devstat_end_transaction(xbb->xbb_stats, - /*bytes*/req->nr_512b_sectors << 9, - req->ds_tag_type, - req->ds_trans_type, - /*now*/NULL, - /*then*/&req->ds_t0); - xbb_release_req(xbb, req); - } + /* + * Decrement the pending count for the request list. When we're + * done with the requests, send status back for all of them. + */ + if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) + xbb_complete_reqlist(xbb, reqlist); g_destroy_bio(bio); } @@ -936,228 +1486,315 @@ xbb_bio_done(struct bio *bio) * Parse a blkif request into an internal request structure and send * it to the backend for processing. 
* - * \param xbb Per-instance xbb configuration structure. - * \param ring_req Front-end's I/O request as pulled from the shared - * communication ring. - * \param req Allocated internal request structure. - * \param req_ring_idx The location of ring_req within the shared - * communication ring. + * \param xbb Per-instance xbb configuration structure. + * \param reqlist Allocated internal request list structure. * + * \return On success, 0. For resource shortages, non-zero. + * * This routine performs the backend common aspects of request parsing * including compiling an internal request structure, parsing the S/G * list and any secondary ring requests in which they may reside, and * the mapping of front-end I/O pages into our domain. */ -static void -xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req, - struct xbb_xen_req *req, RING_IDX req_ring_idx) +static int +xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { - struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQUEST]; struct xbb_sg *xbb_sg; struct gnttab_map_grant_ref *map; struct blkif_request_segment *sg; struct blkif_request_segment *last_block_sg; + struct xbb_xen_req *nreq; u_int nseg; u_int seg_idx; u_int block_segs; int nr_sects; + int total_sects; int operation; uint8_t bio_flags; int error; - nseg = ring_req->nr_segments; - nr_sects = 0; - req->xbb = xbb; - req->id = ring_req->id; - req->operation = ring_req->operation; - req->status = BLKIF_RSP_OKAY; - req->ds_tag_type = DEVSTAT_TAG_SIMPLE; - req->nr_pages = nseg; - req->nr_512b_sectors = 0; + reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; bio_flags = 0; - sg = NULL; + total_sects = 0; + nr_sects = 0; - binuptime(&req->ds_t0); - devstat_start_transaction(xbb->xbb_stats, &req->ds_t0); + /* + * First determine whether we have enough free KVA to satisfy this + * request list. If not, tell xbb_run_queue() so it can go to + * sleep until we have more KVA. + */ + reqlist->kva = NULL; + if (reqlist->nr_segments != 0) { + reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); + if (reqlist->kva == NULL) { + /* + * If we're out of KVA, return ENOMEM. + */ + return (ENOMEM); + } + } + + binuptime(&reqlist->ds_t0); + devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); - switch (req->operation) { + switch (reqlist->operation) { case BLKIF_OP_WRITE_BARRIER: bio_flags |= BIO_ORDERED; - req->ds_tag_type = DEVSTAT_TAG_ORDERED; + reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; /* FALLTHROUGH */ case BLKIF_OP_WRITE: operation = BIO_WRITE; - req->ds_trans_type = DEVSTAT_WRITE; + reqlist->ds_trans_type = DEVSTAT_WRITE; if ((xbb->flags & XBBF_READ_ONLY) != 0) { DPRINTF("Attempt to write to read only device %s\n", xbb->dev_name); - goto fail_send_response; + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; } break; case BLKIF_OP_READ: operation = BIO_READ; - req->ds_trans_type = DEVSTAT_READ; + reqlist->ds_trans_type = DEVSTAT_READ; break; case BLKIF_OP_FLUSH_DISKCACHE: + /* + * If this is true, the user has requested that we disable + * flush support. So we just complete the requests + * successfully. + */ + if (xbb->disable_flush != 0) { + goto send_response; + } + + /* + * The user has requested that we only send a real flush + * for every N flush requests. So keep count, and either + * complete the request immediately or queue it for the + * backend. 
+ */ + if (xbb->flush_interval != 0) { + if (++(xbb->flush_count) < xbb->flush_interval) { + goto send_response; + } else + xbb->flush_count = 0; + } + operation = BIO_FLUSH; - req->ds_tag_type = DEVSTAT_TAG_ORDERED; - req->ds_trans_type = DEVSTAT_NO_DATA; + reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; + reqlist->ds_trans_type = DEVSTAT_NO_DATA; goto do_dispatch; /*NOTREACHED*/ default: DPRINTF("error: unknown block io operation [%d]\n", - req->operation); - goto fail_send_response; + reqlist->operation); + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; } - /* Check that number of segments is sane. */ - if (unlikely(nseg == 0) - || unlikely(nseg > xbb->max_request_segments)) { - DPRINTF("Bad number of segments in request (%d)\n", nseg); - goto fail_send_response; - } - - map = maps; + reqlist->xbb = xbb; xbb_sg = xbb->xbb_sgs; - block_segs = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); - sg = ring_req->seg; - last_block_sg = sg + block_segs; + map = xbb->maps; seg_idx = 0; - while (1) { - while (sg < last_block_sg) { - - xbb_sg->first_sect = sg->first_sect; - xbb_sg->last_sect = sg->last_sect; - xbb_sg->nsect = - (int8_t)(sg->last_sect - sg->first_sect + 1); - - if ((sg->last_sect >= (PAGE_SIZE >> 9)) - || (xbb_sg->nsect <= 0)) - goto fail_send_response; - - nr_sects += xbb_sg->nsect; - map->host_addr = xbb_req_gntaddr(req, seg_idx, - /*sector*/0); - map->flags = GNTMAP_host_map; - map->ref = sg->gref; - map->dom = xbb->otherend_id; - if (operation == BIO_WRITE) - map->flags |= GNTMAP_readonly; - sg++; - map++; - xbb_sg++; - seg_idx++; + STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { + blkif_request_t *ring_req; + RING_IDX req_ring_idx; + u_int req_seg_idx; + + ring_req = nreq->ring_req; + req_ring_idx = nreq->req_ring_idx; + nr_sects = 0; + nseg = ring_req->nr_segments; + nreq->id = ring_req->id; + nreq->nr_pages = nseg; + nreq->nr_512b_sectors = 0; + req_seg_idx = 0; + sg = NULL; + + /* Check that number of segments is sane. */ + if (unlikely(nseg == 0) + || unlikely(nseg > xbb->max_request_segments)) { + DPRINTF("Bad number of segments in request (%d)\n", + nseg); + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; } - block_segs = MIN(nseg - seg_idx, - BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); - if (block_segs == 0) - break; - - /* - * Fetch the next request block full of SG elements. - * For now, only the spacing between entries is different - * in the different ABIs, not the sg entry layout. 
- */ - req_ring_idx++; - switch (xbb->abi) { - case BLKIF_PROTOCOL_NATIVE: - sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native, - req_ring_idx); - break; - case BLKIF_PROTOCOL_X86_32: - { - sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32, - req_ring_idx); - break; - } - case BLKIF_PROTOCOL_X86_64: - { - sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64, - req_ring_idx); - break; - } - default: - panic("Unexpected blkif protocol ABI."); - /* NOTREACHED */ - } + block_segs = MIN(nreq->nr_pages, + BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); + sg = ring_req->seg; last_block_sg = sg + block_segs; - } + while (1) { + + while (sg < last_block_sg) { + KASSERT(seg_idx < + XBB_MAX_SEGMENTS_PER_REQLIST, + ("seg_idx %d is too large, max " + "segs %d\n", seg_idx, + XBB_MAX_SEGMENTS_PER_REQLIST)); + + xbb_sg->first_sect = sg->first_sect; + xbb_sg->last_sect = sg->last_sect; + xbb_sg->nsect = + (int8_t)(sg->last_sect - + sg->first_sect + 1); + + if ((sg->last_sect >= (PAGE_SIZE >> 9)) + || (xbb_sg->nsect <= 0)) { + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; + } + + nr_sects += xbb_sg->nsect; + map->host_addr = xbb_get_gntaddr(reqlist, + seg_idx, /*sector*/0); + KASSERT(map->host_addr + PAGE_SIZE <= + xbb->ring_config.gnt_addr, + ("Host address %#jx len %d overlaps " + "ring address %#jx\n", + (uintmax_t)map->host_addr, PAGE_SIZE, + (uintmax_t)xbb->ring_config.gnt_addr)); + + map->flags = GNTMAP_host_map; + map->ref = sg->gref; + map->dom = xbb->otherend_id; + if (operation == BIO_WRITE) + map->flags |= GNTMAP_readonly; + sg++; + map++; + xbb_sg++; + seg_idx++; + req_seg_idx++; + } + + block_segs = MIN(nseg - req_seg_idx, + BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); + if (block_segs == 0) + break; - /* Convert to the disk's sector size */ - req->nr_512b_sectors = nr_sects; - nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; + /* + * Fetch the next request block full of SG elements. + * For now, only the spacing between entries is + * different in the different ABIs, not the sg entry + * layout. 
+ */ + req_ring_idx++; + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native, + req_ring_idx); + break; + case BLKIF_PROTOCOL_X86_32: + { + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32, + req_ring_idx); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64, + req_ring_idx); + break; + } + default: + panic("Unexpected blkif protocol ABI."); + /* NOTREACHED */ + } + last_block_sg = sg + block_segs; + } - if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { - device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple " - "of the backing store sector size (%d)\n", - __func__, req->nr_512b_sectors << 9, - xbb->sector_size); - goto fail_send_response; + /* Convert to the disk's sector size */ + nreq->nr_512b_sectors = nr_sects; + nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; + total_sects += nr_sects; + + if ((nreq->nr_512b_sectors & + ((xbb->sector_size >> 9) - 1)) != 0) { + device_printf(xbb->dev, "%s: I/O size (%d) is not " + "a multiple of the backing store sector " + "size (%d)\n", __func__, + nreq->nr_512b_sectors << 9, + xbb->sector_size); + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; + } } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, - maps, req->nr_pages); + xbb->maps, reqlist->nr_segments); if (error != 0) panic("Grant table operation failed (%d)", error); - for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) { + reqlist->flags |= XBB_REQLIST_MAPPED; + + for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; + seg_idx++, map++){ if (unlikely(map->status != 0)) { - DPRINTF("invalid buffer -- could not remap it (%d)\n", - map->status); - DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x " - "ref 0x%x, dom %d\n", seg_idx, + DPRINTF("invalid buffer -- could not remap " + "it (%d)\n", map->status); + DPRINTF("Mapping(%d): Host Addr 0x%lx, flags " + "0x%x ref 0x%x, dom %d\n", seg_idx, map->host_addr, map->flags, map->ref, map->dom); - goto fail_unmap_req; + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; } - req->gnt_handles[seg_idx] = map->handle; + reqlist->gnt_handles[seg_idx] = map->handle; } - if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) { + if (reqlist->starting_sector_number + total_sects > + xbb->media_num_sectors) { DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " "extends past end of device %s\n", operation == BIO_READ ? 
"read" : "write", - ring_req->sector_number, - ring_req->sector_number + nr_sects, xbb->dev_name); - goto fail_unmap_req; + reqlist->starting_sector_number, + reqlist->starting_sector_number + total_sects, + xbb->dev_name); + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; } do_dispatch: error = xbb->dispatch_io(xbb, - ring_req, - req, - nseg, + reqlist, operation, bio_flags); if (error != 0) { - if (operation == BIO_FLUSH) - goto fail_send_response; - else - goto fail_unmap_req; + reqlist->status = BLKIF_RSP_ERROR; + goto send_response; } - return; + return (0); +send_response: -fail_unmap_req: - xbb_unmap_req(req); - /* FALLTHROUGH */ + xbb_complete_reqlist(xbb, reqlist); -fail_send_response: - xbb_send_response(xbb, req, BLKIF_RSP_ERROR); - xbb_release_req(xbb, req); - devstat_end_transaction(xbb->xbb_stats, - /*bytes*/0, - req->ds_tag_type, - req->ds_trans_type, - /*now*/NULL, - /*then*/&req->ds_t0); + return (0); +} + +static __inline int +xbb_count_sects(blkif_request_t *ring_req) +{ + int i; + int cur_size = 0; + + for (i = 0; i < ring_req->nr_segments; i++) { + int nsect; + + nsect = (int8_t)(ring_req->seg[i].last_sect - + ring_req->seg[i].first_sect + 1); + if (nsect <= 0) + break; + + cur_size += nsect; + } + + return (cur_size); } /** @@ -1172,95 +1809,210 @@ fail_send_response: static void xbb_run_queue(void *context, int pending) { - struct xbb_softc *xbb; - blkif_back_rings_t *rings; - RING_IDX rp; + struct xbb_softc *xbb; + blkif_back_rings_t *rings; + RING_IDX rp; + uint64_t cur_sector; + int cur_operation; + struct xbb_xen_reqlist *reqlist; - xbb = (struct xbb_softc *)context; - rings = &xbb->rings; + xbb = (struct xbb_softc *)context; + rings = &xbb->rings; /* - * Cache req_prod to avoid accessing a cache line shared - * with the frontend. + * Work gather and dispatch loop. Note that we have a bias here + * towards gathering I/O sent by blockfront. We first gather up + * everything in the ring, as long as we have resources. Then we + * dispatch one request, and then attempt to gather up any + * additional requests that have come in while we were dispatching + * the request. + * + * This allows us to get a clearer picture (via devstat) of how + * many requests blockfront is queueing to us at any given time. */ - rp = rings->common.sring->req_prod; + for (;;) { + int retval; - /* Ensure we see queued requests up to 'rp'. */ - rmb(); + /* + * Initialize reqlist to the last element in the pending + * queue, if there is one. This allows us to add more + * requests to that request list, if we have room. + */ + reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, + xbb_xen_reqlist, links); + if (reqlist != NULL) { + cur_sector = reqlist->next_contig_sector; + cur_operation = reqlist->operation; + } else { + cur_operation = 0; + cur_sector = 0; + } + + /* + * Cache req_prod to avoid accessing a cache line shared + * with the frontend. + */ + rp = rings->common.sring->req_prod; + + /* Ensure we see queued requests up to 'rp'. */ + rmb(); + + /** + * Run so long as there is work to consume and the generation + * of a response will not overflow the ring. + * + * @note There's a 1 to 1 relationship between requests and + * responses, so an overflow should never occur. This + * test is to protect our domain from digesting bogus + * data. Shouldn't we log this? 
+ */ + while (rings->common.req_cons != rp + && RING_REQUEST_CONS_OVERFLOW(&rings->common, + rings->common.req_cons) == 0){ + blkif_request_t ring_req_storage; + blkif_request_t *ring_req; + int cur_size; + + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + ring_req = RING_GET_REQUEST(&xbb->rings.native, + rings->common.req_cons); + break; + case BLKIF_PROTOCOL_X86_32: + { + struct blkif_x86_32_request *ring_req32; + + ring_req32 = RING_GET_REQUEST( + &xbb->rings.x86_32, rings->common.req_cons); + blkif_get_x86_32_req(&ring_req_storage, + ring_req32); + ring_req = &ring_req_storage; + break; + } + case BLKIF_PROTOCOL_X86_64: + { + struct blkif_x86_64_request *ring_req64; + + ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, + rings->common.req_cons); + blkif_get_x86_64_req(&ring_req_storage, + ring_req64); + ring_req = &ring_req_storage; + break; + } + default: + panic("Unexpected blkif protocol ABI."); + /* NOTREACHED */ + } - /** - * Run so long as there is work to consume and the generation - * of a response will not overflow the ring. - * - * @note There's a 1 to 1 relationship between requests and responses, - * so an overflow should never occur. This test is to protect - * our domain from digesting bogus data. Shouldn't we log this? - */ - while (rings->common.req_cons != rp - && RING_REQUEST_CONS_OVERFLOW(&rings->common, - rings->common.req_cons) == 0) { - blkif_request_t ring_req_storage; - blkif_request_t *ring_req; - struct xbb_xen_req *req; - RING_IDX req_ring_idx; - - req = xbb_get_req(xbb); - if (req == NULL) { /* - * Resource shortage has been recorded. - * We'll be scheduled to run once a request - * object frees up due to a completion. + * Check for situations that would require closing + * off this I/O for further coalescing: + * - Coalescing is turned off. + * - Current I/O is out of sequence with the previous + * I/O. + * - Coalesced I/O would be too large. */ - break; - } + if ((reqlist != NULL) + && ((xbb->no_coalesce_reqs != 0) + || ((xbb->no_coalesce_reqs == 0) + && ((ring_req->sector_number != cur_sector) + || (ring_req->operation != cur_operation) + || ((ring_req->nr_segments + reqlist->nr_segments) > + xbb->max_reqlist_segments))))) { + reqlist = NULL; + } - switch (xbb->abi) { - case BLKIF_PROTOCOL_NATIVE: - ring_req = RING_GET_REQUEST(&xbb->rings.native, - rings->common.req_cons); - break; - case BLKIF_PROTOCOL_X86_32: - { - struct blkif_x86_32_request *ring_req32; - - ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32, - rings->common.req_cons); - blkif_get_x86_32_req(&ring_req_storage, ring_req32); - ring_req = &ring_req_storage; - break; + /* + * Grab and check for all resources in one shot. + * If we can't get all of the resources we need, + * the shortage is noted and the thread will get + * woken up when more resources are available. + */ + retval = xbb_get_resources(xbb, &reqlist, ring_req, + xbb->rings.common.req_cons); + + if (retval != 0) { + /* + * Resource shortage has been recorded. + * We'll be scheduled to run once a request + * object frees up due to a completion. + */ + break; + } + + /* + * Signify that we can overwrite this request with + * a response by incrementing our consumer index. + * The response won't be generated until after + * we've already consumed all necessary data out + * of the version of the request in the ring buffer + * (for native mode). 
We must update the consumer + * index before issueing back-end I/O so there is + * no possibility that it will complete and a + * response be generated before we make room in + * the queue for that response. + */ + xbb->rings.common.req_cons += + BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); + xbb->reqs_received++; + + cur_size = xbb_count_sects(ring_req); + cur_sector = ring_req->sector_number + cur_size; + reqlist->next_contig_sector = cur_sector; + cur_operation = ring_req->operation; } - case BLKIF_PROTOCOL_X86_64: - { - struct blkif_x86_64_request *ring_req64; - - ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64, - rings->common.req_cons); - blkif_get_x86_64_req(&ring_req_storage, ring_req64); - ring_req = &ring_req_storage; + + /* Check for I/O to dispatch */ + reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); + if (reqlist == NULL) { + /* + * We're out of work to do, put the task queue to + * sleep. + */ break; } - default: - panic("Unexpected blkif protocol ABI."); - /* NOTREACHED */ - } /* - * Signify that we can overwrite this request with a - * response by incrementing our consumer index. The - * response won't be generated until after we've already - * consumed all necessary data out of the version of the - * request in the ring buffer (for native mode). We - * must update the consumer index before issueing back-end - * I/O so there is no possibility that it will complete - * and a response be generated before we make room in - * the queue for that response. + * Grab the first request off the queue and attempt + * to dispatch it. */ - req_ring_idx = xbb->rings.common.req_cons; - xbb->rings.common.req_cons += - BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); + STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); + + retval = xbb_dispatch_io(xbb, reqlist); + if (retval != 0) { + /* + * xbb_dispatch_io() returns non-zero only when + * there is a resource shortage. If that's the + * case, re-queue this request on the head of the + * queue, and go to sleep until we have more + * resources. + */ + STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, + reqlist, links); + break; + } else { + /* + * If we still have anything on the queue after + * removing the head entry, that is because we + * met one of the criteria to create a new + * request list (outlined above), and we'll call + * that a forced dispatch for statistical purposes. + * + * Otherwise, if there is only one element on the + * queue, we coalesced everything available on + * the ring and we'll call that a normal dispatch. + */ + reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); - xbb_dispatch_io(xbb, ring_req, req, req_ring_idx); + if (reqlist != NULL) + xbb->forced_dispatch++; + else + xbb->normal_dispatch++; + + xbb->total_dispatch++; + } } } @@ -1285,11 +2037,7 @@ xbb_intr(void *arg) * Backend handler for character device access. * * \param xbb Per-instance xbb configuration structure. - * \param ring_req Front-end's I/O request as pulled from the shared - * communication ring. - * \param req Allocated internal request structure. - * \param nseg The number of valid segments for this request in - * xbb->xbb_sgs. + * \param reqlist Allocated internal request list structure. * \param operation BIO_* I/O operation code. * \param bio_flags Additional bio_flag data to pass to any generated * bios (e.g. BIO_ORDERED).. @@ -1297,28 +2045,30 @@ xbb_intr(void *arg) * \return 0 for success, errno codes for failure. 
*/ static int -xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, - struct xbb_xen_req *req, int nseg, int operation, - int bio_flags) +xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, + int operation, int bio_flags) { struct xbb_dev_data *dev_data; - struct bio *bios[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; + struct xbb_xen_req *nreq; off_t bio_offset; struct bio *bio; struct xbb_sg *xbb_sg; u_int nbio; u_int bio_idx; + u_int nseg; u_int seg_idx; int error; dev_data = &xbb->backend.dev; - bio_offset = (off_t)ring_req->sector_number + bio_offset = (off_t)reqlist->starting_sector_number << xbb->sector_size_shift; error = 0; nbio = 0; bio_idx = 0; if (operation == BIO_FLUSH) { + nreq = STAILQ_FIRST(&reqlist->contig_req_list); bio = g_new_bio(); if (unlikely(bio == NULL)) { DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); @@ -1332,19 +2082,21 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = xbb_bio_done; - bio->bio_caller1 = req; + bio->bio_caller1 = nreq; bio->bio_pblkno = 0; - req->pendcnt = 1; + nreq->pendcnt = 1; - (*dev_data->csw->d_strategy)(bios[bio_idx]); + (*dev_data->csw->d_strategy)(bio); return (0); } - for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs; - seg_idx < nseg; - seg_idx++, xbb_sg++) { + xbb_sg = xbb->xbb_sgs; + bio = NULL; + nseg = reqlist->nr_segments; + + for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * KVA will not be contiguous, so any additional @@ -1353,10 +2105,10 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, if ((bio != NULL) && (xbb_sg->first_sect != 0)) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { - printf("%s: Discontiguous I/O request from " - "domain %d ends on non-sector " - "boundary\n", __func__, - xbb->otherend_id); + printf("%s: Discontiguous I/O request " + "from domain %d ends on " + "non-sector boundary\n", + __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } @@ -1365,12 +2117,12 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, if (bio == NULL) { /* - * Make sure that the start of this bio is aligned - * to a device sector. + * Make sure that the start of this bio is + * aligned to a device sector. 
*/ - if ((bio_offset & (xbb->sector_size - 1)) != 0) { - printf("%s: Misaligned I/O request from " - "domain %d\n", __func__, + if ((bio_offset & (xbb->sector_size - 1)) != 0){ + printf("%s: Misaligned I/O request " + "from domain %d\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; @@ -1385,12 +2137,11 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, bio->bio_flags |= bio_flags; bio->bio_dev = dev_data->cdev; bio->bio_offset = bio_offset; - bio->bio_data = xbb_req_ioaddr(req, seg_idx, - xbb_sg->first_sect); + bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, + xbb_sg->first_sect); bio->bio_done = xbb_bio_done; - bio->bio_caller1 = req; - bio->bio_pblkno = bio_offset - >> xbb->sector_size_shift; + bio->bio_caller1 = reqlist; + bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; } bio->bio_length += xbb_sg->nsect << 9; @@ -1400,10 +2151,10 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { - printf("%s: Discontiguous I/O request from " - "domain %d ends on non-sector " - "boundary\n", __func__, - xbb->otherend_id); + printf("%s: Discontiguous I/O request " + "from domain %d ends on " + "non-sector boundary\n", + __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } @@ -1415,7 +2166,7 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, } } - req->pendcnt = nbio; + reqlist->pendcnt = nbio; for (bio_idx = 0; bio_idx < nbio; bio_idx++) { @@ -1423,10 +2174,10 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, vm_offset_t kva_offset; kva_offset = (vm_offset_t)bios[bio_idx]->bio_data - - (vm_offset_t)req->bounce; + - (vm_offset_t)reqlist->bounce; if (operation == BIO_WRITE) { memcpy(bios[bio_idx]->bio_data, - (uint8_t *)req->kva + kva_offset, + (uint8_t *)reqlist->kva + kva_offset, bios[bio_idx]->bio_bcount); } #endif @@ -1438,7 +2189,7 @@ xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, fail_free_bios: for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) g_destroy_bio(bios[bio_idx]); - + return (error); } @@ -1446,24 +2197,21 @@ fail_free_bios: * Backend handler for file access. * * \param xbb Per-instance xbb configuration structure. - * \param ring_req Front-end's I/O request as pulled from the shared - * communication ring. - * \param req Allocated internal request structure. - * \param nseg The number of valid segments for this request in - * xbb->xbb_sgs. + * \param reqlist Allocated internal request list. * \param operation BIO_* I/O operation code. - * \param bio_flags Additional bio_flag data to pass to any generated bios + * \param flags Additional bio_flag data to pass to any generated bios * (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. 
*/ static int -xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, - struct xbb_xen_req *req, int nseg, int operation, - int flags) +xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, + int operation, int flags) { struct xbb_file_data *file_data; u_int seg_idx; + u_int nseg; + off_t sectors_sent; struct uio xuio; struct xbb_sg *xbb_sg; struct iovec *xiovec; @@ -1475,11 +2223,10 @@ xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, int error; file_data = &xbb->backend.file; + sectors_sent = 0; error = 0; bzero(&xuio, sizeof(xuio)); - req->pendcnt = 0; - switch (operation) { case BIO_READ: xuio.uio_rw = UIO_READ; @@ -1509,37 +2256,39 @@ xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, panic("invalid operation %d", operation); /* NOTREACHED */ } - xuio.uio_offset = (vm_offset_t)ring_req->sector_number + xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number << xbb->sector_size_shift; - xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = file_data->xiovecs; xuio.uio_iovcnt = 0; + xbb_sg = xbb->xbb_sgs; + nseg = reqlist->nr_segments; - for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs; - seg_idx < nseg; seg_idx++, xbb_sg++) { + for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* - * If the first sector is not 0, the KVA will not be - * contiguous and we'll need to go on to another segment. + * If the first sector is not 0, the KVA will + * not be contiguous and we'll need to go on + * to another segment. */ if (xbb_sg->first_sect != 0) xiovec = NULL; if (xiovec == NULL) { xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; - xiovec->iov_base = xbb_req_ioaddr(req, seg_idx, - xbb_sg->first_sect); + xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, + seg_idx, xbb_sg->first_sect); #ifdef XBB_USE_BOUNCE_BUFFERS /* - * Store the address of the incoming buffer at this - * particular offset as well, so we can do the copy - * later without having to do more work to - * recalculate this address. + * Store the address of the incoming + * buffer at this particular offset + * as well, so we can do the copy + * later without having to do more + * work to recalculate this address. */ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; - *p_vaddr = xbb_req_vaddr(req, seg_idx, - xbb_sg->first_sect); + *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, + xbb_sg->first_sect); #endif /* XBB_USE_BOUNCE_BUFFERS */ xiovec->iov_len = 0; xuio.uio_iovcnt++; @@ -1550,9 +2299,9 @@ xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, xuio.uio_resid += xbb_sg->nsect << 9; /* - * If the last sector is not the full page size count, - * the next segment will not be contiguous in KVA and we - * need a new iovec. + * If the last sector is not the full page + * size count, the next segment will not be + * contiguous in KVA and we need a new iovec. */ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) xiovec = NULL; @@ -1676,23 +2425,10 @@ xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, bailout_send_response: - /* - * All I/O is already done, send the response. A lock is not - * necessary here because we're single threaded, and therefore the - * only context accessing this request right now. If that changes, - * we may need some locking here. - */ - xbb_unmap_req(req); - xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY : - BLKIF_RSP_ERROR); - devstat_end_transaction(xbb->xbb_stats, - /*bytes*/error == 0 ? 
req->nr_512b_sectors << 9 - : 0, - req->ds_tag_type, - req->ds_trans_type, - /*now*/NULL, - /*then*/&req->ds_t0); - xbb_release_req(xbb, req); + if (error != 0) + reqlist->status = BLKIF_RSP_ERROR; + + xbb_complete_reqlist(xbb, reqlist); return (0); } @@ -1913,6 +2649,12 @@ xbb_open_backend(struct xbb_softc *xbb) DPRINTF("opening dev=%s\n", xbb->dev_name); + if (rootvnode == NULL) { + xenbus_dev_fatal(xbb->dev, ENOENT, + "Root file system not mounted"); + return (ENOENT); + } + if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; @@ -1996,11 +2738,39 @@ xbb_open_backend(struct xbb_softc *xbb) /*------------------------ Inter-Domain Communication ------------------------*/ /** - * Cleanup all inter-domain communication mechanisms. + * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xbb Per-instance xbb configuration structure. */ static void +xbb_free_communication_mem(struct xbb_softc *xbb) +{ + if (xbb->kva != 0) { +#ifndef XENHVM + kmem_free(kernel_map, xbb->kva, xbb->kva_size); +#else + if (xbb->pseudo_phys_res != NULL) { + bus_release_resource(xbb->dev, SYS_RES_MEMORY, + xbb->pseudo_phys_res_id, + xbb->pseudo_phys_res); + xbb->pseudo_phys_res = NULL; + } +#endif + } + xbb->kva = 0; + xbb->gnt_base_addr = 0; + if (xbb->kva_free != NULL) { + free(xbb->kva_free, M_XENBLOCKBACK); + xbb->kva_free = NULL; + } +} + +/** + * Cleanup all inter-domain communication mechanisms. + * + * \param xbb Per-instance xbb configuration structure. + */ +static int xbb_disconnect(struct xbb_softc *xbb) { struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; @@ -2011,13 +2781,24 @@ xbb_disconnect(struct xbb_softc *xbb) DPRINTF("\n"); if ((xbb->flags & XBBF_RING_CONNECTED) == 0) - return; + return (0); if (xbb->irq != 0) { unbind_from_irqhandler(xbb->irq); xbb->irq = 0; } + mtx_unlock(&xbb->lock); + taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); + mtx_lock(&xbb->lock); + + /* + * No new interrupts can generate work, but we must wait + * for all currently active requests to drain. + */ + if (xbb->active_request_count != 0) + return (EAGAIN); + for (ring_idx = 0, op = ops; ring_idx < xbb->ring_config.ring_pages; ring_idx++, op++) { @@ -2033,7 +2814,37 @@ xbb_disconnect(struct xbb_softc *xbb) if (error != 0) panic("Grant table op failed (%d)", error); + xbb_free_communication_mem(xbb); + + if (xbb->requests != NULL) { + free(xbb->requests, M_XENBLOCKBACK); + xbb->requests = NULL; + } + + if (xbb->request_lists != NULL) { + struct xbb_xen_reqlist *reqlist; + int i; + + /* There is one request list for ever allocated request. 
*/ + for (i = 0, reqlist = xbb->request_lists; + i < xbb->max_requests; i++, reqlist++){ +#ifdef XBB_USE_BOUNCE_BUFFERS + if (reqlist->bounce != NULL) { + free(reqlist->bounce, M_XENBLOCKBACK); + reqlist->bounce = NULL; + } +#endif + if (reqlist->gnt_handles != NULL) { + free(reqlist->gnt_handles, M_XENBLOCKBACK); + reqlist->gnt_handles = NULL; + } + } + free(xbb->request_lists, M_XENBLOCKBACK); + xbb->request_lists = NULL; + } + xbb->flags &= ~XBBF_RING_CONNECTED; + return (0); } /** @@ -2135,7 +2946,7 @@ xbb_connect_ring(struct xbb_softc *xbb) INTR_TYPE_BIO | INTR_MPSAFE, &xbb->irq); if (error) { - xbb_disconnect(xbb); + (void)xbb_disconnect(xbb); xenbus_dev_fatal(xbb->dev, error, "binding event channel"); return (error); } @@ -2145,6 +2956,10 @@ xbb_connect_ring(struct xbb_softc *xbb) return 0; } +/* Needed to make bit_alloc() macro work */ +#define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ + M_NOWAIT|M_ZERO); + /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our @@ -2158,9 +2973,18 @@ xbb_connect_ring(struct xbb_softc *xbb) static int xbb_alloc_communication_mem(struct xbb_softc *xbb) { - xbb->kva_size = (xbb->ring_config.ring_pages - + (xbb->max_requests * xbb->max_request_segments)) - * PAGE_SIZE; + xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; + xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; + xbb->kva_size = xbb->reqlist_kva_size + + (xbb->ring_config.ring_pages * PAGE_SIZE); + + xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); + if (xbb->kva_free == NULL) + return (ENOMEM); + + DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", + device_get_nameunit(xbb->dev), xbb->kva_size, + xbb->reqlist_kva_size); #ifndef XENHVM xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size); if (xbb->kva == 0) @@ -2185,31 +3009,11 @@ xbb_alloc_communication_mem(struct xbb_softc *xbb) xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); #endif /* XENHVM */ - return (0); -} -/** - * Free dynamically allocated KVA or pseudo-physical address allocations. - * - * \param xbb Per-instance xbb configuration structure. - */ -static void -xbb_free_communication_mem(struct xbb_softc *xbb) -{ - if (xbb->kva != 0) { -#ifndef XENHVM - kmem_free(kernel_map, xbb->kva, xbb->kva_size); -#else - if (xbb->pseudo_phys_res != NULL) { - bus_release_resource(xbb->dev, SYS_RES_MEMORY, - xbb->pseudo_phys_res_id, - xbb->pseudo_phys_res); - xbb->pseudo_phys_res = NULL; - } -#endif - } - xbb->kva = 0; - xbb->gnt_base_addr = 0; + DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", + device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, + (uintmax_t)xbb->gnt_base_addr); + return (0); } /** @@ -2228,6 +3032,14 @@ xbb_collect_frontend_info(struct xbb_softc *xbb) otherend_path = xenbus_get_otherend_path(xbb->dev); /* + * Protocol defaults valid even if all negotiation fails. + */ + xbb->ring_config.ring_pages = 1; + xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE); + xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; + xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; + + /* * Mandatory data (used in all versions of the protocol) first. */ error = xs_gather(XST_NIL, otherend_path, @@ -2255,19 +3067,19 @@ xbb_collect_frontend_info(struct xbb_softc *xbb) * tree. 
*/ (void)xs_scanf(XST_NIL, otherend_path, - "ring-pages", NULL, "%" PRIu32, + "ring-pages", NULL, "%u", &xbb->ring_config.ring_pages); (void)xs_scanf(XST_NIL, otherend_path, - "max-requests", NULL, "%" PRIu32, + "max-requests", NULL, "%u", &xbb->max_requests); (void)xs_scanf(XST_NIL, otherend_path, - "max-request-segments", NULL, "%" PRIu32, + "max-request-segments", NULL, "%u", &xbb->max_request_segments); (void)xs_scanf(XST_NIL, otherend_path, - "max-request-size", NULL, "%" PRIu32, + "max-request-size", NULL, "%u", &xbb->max_request_size); if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { @@ -2360,8 +3172,6 @@ xbb_alloc_requests(struct xbb_softc *xbb) { struct xbb_xen_req *req; struct xbb_xen_req *last_req; - uint8_t *req_kva; - u_long gnt_base; /* * Allocate request book keeping datastructures. @@ -2374,43 +3184,68 @@ xbb_alloc_requests(struct xbb_softc *xbb) return (ENOMEM); } - req_kva = (uint8_t *)xbb->kva; - gnt_base = xbb->gnt_base_addr; req = xbb->requests; last_req = &xbb->requests[xbb->max_requests - 1]; + STAILQ_INIT(&xbb->request_free_stailq); while (req <= last_req) { + STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); + req++; + } + return (0); +} + +static int +xbb_alloc_request_lists(struct xbb_softc *xbb) +{ + int i; + struct xbb_xen_reqlist *reqlist; + + /* + * If no requests can be merged, we need 1 request list per + * in flight request. + */ + xbb->request_lists = malloc(xbb->max_requests * + sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); + if (xbb->request_lists == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request list structures"); + return (ENOMEM); + } + + STAILQ_INIT(&xbb->reqlist_free_stailq); + STAILQ_INIT(&xbb->reqlist_pending_stailq); + for (i = 0; i < xbb->max_requests; i++) { int seg; - req->xbb = xbb; - req->kva = req_kva; - req->gnt_handles = malloc(xbb->max_request_segments - * sizeof(*req->gnt_handles), - M_XENBLOCKBACK, M_NOWAIT|M_ZERO); - if (req->gnt_handles == NULL) { - xenbus_dev_fatal(xbb->dev, ENOMEM, - "Unable to allocate request " - "grant references"); - return (ENOMEM); - } + reqlist = &xbb->request_lists[i]; + + reqlist->xbb = xbb; + #ifdef XBB_USE_BOUNCE_BUFFERS - req->bounce = malloc(xbb->max_request_size, - M_XENBLOCKBACK, M_NOWAIT); - if (req->bounce == NULL) { + reqlist->bounce = malloc(xbb->max_reqlist_size, + M_XENBLOCKBACK, M_NOWAIT); + if (reqlist->bounce == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "bounce buffers"); return (ENOMEM); } #endif /* XBB_USE_BOUNCE_BUFFERS */ - req->gnt_base = gnt_base; - req_kva += xbb->max_request_segments * PAGE_SIZE; - gnt_base += xbb->max_request_segments * PAGE_SIZE; - SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); - for (seg = 0; seg < xbb->max_request_segments; seg++) - req->gnt_handles[seg] = GRANT_REF_INVALID; + reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * + sizeof(*reqlist->gnt_handles), + M_XENBLOCKBACK, M_NOWAIT|M_ZERO); + if (reqlist->gnt_handles == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request " + "grant references"); + return (ENOMEM); + } + + for (seg = 0; seg < xbb->max_reqlist_segments; seg++) + reqlist->gnt_handles[seg] = GRANT_REF_INVALID; - req++; + STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); } return (0); } @@ -2491,6 +3326,22 @@ xbb_connect(struct xbb_softc *xbb) if (xbb_collect_frontend_info(xbb) != 0) return; + xbb->flags &= ~XBBF_SHUTDOWN; + + /* + * We limit the maximum number of reqlist segments to the maximum + * 
number of segments in the ring, or our absolute maximum, + * whichever is smaller. + */ + xbb->max_reqlist_segments = MIN(xbb->max_request_segments * + xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); + + /* + * The maximum size is simply a function of the number of segments + * we can handle. + */ + xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; + /* Allocate resources whose size depends on front-end configuration. */ error = xbb_alloc_communication_mem(xbb); if (error != 0) { @@ -2505,6 +3356,12 @@ xbb_connect(struct xbb_softc *xbb) return; } + error = xbb_alloc_request_lists(xbb); + if (error != 0) { + /* Specific errors are reported by xbb_alloc_request_lists(). */ + return; + } + /* * Connect communication channel. */ @@ -2520,7 +3377,7 @@ xbb_connect(struct xbb_softc *xbb) * in this connection, and waiting for a front-end state * change will not help the situation. */ - xbb_disconnect(xbb); + (void)xbb_disconnect(xbb); return; } @@ -2542,7 +3399,7 @@ xbb_connect(struct xbb_softc *xbb) static int xbb_shutdown(struct xbb_softc *xbb) { - static int in_shutdown; + int error; DPRINTF("\n"); @@ -2553,7 +3410,7 @@ xbb_shutdown(struct xbb_softc *xbb) * the same time. Tell the caller that hits this * race to try back later. */ - if (in_shutdown != 0) + if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) return (EAGAIN); DPRINTF("\n"); @@ -2561,20 +3418,30 @@ xbb_shutdown(struct xbb_softc *xbb) /* Indicate shutdown is in progress. */ xbb->flags |= XBBF_SHUTDOWN; - /* Wait for requests to complete. */ - if (xbb->active_request_count != 0) - return (EAGAIN); - - DPRINTF("\n"); - /* Disconnect from the front-end. */ - xbb_disconnect(xbb); + error = xbb_disconnect(xbb); + if (error != 0) { + /* + * Requests still outstanding. We'll be called again + * once they complete. + */ + KASSERT(error == EAGAIN, + ("%s: Unexpected xbb_disconnect() failure %d", + __func__, error)); - in_shutdown = 1; + return (error); + } + + DPRINTF("\n"); + + xbb->flags |= XBBF_IN_SHUTDOWN; mtx_unlock(&xbb->lock); - xenbus_set_state(xbb->dev, XenbusStateClosed); + + if (xenbus_get_state(xbb->dev) < XenbusStateClosing) + xenbus_set_state(xbb->dev, XenbusStateClosing); + mtx_lock(&xbb->lock); - in_shutdown = 0; + xbb->flags &= ~XBBF_IN_SHUTDOWN; /* Indicate to xbb_detach() that is it safe to proceed. */ wakeup(xbb); @@ -2634,6 +3501,77 @@ xbb_probe(device_t dev) } /** + * Setup sysctl variables to control various Block Back parameters. + * + * \param xbb Xen Block Back softc. 
+ * + */ +static void +xbb_setup_sysctl(struct xbb_softc *xbb) +{ + struct sysctl_ctx_list *sysctl_ctx = NULL; + struct sysctl_oid *sysctl_tree = NULL; + + sysctl_ctx = device_get_sysctl_ctx(xbb->dev); + if (sysctl_ctx == NULL) + return; + + sysctl_tree = device_get_sysctl_tree(xbb->dev); + if (sysctl_tree == NULL) + return; + + SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, + "fake the flush command"); + + SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, + "send a real flush for N flush requests"); + + SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, + "Don't coalesce contiguous requests"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "reqs_received", CTLFLAG_RW, &xbb->reqs_received, + "how many I/O requests we have received"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, + "how many I/O requests have been completed"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, + "how many I/O dispatches were forced"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, + "how many I/O dispatches were normal"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, + "total number of I/O dispatches"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, + "how many times we have run out of KVA"); + + SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "request_shortages", CTLFLAG_RW, + &xbb->request_shortages, + "how many times we have run out of requests"); + + SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, + "maximum outstanding requests (negotiated)"); + + SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, + "max_request_segments", CTLFLAG_RD, + &xbb->max_request_segments, 0, + "maximum number of pages per requests (negotiated)"); +} + +/** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Block Back instance. @@ -2643,8 +3581,8 @@ xbb_probe(device_t dev) static int xbb_attach(device_t dev) { - struct xbb_softc *xbb; - int error; + struct xbb_softc *xbb; + int error; DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); @@ -2658,15 +3596,6 @@ xbb_attach(device_t dev) xbb->otherend_id = xenbus_get_otherend_id(dev); TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); - SLIST_INIT(&xbb->request_free_slist); - - /* - * Protocol defaults valid even if all negotiation fails. 
- */ - xbb->ring_config.ring_pages = 1; - xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE); - xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; - xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; /* * Publish protocol capabilities for consumption by the @@ -2763,6 +3692,18 @@ xbb_attach(device_t dev) DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); + + xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), + xbb->sector_size, + DEVSTAT_ALL_SUPPORTED, + DEVSTAT_TYPE_DIRECT + | DEVSTAT_TYPE_IF_OTHER, + DEVSTAT_PRIORITY_OTHER); + /* + * Setup sysctl variables. + */ + xbb_setup_sysctl(xbb); + /* * Create a taskqueue for doing work that must occur from a * thread context. @@ -2797,7 +3738,7 @@ xbb_attach(device_t dev) } /** - * Detach from a block back device instanced. + * Detach from a block back device instance. * * \param dev NewBus device object representing this Xen Block Back instance. * @@ -2823,7 +3764,6 @@ xbb_detach(device_t dev) "xbb_shutdown", 0); } mtx_unlock(&xbb->lock); - mtx_destroy(&xbb->lock); DPRINTF("\n"); @@ -2833,8 +3773,10 @@ xbb_detach(device_t dev) if (xbb->xbb_stats != NULL) devstat_remove_entry(xbb->xbb_stats); + if (xbb->xbb_stats_in != NULL) + devstat_remove_entry(xbb->xbb_stats_in); + xbb_close_backend(xbb); - xbb_free_communication_mem(xbb); if (xbb->dev_mode != NULL) { free(xbb->dev_mode, M_XENBUS); @@ -2851,29 +3793,7 @@ xbb_detach(device_t dev) xbb->dev_name = NULL; } - if (xbb->requests != NULL) { - struct xbb_xen_req *req; - struct xbb_xen_req *last_req; - - req = xbb->requests; - last_req = &xbb->requests[xbb->max_requests - 1]; - while (req <= last_req) { -#ifdef XBB_USE_BOUNCE_BUFFERS - if (req->bounce != NULL) { - free(req->bounce, M_XENBLOCKBACK); - req->bounce = NULL; - } -#endif - if (req->gnt_handles != NULL) { - free (req->gnt_handles, M_XENBLOCKBACK); - req->gnt_handles = NULL; - } - req++; - } - free(xbb->requests, M_XENBLOCKBACK); - xbb->requests = NULL; - } - + mtx_destroy(&xbb->lock); return (0); } @@ -2926,22 +3846,24 @@ xbb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xbb_softc *xbb = device_get_softc(dev); - DPRINTF("state=%s\n", xenbus_strstate(frontend_state)); + DPRINTF("frontend_state=%s, xbb_state=%s\n", + xenbus_strstate(frontend_state), + xenbus_strstate(xenbus_get_state(xbb->dev))); switch (frontend_state) { case XenbusStateInitialising: - case XenbusStateClosing: break; case XenbusStateInitialised: case XenbusStateConnected: xbb_connect(xbb); break; + case XenbusStateClosing: case XenbusStateClosed: - case XenbusStateInitWait: - mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); + if (frontend_state == XenbusStateClosed) + xenbus_set_state(xbb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", diff --git a/sys/geom/geom.h b/sys/geom/geom.h index 1dc6eb1..6256572 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -76,6 +76,7 @@ typedef void g_orphan_t (struct g_consumer *); typedef void g_start_t (struct bio *); typedef void g_spoiled_t (struct g_consumer *); +typedef void g_attrchanged_t (struct g_consumer *, const char *attr); typedef void g_dumpconf_t (struct sbuf *, const char *indent, struct g_geom *, struct g_consumer *, struct g_provider *); @@ -100,6 +101,7 @@ struct g_class { */ g_start_t *start; g_spoiled_t *spoiled; + g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; @@ -128,6 +130,7 @@ struct g_geom { int rank; 
g_start_t *start; g_spoiled_t *spoiled; + g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; @@ -217,6 +220,7 @@ struct g_classifier_hook { /* geom_dev.c */ struct cdev; void g_dev_print(void); +void g_dev_physpath_changed(void); struct g_provider *g_dev_getprovider(struct cdev *dev); /* geom_dump.c */ @@ -232,6 +236,7 @@ typedef void g_event_t(void *, int flag); int g_post_event(g_event_t *func, void *arg, int flag, ...); int g_waitfor_event(g_event_t *func, void *arg, int flag, ...); void g_cancel_event(void *ref); +int g_attr_changed(struct g_provider *pp, const char *attr, int flag); void g_orphan_provider(struct g_provider *pp, int error); void g_waitidlelock(void); diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index f291b32..210f2ee 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/kernel.h> #include <sys/conf.h> +#include <sys/ctype.h> #include <sys/bio.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -52,6 +53,12 @@ __FBSDID("$FreeBSD$"); #include <sys/limits.h> #include <geom/geom.h> #include <geom/geom_int.h> +#include <machine/stdarg.h> + +/* + * Use the consumer private field to reference a physdev alias (if any). + */ +#define cp_alias_dev private static d_open_t g_dev_open; static d_close_t g_dev_close; @@ -72,12 +79,14 @@ static struct cdevsw g_dev_cdevsw = { static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; +static g_attrchanged_t g_dev_attrchanged; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .taste = g_dev_taste, .orphan = g_dev_orphan, + .attrchanged = g_dev_attrchanged }; void @@ -93,6 +102,40 @@ g_dev_print(void) printf("\n"); } +static void +g_dev_attrchanged(struct g_consumer *cp, const char *attr) +{ + + if (strcmp(attr, "GEOM::physpath") != 0) + return; + + if (g_access(cp, 1, 0, 0) == 0) { + char *physpath; + int error, physpath_len; + + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = + g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + g_access(cp, -1, 0, 0); + if (error == 0 && strlen(physpath) != 0) { + struct cdev *dev; + struct cdev *old_alias_dev; + struct cdev **alias_devp; + + dev = cp->geom->softc; + old_alias_dev = cp->cp_alias_dev; + alias_devp = (struct cdev **)&cp->cp_alias_dev; + make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, + dev, old_alias_dev, physpath); + } else if (cp->cp_alias_dev) { + destroy_dev((struct cdev *)cp->cp_alias_dev); + cp->cp_alias_dev = NULL; + } + g_free(physpath); + } +} + struct g_provider * g_dev_getprovider(struct cdev *dev) { @@ -107,7 +150,6 @@ g_dev_getprovider(struct cdev *dev) return (cp->provider); } - static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { @@ -167,6 +209,9 @@ g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) adev->si_drv1 = gp; adev->si_drv2 = cp; } + + g_dev_attrchanged(cp, "GEOM::physpath"); + return (gp); } @@ -365,6 +410,11 @@ g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread case DIOCGSTRIPEOFFSET: *(off_t *)data = cp->provider->stripeoffset; break; + case DIOCGPHYSPATH: + error = g_io_getattr("GEOM::physpath", cp, &i, data); + if (error == 0 && *(char *)data == '\0') + error = ENOENT; + break; default: if (cp->provider->geom->ioctl != NULL) { error = cp->provider->geom->ioctl(cp->provider, cmd, data, fflag, td); diff --git a/sys/geom/geom_disk.c 
b/sys/geom/geom_disk.c index e663e3d..17cae68 100644 --- a/sys/geom/geom_disk.c +++ b/sys/geom/geom_disk.c @@ -347,6 +347,15 @@ g_disk_start(struct bio *bp) } while (bp2 != NULL); break; case BIO_GETATTR: + /* Give the driver a chance to override */ + if (dp->d_getattr != NULL) { + if (bp->bio_disk == NULL) + bp->bio_disk = dp; + error = dp->d_getattr(bp); + if (error != -1) + break; + error = EJUSTRETURN; + } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; @@ -582,6 +591,18 @@ disk_gone(struct disk *dp) g_wither_provider(pp, ENXIO); } +void +disk_attr_changed(struct disk *dp, const char *attr, int flag) +{ + struct g_geom *gp; + struct g_provider *pp; + + gp = dp->d_geom; + if (gp != NULL) + LIST_FOREACH(pp, &gp->provider, provider) + (void)g_attr_changed(pp, attr, flag); +} + static void g_kern_disks(void *p, int flag __unused) { diff --git a/sys/geom/geom_disk.h b/sys/geom/geom_disk.h index 2d5f15d..e92f4aa 100644 --- a/sys/geom/geom_disk.h +++ b/sys/geom/geom_disk.h @@ -49,6 +49,7 @@ struct disk; typedef int disk_open_t(struct disk *); typedef int disk_close_t(struct disk *); typedef void disk_strategy_t(struct bio *bp); +typedef int disk_getattr_t(struct bio *bp); typedef int disk_ioctl_t(struct disk *, u_long cmd, void *data, int fflag, struct thread *td); /* NB: disk_ioctl_t SHALL be cast'able to d_ioctl_t */ @@ -75,6 +76,7 @@ struct disk { disk_strategy_t *d_strategy; disk_ioctl_t *d_ioctl; dumper_t *d_dump; + disk_getattr_t *d_getattr; /* Info fields from driver to geom_disk.c. Valid when open */ u_int d_sectorsize; @@ -104,6 +106,7 @@ struct disk *disk_alloc(void); void disk_create(struct disk *disk, int version); void disk_destroy(struct disk *disk); void disk_gone(struct disk *disk); +void disk_attr_changed(struct disk *dp, const char *attr, int flag); #define DISK_VERSION_00 0x58561059 #define DISK_VERSION_01 0x5856105a diff --git a/sys/geom/geom_event.c b/sys/geom/geom_event.c index d6e5498..1e2fc8d 100644 --- a/sys/geom/geom_event.c +++ b/sys/geom/geom_event.c @@ -110,6 +110,53 @@ g_waitidlelock(void) } #endif +struct g_attrchanged_args { + struct g_provider *pp; + const char *attr; +}; + +static void +g_attr_changed_event(void *arg, int flag) +{ + struct g_attrchanged_args *args; + struct g_provider *pp; + struct g_consumer *cp; + struct g_consumer *next_cp; + + args = arg; + pp = args->pp; + + g_topology_assert(); + if (flag != EV_CANCEL && g_shutdown == 0) { + + /* + * Tell all consumers of the change. + */ + LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { + if (cp->geom->attrchanged != NULL) + cp->geom->attrchanged(cp, args->attr); + } + } + g_free(args); +} + +int +g_attr_changed(struct g_provider *pp, const char *attr, int flag) +{ + struct g_attrchanged_args *args; + int error; + + args = g_malloc(sizeof *args, flag); + if (args == NULL) + return (ENOMEM); + args->pp = pp; + args->attr = attr; + error = g_post_event(g_attr_changed_event, args, flag, pp, NULL); + if (error != 0) + g_free(args); + return (error); +} + void g_orphan_provider(struct g_provider *pp, int error) { diff --git a/sys/geom/geom_subr.c b/sys/geom/geom_subr.c index a0958f3..6e2589b 100644 --- a/sys/geom/geom_subr.c +++ b/sys/geom/geom_subr.c @@ -350,6 +350,7 @@ g_new_geomf(struct g_class *mp, const char *fmt, ...) 
/* Fill in defaults from class */ gp->start = mp->start; gp->spoiled = mp->spoiled; + gp->attrchanged = mp->attrchanged; gp->dumpconf = mp->dumpconf; gp->access = mp->access; gp->orphan = mp->orphan; diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index ec5b113..c4548c6 100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -308,6 +308,7 @@ options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) +device xhci # XHCI PCI->USB interface (USB 3.0) device usb # USB Bus (required) #device udbp # USB Double Bulk Pipe devices (needs netgraph) device uhid # "Human Interface Devices" @@ -350,3 +351,11 @@ device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver device dcons_crom # Configuration ROM for dcons + +# Sound support +device sound # Generic sound driver (required) +device snd_es137x # Ensoniq AudioPCI ES137x +device snd_hda # Intel High Definition Audio +device snd_ich # Intel, NVidia and other ICH AC'97 Audio +device snd_uaudio # USB Audio +device snd_via8233 # VIA VT8233x Audio diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c index 59b876c..a4d90c7 100644 --- a/sys/kern/kern_conf.c +++ b/sys/kern/kern_conf.c @@ -963,6 +963,68 @@ make_dev_alias_p(int flags, struct cdev **cdev, struct cdev *pdev, return (res); } +int +make_dev_physpath_alias(int flags, struct cdev **cdev, struct cdev *pdev, + struct cdev *old_alias, const char *physpath) +{ + char *devfspath; + int physpath_len; + int max_parentpath_len; + int parentpath_len; + int devfspathbuf_len; + int mflags; + int ret; + + *cdev = NULL; + devfspath = NULL; + physpath_len = strlen(physpath); + ret = EINVAL; + if (physpath_len == 0) + goto out; + + if (strncmp("id1,", physpath, 4) == 0) { + physpath += 4; + physpath_len -= 4; + if (physpath_len == 0) + goto out; + } + + max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1; + parentpath_len = strlen(pdev->si_name); + if (max_parentpath_len < parentpath_len) { + printf("make_dev_physpath_alias: WARNING - Unable to alias %s " + "to %s/%s - path too long\n", + pdev->si_name, physpath, pdev->si_name); + ret = ENAMETOOLONG; + goto out; + } + + mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; + devfspathbuf_len = physpath_len + /*/*/1 + parentpath_len + /*NUL*/1; + devfspath = malloc(devfspathbuf_len, M_DEVBUF, mflags); + if (devfspath == NULL) { + ret = ENOMEM; + goto out; + } + + sprintf(devfspath, "%s/%s", physpath, pdev->si_name); + if (old_alias != NULL + && strcmp(old_alias->si_name, devfspath) == 0) { + /* Retain the existing alias. 
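The GEOM changes in this commit route a new GEOM::physpath attribute to consumers and expose it to userland both as a /dev alias (via make_dev_physpath_alias()) and through a DIOCGPHYSPATH ioctl handled in g_dev_ioctl(). The following is only a minimal user-space sketch of querying that path, not part of the commit; it assumes DIOCGPHYSPATH is declared in <sys/disk.h> with a MAXPATHLEN-sized character buffer, as the handler added here implies, and that ENOENT is returned when no physical path is known.

#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/disk.h>		/* assumed to declare DIOCGPHYSPATH */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	char physpath[MAXPATHLEN];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s /dev/<disk>\n", argv[0]);
		return (1);
	}
	fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("open");
		return (1);
	}
	/* ENOENT means the provider reported an empty GEOM::physpath. */
	if (ioctl(fd, DIOCGPHYSPATH, physpath) == -1)
		perror("DIOCGPHYSPATH");
	else
		printf("physical path: %s\n", physpath);
	close(fd);
	return (0);
}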
*/ + *cdev = old_alias; + old_alias = NULL; + ret = 0; + } else { + ret = make_dev_alias_p(flags, cdev, pdev, devfspath); + } +out: + if (old_alias != NULL) + destroy_dev(old_alias); + if (devfspath != NULL) + free(devfspath, M_DEVBUF); + return (ret); +} + static void destroy_devl(struct cdev *dev) { diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 6510e13..bb25d17 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -701,8 +701,9 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options, */ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); - p->p_oppid = 0; proc_reparent(p, t); + p->p_pptr->p_dbg_child--; + p->p_oppid = 0; PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); @@ -794,7 +795,8 @@ kern_wait(struct thread *td, pid_t pid, int *status, int options, pid = -q->p_pgid; PROC_UNLOCK(q); } - if (options &~ (WUNTRACED|WNOHANG|WCONTINUED|WNOWAIT|WLINUXCLONE)) + /* If we don't know the option, just return. */ + if (options & ~(WUNTRACED|WNOHANG|WCONTINUED|WNOWAIT|WLINUXCLONE)) return (EINVAL); loop: if (q->p_flag & P_STATCHILD) { @@ -873,7 +875,10 @@ loop: } if (nfound == 0) { sx_xunlock(&proctree_lock); - return (ECHILD); + if (td->td_proc->p_dbg_child) + return (0); + else + return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c index 24963d5..eaf6427 100644 --- a/sys/kern/subr_devstat.c +++ b/sys/kern/subr_devstat.c @@ -49,8 +49,9 @@ static long devstat_generation = 1; static int devstat_version = DEVSTAT_VERSION; static int devstat_current_devnumber; static struct mtx devstat_mutex; +MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF); -static struct devstatlist device_statq; +static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq); static struct devstat *devstat_alloc(void); static void devstat_free(struct devstat *); static void devstat_add_entry(struct devstat *ds, const void *dev_name, @@ -70,13 +71,7 @@ devstat_new_entry(const void *dev_name, devstat_priority priority) { struct devstat *ds; - static int once; - if (!once) { - STAILQ_INIT(&device_statq); - mtx_init(&devstat_mutex, "devstat", NULL, MTX_DEF); - once = 1; - } mtx_assert(&devstat_mutex, MA_NOTOWNED); ds = devstat_alloc(); @@ -476,8 +471,9 @@ devstat_alloc(void) mtx_assert(&devstat_mutex, MA_NOTOWNED); if (!once) { - make_dev_credf(MAKEDEV_ETERNAL, &devstat_cdevsw, 0, NULL, - UID_ROOT, GID_WHEEL, 0400, DEVSTAT_DEVICE_NAME); + make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME, + &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400, + DEVSTAT_DEVICE_NAME); once = 1; } spp2 = NULL; diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c index 1d67864..c2f6e99 100644 --- a/sys/kern/subr_kdb.c +++ b/sys/kern/subr_kdb.c @@ -244,29 +244,44 @@ kdb_reboot(void) #define KEY_CRTLP 16 /* ^P */ #define KEY_CRTLR 18 /* ^R */ +/* States of th KDB "alternate break sequence" detecting state machine. */ +enum { + KDB_ALT_BREAK_SEEN_NONE, + KDB_ALT_BREAK_SEEN_CR, + KDB_ALT_BREAK_SEEN_CR_TILDE, +}; + int kdb_alt_break(int key, int *state) { int brk; + /* All states transition to KDB_ALT_BREAK_SEEN_CR on a CR. 
*/ + if (key == KEY_CR) { + *state = KDB_ALT_BREAK_SEEN_CR; + return (0); + } + brk = 0; switch (*state) { - case 0: - if (key == KEY_CR) - *state = 1; - break; - case 1: + case KDB_ALT_BREAK_SEEN_CR: + *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_TILDE) - *state = 2; + *state = KDB_ALT_BREAK_SEEN_CR_TILDE; break; - case 2: + case KDB_ALT_BREAK_SEEN_CR_TILDE: + *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_CRTLB) brk = KDB_REQ_DEBUGGER; else if (key == KEY_CRTLP) brk = KDB_REQ_PANIC; else if (key == KEY_CRTLR) brk = KDB_REQ_REBOOT; - *state = 0; + break; + case KDB_ALT_BREAK_SEEN_NONE: + default: + *state = KDB_ALT_BREAK_SEEN_NONE; + break; } return (brk); } diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index a7f280a..a4c0069 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -831,8 +831,11 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) /* security check done above */ p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; - if (p->p_pptr != td->td_proc) + if (p->p_pptr != td->td_proc) { + /* Remember that a child is being debugged(traced). */ + p->p_pptr->p_dbg_child++; proc_reparent(p, td->td_proc); + } data = SIGSTOP; goto sendsig; /* in PT_CONTINUE below */ @@ -919,11 +922,12 @@ kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) PROC_UNLOCK(pp); PROC_LOCK(p); proc_reparent(p, pp); + p->p_pptr->p_dbg_child--; if (pp == initproc) p->p_sigparent = SIGCHLD; } - p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK); p->p_oppid = 0; + p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK); /* should we send SIGCHLD? */ /* childproc_continued(p); */ diff --git a/sys/net/if.h b/sys/net/if.h index d291da8..06521cb 100644 --- a/sys/net/if.h +++ b/sys/net/if.h @@ -199,6 +199,13 @@ struct if_data { * field. IFCAP_* and CSUM_* do not match one to one and CSUM_* may be * more detailed or differenciated than IFCAP_*. * Hwassist features are defined CSUM_* in sys/mbuf.h + * + * Capabilities that cannot be arbitrarily changed with ifconfig/ioctl + * are listed in IFCAP_CANTCHANGE, similar to IFF_CANTCHANGE. + * This is not strictly necessary because the common code never + * changes capabilities, and it is left to the individual driver + * to do the right thing. However, having the filter here + * avoids replication of the same code in all individual drivers. 
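The subr_kdb.c rework shown here gives the alternate-break detector named states and the invariant that a carriage return re-arms the sequence from any state. Purely as an illustration, and not the kernel code itself, the sketch below is a self-contained user-space version of the same recognizer; the CR and '~' values are plain ASCII assumptions and the state names and request constant are stand-ins for the KDB_ALT_BREAK_* and KDB_REQ_DEBUGGER symbols used in the patch.

#include <stdio.h>

#define KEY_CR		13	/* carriage return */
#define KEY_TILDE	126	/* '~' */
#define KEY_CRTLB	2	/* ^B */

enum { SEEN_NONE, SEEN_CR, SEEN_CR_TILDE };

/* Return nonzero once the "CR ~ ^B" sequence has been typed. */
static int
alt_break(int key, int *state)
{
	int brk = 0;

	if (key == KEY_CR) {
		/* A CR (re)arms the detector from any state. */
		*state = SEEN_CR;
		return (0);
	}
	switch (*state) {
	case SEEN_CR:
		*state = (key == KEY_TILDE) ? SEEN_CR_TILDE : SEEN_NONE;
		break;
	case SEEN_CR_TILDE:
		if (key == KEY_CRTLB)
			brk = 1;	/* would request the debugger */
		*state = SEEN_NONE;
		break;
	default:
		*state = SEEN_NONE;
		break;
	}
	return (brk);
}

int
main(void)
{
	const char input[] = "x\r~\002";	/* noise, then CR ~ ^B */
	const char *p;
	int state = SEEN_NONE;

	for (p = input; *p != '\0'; p++)
		if (alt_break((unsigned char)*p, &state))
			printf("break sequence detected\n");
	return (0);
}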
*/ #define IFCAP_RXCSUM 0x00001 /* can offload checksum on RX */ #define IFCAP_TXCSUM 0x00002 /* can offload checksum on TX */ @@ -220,12 +227,15 @@ struct if_data { #define IFCAP_POLLING_NOCOUNT 0x20000 /* polling ticks cannot be fragmented */ #define IFCAP_VLAN_HWTSO 0x40000 /* can do IFCAP_TSO on VLANs */ #define IFCAP_LINKSTATE 0x80000 /* the runtime link state is dynamic */ +#define IFCAP_NETMAP 0x100000 /* netmap mode supported/enabled */ #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) #define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) #define IFCAP_WOL (IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC) #define IFCAP_TOE (IFCAP_TOE4 | IFCAP_TOE6) +#define IFCAP_CANTCHANGE (IFCAP_NETMAP) + #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 999557a..72d36b1 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -373,14 +373,14 @@ rts_sockaddr(struct socket *so, struct sockaddr **nam) static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, - .pru_bind = rts_bind, - .pru_connect = rts_connect, + .pru_bind = raw_bind, + .pru_connect = raw_connect, .pru_detach = rts_detach, - .pru_disconnect = rts_disconnect, - .pru_peeraddr = rts_peeraddr, - .pru_send = rts_send, - .pru_shutdown = rts_shutdown, - .pru_sockaddr = rts_sockaddr, + .pru_disconnect = raw_disconnect, + .pru_peeraddr = raw_peeraddr, + .pru_send = raw_send, + .pru_shutdown = raw_shutdown, + .pru_sockaddr = raw_sockaddr, .pru_close = rts_close, }; diff --git a/sys/net80211/ieee80211_acl.c b/sys/net80211/ieee80211_acl.c index cb20b87..da505e3 100644 --- a/sys/net80211/ieee80211_acl.c +++ b/sys/net80211/ieee80211_acl.c @@ -77,7 +77,7 @@ struct acl { struct aclstate { acl_lock_t as_lock; int as_policy; - int as_nacls; + uint32_t as_nacls; TAILQ_HEAD(, acl) as_list; /* list of all ACL's */ LIST_HEAD(, acl) as_hash[ACL_HASHSIZE]; struct ieee80211vap *as_vap; @@ -289,7 +289,8 @@ acl_getioctl(struct ieee80211vap *vap, struct ieee80211req *ireq) struct aclstate *as = vap->iv_as; struct acl *acl; struct ieee80211req_maclist *ap; - int error, space, i; + int error; + uint32_t i, space; switch (ireq->i_val) { case IEEE80211_MACCMD_POLICY: diff --git a/sys/net80211/ieee80211_ioctl.c b/sys/net80211/ieee80211_ioctl.c index 37d5dbe..f8e1785 100644 --- a/sys/net80211/ieee80211_ioctl.c +++ b/sys/net80211/ieee80211_ioctl.c @@ -143,7 +143,7 @@ static __noinline int ieee80211_ioctl_getchaninfo(struct ieee80211vap *vap, struct ieee80211req *ireq) { struct ieee80211com *ic = vap->iv_ic; - int space; + uint32_t space; space = __offsetof(struct ieee80211req_chaninfo, ic_chans[ic->ic_nchans]); @@ -207,7 +207,7 @@ ieee80211_ioctl_getstastats(struct ieee80211vap *vap, struct ieee80211req *ireq) { struct ieee80211_node *ni; uint8_t macaddr[IEEE80211_ADDR_LEN]; - const int off = __offsetof(struct ieee80211req_sta_stats, is_stats); + const size_t off = __offsetof(struct ieee80211req_sta_stats, is_stats); int error; if (ireq->i_len < off) @@ -323,7 +323,7 @@ ieee80211_ioctl_getscanresults(struct ieee80211vap *vap, if (req.space > ireq->i_len) req.space = ireq->i_len; if (req.space > 0) { - size_t space; + uint32_t space; void *p; space = req.space; @@ -458,7 +458,7 @@ get_sta_info(void *arg, struct ieee80211_node *ni) static __noinline int getstainfo_common(struct ieee80211vap *vap, struct ieee80211req *ireq, - struct ieee80211_node *ni, int off) + struct ieee80211_node *ni, size_t off) { struct ieee80211com *ic = vap->iv_ic; struct stainforeq 
req; @@ -503,7 +503,7 @@ static __noinline int ieee80211_ioctl_getstainfo(struct ieee80211vap *vap, struct ieee80211req *ireq) { uint8_t macaddr[IEEE80211_ADDR_LEN]; - const int off = __offsetof(struct ieee80211req_sta_req, info); + const size_t off = __offsetof(struct ieee80211req_sta_req, info); struct ieee80211_node *ni; int error; diff --git a/sys/net80211/ieee80211_ioctl.h b/sys/net80211/ieee80211_ioctl.h index 7215a5e..cad5576 100644 --- a/sys/net80211/ieee80211_ioctl.h +++ b/sys/net80211/ieee80211_ioctl.h @@ -578,7 +578,7 @@ struct ieee80211req { char i_name[IFNAMSIZ]; /* if_name, e.g. "wi0" */ uint16_t i_type; /* req type */ int16_t i_val; /* Index or simple value */ - int16_t i_len; /* Index or simple value */ + uint16_t i_len; /* Index or simple value */ void *i_data; /* Extra data */ }; #define SIOCS80211 _IOW('i', 234, struct ieee80211req) diff --git a/sys/netinet/in.h b/sys/netinet/in.h index d5e4290..12473ff 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -253,6 +253,7 @@ __END_DECLS /* Only used internally, so can be outside the range of valid IP protocols. */ #define IPPROTO_DIVERT 258 /* divert pseudo-protocol */ #define IPPROTO_SEND 259 /* SeND pseudo-protocol */ +#define IPPROTO_ND6 260 /* IPv6 NDP */ /* * Defined to avoid confusion. The master value is defined by diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c index 49c48b9..9e5c737 100644 --- a/sys/netinet/ipfw/ip_fw2.c +++ b/sys/netinet/ipfw/ip_fw2.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet/icmp6.h> #ifdef INET6 +#include <netinet6/in6_pcb.h> #include <netinet6/scope6_var.h> #include <netinet6/ip6_var.h> #endif @@ -646,21 +647,27 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) * we tried and failed, or any other value if successful. */ static int -check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, - struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, - u_int16_t src_port, int *ugid_lookupp, - struct ucred **uc, struct inpcb *inp) +check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, + struct ucred **uc) { #ifndef __FreeBSD__ + /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ + struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; + struct ipfw_flow_id *id; + struct inpcb *pcb, *inp; + struct ifnet *oif; int lookupflags; - struct inpcb *pcb; int match; + id = &args->f_id; + inp = args->inp; + oif = args->oif; + /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking @@ -681,10 +688,10 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, */ if (*ugid_lookupp == -1) return (0); - if (proto == IPPROTO_TCP) { + if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; - } else if (proto == IPPROTO_UDP) { + } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else @@ -692,19 +699,36 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { - /* - * XXXRW: If we had the mbuf here, could use - * in_pcblookup_mbuf(). - */ - pcb = (oif) ? 
- in_pcblookup(pi, - dst_ip, htons(dst_port), - src_ip, htons(src_port), - lookupflags, oif) : - in_pcblookup(pi, - src_ip, htons(src_port), - dst_ip, htons(dst_port), - lookupflags, NULL); + if (id->addr_type == 6) { +#ifdef INET6 + if (oif == NULL) + pcb = in6_pcblookup_mbuf(pi, + &id->src_ip6, htons(id->src_port), + &id->dst_ip6, htons(id->dst_port), + lookupflags, oif, args->m); + else + pcb = in6_pcblookup_mbuf(pi, + &id->dst_ip6, htons(id->dst_port), + &id->src_ip6, htons(id->src_port), + lookupflags, oif, args->m); +#else + *ugid_lookupp = -1; + return (0); +#endif + } else { + src_ip.s_addr = htonl(id->src_ip); + dst_ip.s_addr = htonl(id->dst_ip); + if (oif == NULL) + pcb = in_pcblookup_mbuf(pi, + src_ip, htons(id->src_port), + dst_ip, htons(id->dst_port), + lookupflags, oif, args->m); + else + pcb = in_pcblookup_mbuf(pi, + dst_ip, htons(id->dst_port), + src_ip, htons(id->src_port), + lookupflags, oif, args->m); + } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); @@ -719,14 +743,14 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, *ugid_lookupp = -1; return (0); } - } + } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); - return match; + return (match); #endif /* __FreeBSD__ */ } @@ -1264,22 +1288,17 @@ do { \ * as this ensures that we have a * packet with the ports info. */ - if (offset!=0) - break; - if (is_ipv6) /* XXX to be fixed later */ + if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, - proto, oif, - dst_ip, dst_port, - src_ip, src_port, &ucred_lookup, + args, &ucred_lookup, #ifdef __FreeBSD__ - &ucred_cache, args->inp); + &ucred_cache); #else - (void *)&ucred_cache, - (struct inpcb *)args->m); + (void *)&ucred_cache); #endif break; @@ -1394,18 +1413,15 @@ do { \ else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, - proto, oif, - dst_ip, dst_port, - src_ip, src_port, &ucred_lookup, + args, &ucred_lookup, #ifdef __FreeBSD__ - &ucred_cache, args->inp); + &ucred_cache); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ - (void *)&ucred_cache, - (struct inpcb *)args->m); + (void *)&ucred_cache); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) @@ -2178,6 +2194,13 @@ do { \ int nat_id; set_match(args, f_pos, chain); + /* Check if this is 'global' nat rule */ + if (cmd->arg1 == 0) { + retval = ipfw_nat_ptr(args, NULL, m); + l = 0; + done = 1; + break; + } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? 
diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c index fd6f09a..1679a97 100644 --- a/sys/netinet/ipfw/ip_fw_nat.c +++ b/sys/netinet/ipfw/ip_fw_nat.c @@ -207,7 +207,8 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ - int ldt, retval; + int ldt, retval, found; + struct ip_fw_chain *chain; char *c; ldt = 0; @@ -256,12 +257,44 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) ldt = 1; c = mtod(mcl, char *); - if (args->oif == NULL) - retval = LibAliasIn(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl)); - else - retval = LibAliasOut(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl)); + + /* Check if this is 'global' instance */ + if (t == NULL) { + if (args->oif == NULL) { + /* Wrong direction, skip processing */ + args->m = mcl; + return (IP_FW_NAT); + } + + found = 0; + chain = &V_layer3_chain; + IPFW_RLOCK(chain); + /* Check every nat entry... */ + LIST_FOREACH(t, &chain->nat, _next) { + if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) + continue; + retval = LibAliasOutTry(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl), 0); + if (retval == PKT_ALIAS_OK) { + /* Nat instance recognises state */ + found = 1; + break; + } + } + IPFW_RUNLOCK(chain); + if (found != 1) { + /* No instance found, return ignore */ + args->m = mcl; + return (IP_FW_NAT); + } + } else { + if (args->oif == NULL) + retval = LibAliasIn(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + else + retval = LibAliasOut(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + } /* * We drop packet when: @@ -274,7 +307,7 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) if (retval == PKT_ALIAS_ERROR || (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (retval == PKT_ALIAS_IGNORED && - (t->lib->packetAliasMode & PKT_ALIAS_DENY_INCOMING) != 0)))) { + (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h index 2aed829..b12b353 100644 --- a/sys/netinet/libalias/alias.h +++ b/sys/netinet/libalias/alias.h @@ -197,6 +197,18 @@ struct mbuf *m_megapullup(struct mbuf *, int); */ #define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + #ifndef NO_FW_PUNCH /* * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will @@ -209,16 +221,10 @@ struct mbuf *m_megapullup(struct mbuf *, int); #endif /* - * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only - * transparent proxying is performed. - */ -#define PKT_ALIAS_PROXY_ONLY 0x40 - -/* - * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and - * PacketAliasOut() are reversed. + * If PKT_ALIAS_SKIP_GLOBAL is set, nat instance is not checked for matching + * states in 'ipfw nat global' rule. */ -#define PKT_ALIAS_REVERSE 0x80 +#define PKT_ALIAS_SKIP_GLOBAL 0x200 /* Function return codes. 
*/ #define PKT_ALIAS_ERROR -1 diff --git a/sys/netinet/sctp.h b/sys/netinet/sctp.h index fa29a75..f496d26 100644 --- a/sys/netinet/sctp.h +++ b/sys/netinet/sctp.h @@ -91,7 +91,7 @@ struct sctp_paramhdr { #define SCTP_PEER_ADDR_PARAMS 0x0000000a #define SCTP_DEFAULT_SEND_PARAM 0x0000000b /* ancillary data/notification interest options */ -#define SCTP_EVENTS 0x0000000c +#define SCTP_EVENTS 0x0000000c /* deprecated */ /* Without this applied we will give V4 and V6 addresses on a V6 socket */ #define SCTP_I_WANT_MAPPED_V4_ADDR 0x0000000d #define SCTP_MAXSEG 0x0000000e @@ -114,6 +114,10 @@ struct sctp_paramhdr { #define SCTP_EXPLICIT_EOR 0x0000001b #define SCTP_REUSE_PORT 0x0000001c /* rw */ #define SCTP_AUTH_DEACTIVATE_KEY 0x0000001d +#define SCTP_EVENT 0x0000001e +#define SCTP_RECVRCVINFO 0x0000001f +#define SCTP_RECVNXTINFO 0x00000020 +#define SCTP_DEFAULT_SNDINFO 0x00000021 /* * read-only options @@ -490,7 +494,7 @@ struct sctp_error_unrecognized_chunk { /* * PCB Features (in sctp_features bitmask) */ -#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x00000002 +#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x00000002 /* deprecated */ #define SCTP_PCB_FLAGS_DONOT_HEARTBEAT 0x00000004 #define SCTP_PCB_FLAGS_FRAG_INTERLEAVE 0x00000008 #define SCTP_PCB_FLAGS_INTERLEAVE_STRMS 0x00000010 @@ -500,7 +504,7 @@ struct sctp_error_unrecognized_chunk { /* socket options */ #define SCTP_PCB_FLAGS_NODELAY 0x00000100 #define SCTP_PCB_FLAGS_AUTOCLOSE 0x00000200 -#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x00000400 +#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x00000400 /* deprecated */ #define SCTP_PCB_FLAGS_RECVASSOCEVNT 0x00000800 #define SCTP_PCB_FLAGS_RECVPADDREVNT 0x00001000 #define SCTP_PCB_FLAGS_RECVPEERERR 0x00002000 @@ -516,6 +520,9 @@ struct sctp_error_unrecognized_chunk { #define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS 0x01000000 #define SCTP_PCB_FLAGS_PORTREUSE 0x02000000 #define SCTP_PCB_FLAGS_DRYEVNT 0x04000000 +#define SCTP_PCB_FLAGS_RECVRCVINFO 0x08000000 +#define SCTP_PCB_FLAGS_RECVNXTINFO 0x10000000 + /*- * mobility_features parameters (by micchie).Note * these features are applied against the diff --git a/sys/netinet/sctp_auth.c b/sys/netinet/sctp_auth.c index 91e3f78..b68c840 100644 --- a/sys/netinet/sctp_auth.c +++ b/sys/netinet/sctp_auth.c @@ -1866,7 +1866,7 @@ sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, /* If the socket is gone we are out of here */ return; } - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTHEVNT)) + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_AUTHEVNT)) /* event not enabled */ return; diff --git a/sys/netinet/sctp_indata.c b/sys/netinet/sctp_indata.c index 155e55c..e142a3e 100644 --- a/sys/netinet/sctp_indata.c +++ b/sys/netinet/sctp_indata.c @@ -201,48 +201,110 @@ failed_build: struct mbuf * -sctp_build_ctl_nchunk(struct sctp_inpcb *inp, - struct sctp_sndrcvinfo *sinfo) +sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) { + struct sctp_extrcvinfo *seinfo; struct sctp_sndrcvinfo *outinfo; + struct sctp_rcvinfo *rcvinfo; + struct sctp_nxtinfo *nxtinfo; struct cmsghdr *cmh; struct mbuf *ret; int len; - int use_extended = 0; + int use_extended; + int provide_nxt; - if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { - /* user does not want the sndrcv ctl */ + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { + /* user does not want any ancillary data */ return (NULL); } - if 
(sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { - use_extended = 1; - len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); + len = 0; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { + len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); + } + seinfo = (struct sctp_extrcvinfo *)sinfo; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO) && + (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) { + provide_nxt = 1; + len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); } else { - len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + provide_nxt = 0; + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + use_extended = 1; + len += CMSG_SPACE(sizeof(struct sctp_extrcvinfo)); + } else { + use_extended = 0; + len += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); + } + } else { + use_extended = 0; } - - ret = sctp_get_mbuf_for_msg(len, - 0, M_DONTWAIT, 1, MT_DATA); - + ret = sctp_get_mbuf_for_msg(len, 0, M_DONTWAIT, 1, MT_DATA); if (ret == NULL) { /* No space */ return (ret); } - /* We need a CMSG header followed by the struct */ + SCTP_BUF_LEN(ret) = 0; + + /* We need a CMSG header followed by the struct */ cmh = mtod(ret, struct cmsghdr *); - outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); - cmh->cmsg_level = IPPROTO_SCTP; - if (use_extended) { - cmh->cmsg_type = SCTP_EXTRCV; - cmh->cmsg_len = len; - memcpy(outinfo, sinfo, len); - } else { - cmh->cmsg_type = SCTP_SNDRCV; - cmh->cmsg_len = len; - *outinfo = *sinfo; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { + cmh->cmsg_level = IPPROTO_SCTP; + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_rcvinfo)); + cmh->cmsg_type = SCTP_RCVINFO; + rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmh); + rcvinfo->rcv_sid = sinfo->sinfo_stream; + rcvinfo->rcv_ssn = sinfo->sinfo_ssn; + rcvinfo->rcv_flags = sinfo->sinfo_flags; + rcvinfo->rcv_ppid = sinfo->sinfo_ppid; + rcvinfo->rcv_tsn = sinfo->sinfo_tsn; + rcvinfo->rcv_cumtsn = sinfo->sinfo_cumtsn; + rcvinfo->rcv_context = sinfo->sinfo_context; + rcvinfo->rcv_assoc_id = sinfo->sinfo_assoc_id; + cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_rcvinfo))); + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); + } + if (provide_nxt) { + cmh->cmsg_level = IPPROTO_SCTP; + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_nxtinfo)); + cmh->cmsg_type = SCTP_NXTINFO; + nxtinfo = (struct sctp_nxtinfo *)CMSG_DATA(cmh); + nxtinfo->nxt_sid = seinfo->sreinfo_next_stream; + nxtinfo->nxt_flags = 0; + if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) { + nxtinfo->nxt_flags |= SCTP_UNORDERED; + } + if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) { + nxtinfo->nxt_flags |= SCTP_NOTIFICATION; + } + if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) { + nxtinfo->nxt_flags |= SCTP_COMPLETE; + } + nxtinfo->nxt_ppid = seinfo->sreinfo_next_ppid; + nxtinfo->nxt_length = seinfo->sreinfo_next_length; + nxtinfo->nxt_assoc_id = seinfo->sreinfo_next_aid; + cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_nxtinfo))); + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_nxtinfo)); + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { + cmh->cmsg_level = IPPROTO_SCTP; + outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); + if (use_extended) { + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); + cmh->cmsg_type = SCTP_EXTRCV; + memcpy(outinfo, sinfo, sizeof(struct sctp_extrcvinfo)); + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_extrcvinfo)); + } 
else { + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + cmh->cmsg_type = SCTP_SNDRCV; + *outinfo = *sinfo; + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); + } } - SCTP_BUF_LEN(ret) = cmh->cmsg_len; return (ret); } diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index a7d22bd..043b3b2 100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -3355,54 +3355,338 @@ sctp_source_address_selection(struct sctp_inpcb *inp, } static int -sctp_find_cmsg(int c_type, void *data, struct mbuf *control, int cpsize) +sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize) { struct cmsghdr cmh; - int tlen, at; + int tlen, at, found; + struct sctp_sndinfo sndinfo; + struct sctp_prinfo prinfo; + struct sctp_authinfo authinfo; tlen = SCTP_BUF_LEN(control); at = 0; + found = 0; /* * Independent of how many mbufs, find the c_type inside the control * structure and copy out the data. */ while (at < tlen) { if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { - /* not enough room for one more we are done. */ - return (0); + /* There is not enough room for one more. */ + return (found); } m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (cmh.cmsg_len < CMSG_ALIGN(sizeof(struct cmsghdr))) { + /* We dont't have a complete CMSG header. */ + return (found); + } if (((int)cmh.cmsg_len + at) > tlen) { - /* - * this is real messed up since there is not enough - * data here to cover the cmsg header. We are done. - */ - return (0); + /* We don't have the complete CMSG. */ + return (found); } if ((cmh.cmsg_level == IPPROTO_SCTP) && - (c_type == cmh.cmsg_type)) { - /* found the one we want, copy it out */ - at += CMSG_ALIGN(sizeof(struct cmsghdr)); - if ((int)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < cpsize) { - /* - * space of cmsg_len after header not big - * enough - */ - return (0); + ((c_type == cmh.cmsg_type) || + ((c_type == SCTP_SNDRCV) && + ((cmh.cmsg_type == SCTP_SNDINFO) || + (cmh.cmsg_type == SCTP_PRINFO) || + (cmh.cmsg_type == SCTP_AUTHINFO))))) { + if (c_type == cmh.cmsg_type) { + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < cpsize) { + return (found); + } + /* It is exactly what we want. Copy it out. 
*/ + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), cpsize, (caddr_t)data); + return (1); + } else { + struct sctp_sndrcvinfo *sndrcvinfo; + + sndrcvinfo = (struct sctp_sndrcvinfo *)data; + if (found == 0) { + if (cpsize < sizeof(struct sctp_sndrcvinfo)) { + return (found); + } + memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo)); + } + switch (cmh.cmsg_type) { + case SCTP_SNDINFO: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_sndinfo)) { + return (found); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_sndinfo), (caddr_t)&sndinfo); + sndrcvinfo->sinfo_stream = sndinfo.snd_sid; + sndrcvinfo->sinfo_flags = sndinfo.snd_flags; + sndrcvinfo->sinfo_ppid = sndinfo.snd_ppid; + sndrcvinfo->sinfo_context = sndinfo.snd_context; + sndrcvinfo->sinfo_assoc_id = sndinfo.snd_assoc_id; + break; + case SCTP_PRINFO: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_prinfo)) { + return (found); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_prinfo), (caddr_t)&prinfo); + sndrcvinfo->sinfo_timetolive = prinfo.pr_value; + sndrcvinfo->sinfo_flags |= prinfo.pr_policy; + break; + case SCTP_AUTHINFO: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_authinfo)) { + return (found); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_authinfo), (caddr_t)&authinfo); + sndrcvinfo->sinfo_keynumber_valid = 1; + sndrcvinfo->sinfo_keynumber = authinfo.auth_keyid; + break; + default: + return (found); + } + found = 1; } - m_copydata(control, at, cpsize, data); + } + at += CMSG_ALIGN(cmh.cmsg_len); + } + return (found); +} + +static int +sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error) +{ + struct cmsghdr cmh; + int tlen, at; + struct sctp_initmsg initmsg; + +#ifdef INET + struct sockaddr_in sin; + +#endif +#ifdef INET6 + struct sockaddr_in6 sin6; + +#endif + + tlen = SCTP_BUF_LEN(control); + at = 0; + while (at < tlen) { + if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + /* There is not enough room for one more. */ + *error = EINVAL; return (1); - } else { - at += CMSG_ALIGN(cmh.cmsg_len); - if (cmh.cmsg_len == 0) { + } + m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (cmh.cmsg_len < CMSG_ALIGN(sizeof(struct cmsghdr))) { + /* We dont't have a complete CMSG header. */ + *error = EINVAL; + return (1); + } + if (((int)cmh.cmsg_len + at) > tlen) { + /* We don't have the complete CMSG. 
*/ + *error = EINVAL; + return (1); + } + if (cmh.cmsg_level == IPPROTO_SCTP) { + switch (cmh.cmsg_type) { + case SCTP_INIT: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_initmsg)) { + *error = EINVAL; + return (1); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_initmsg), (caddr_t)&initmsg); + if (initmsg.sinit_max_attempts) + stcb->asoc.max_init_times = initmsg.sinit_max_attempts; + if (initmsg.sinit_num_ostreams) + stcb->asoc.pre_open_streams = initmsg.sinit_num_ostreams; + if (initmsg.sinit_max_instreams) + stcb->asoc.max_inbound_streams = initmsg.sinit_max_instreams; + if (initmsg.sinit_max_init_timeo) + stcb->asoc.initial_init_rto_max = initmsg.sinit_max_init_timeo; + if (stcb->asoc.streamoutcnt < stcb->asoc.pre_open_streams) { + struct sctp_stream_out *tmp_str; + unsigned int i; + + /* Default is NOT correct */ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n", + stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams); + SCTP_TCB_UNLOCK(stcb); + SCTP_MALLOC(tmp_str, + struct sctp_stream_out *, + (stcb->asoc.pre_open_streams * sizeof(struct sctp_stream_out)), + SCTP_M_STRMO); + SCTP_TCB_LOCK(stcb); + if (tmp_str != NULL) { + SCTP_FREE(stcb->asoc.strmout, SCTP_M_STRMO); + stcb->asoc.strmout = tmp_str; + stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt = stcb->asoc.pre_open_streams; + } else { + stcb->asoc.pre_open_streams = stcb->asoc.streamoutcnt; + } + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].next_sequence_sent = 0; + TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); + stcb->asoc.strmout[i].stream_no = i; + stcb->asoc.strmout[i].last_msg_incomplete = 0; + stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], NULL); + } + } + break; +#ifdef INET + case SCTP_DSTADDRV4: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in_addr)) { + *error = EINVAL; + return (1); + } + memset(&sin, 0, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_port = stcb->rport; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + if ((sin.sin_addr.s_addr == INADDR_ANY) || + (sin.sin_addr.s_addr == INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + *error = EINVAL; + return (-1); + } + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + *error = ENOBUFS; + return (1); + } + break; +#endif +#ifdef INET6 + case SCTP_DSTADDRV6: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in6_addr)) { + *error = EINVAL; + return (1); + } + memset(&sin6, 0, sizeof(struct sockaddr_in6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_port = stcb->rport; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { + *error = EINVAL; + return (-1); + } +#ifdef INET + if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { + in6_sin6_2_sin(&sin, &sin6); + if ((sin.sin_addr.s_addr == INADDR_ANY) || + (sin.sin_addr.s_addr == INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + *error = EINVAL; + return (-1); + } + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + *error = ENOBUFS; + return (1); + } + } 
else +#endif + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + *error = ENOBUFS; + return (1); + } + break; +#endif + default: break; } } + at += CMSG_ALIGN(cmh.cmsg_len); } - /* not found */ return (0); } +static struct sctp_tcb * +sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, + in_port_t port, + struct mbuf *control, + struct sctp_nets **net_p, + int *error) +{ + struct cmsghdr cmh; + int tlen, at; + struct sctp_tcb *stcb; + struct sockaddr *addr; + +#ifdef INET + struct sockaddr_in sin; + +#endif +#ifdef INET6 + struct sockaddr_in6 sin6; + +#endif + + tlen = SCTP_BUF_LEN(control); + at = 0; + while (at < tlen) { + if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + /* There is not enough room for one more. */ + *error = EINVAL; + return (NULL); + } + m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (cmh.cmsg_len < CMSG_ALIGN(sizeof(struct cmsghdr))) { + /* We dont't have a complete CMSG header. */ + *error = EINVAL; + return (NULL); + } + if (((int)cmh.cmsg_len + at) > tlen) { + /* We don't have the complete CMSG. */ + *error = EINVAL; + return (NULL); + } + if (cmh.cmsg_level == IPPROTO_SCTP) { + switch (cmh.cmsg_type) { +#ifdef INET + case SCTP_DSTADDRV4: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in_addr)) { + *error = EINVAL; + return (NULL); + } + memset(&sin, 0, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_port = port; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + addr = (struct sockaddr *)&sin; + break; +#endif +#ifdef INET6 + case SCTP_DSTADDRV6: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in6_addr)) { + *error = EINVAL; + return (NULL); + } + memset(&sin6, 0, sizeof(struct sockaddr_in6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_port = port; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); +#ifdef INET + if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { + in6_sin6_2_sin(&sin, &sin6); + addr = (struct sockaddr *)&sin; + } else +#endif + addr = (struct sockaddr *)&sin6; + break; +#endif + default: + addr = NULL; + break; + } + if (addr) { + stcb = sctp_findassociation_ep_addr(inp_p, addr, net_p, NULL, NULL); + if (stcb != NULL) { + return (stcb); + } + } + } + at += CMSG_ALIGN(cmh.cmsg_len); + } + return (NULL); +} + static struct mbuf * sctp_add_cookie(struct sctp_inpcb *inp, struct mbuf *init, int init_offset, struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t ** signature) @@ -5989,19 +6273,26 @@ sctp_msg_append(struct sctp_tcb *stcb, sp->some_taken = 0; sp->data = m; sp->tail_mbuf = NULL; - sp->length = 0; - at = m; sctp_set_prsctp_policy(sp); /* * We could in theory (for sendall) sifa the length in, but we would * still have to hunt through the chain since we need to setup the * tail_mbuf */ - while (at) { + sp->length = 0; + for (at = m; at; at = SCTP_BUF_NEXT(at)) { if (SCTP_BUF_NEXT(at) == NULL) sp->tail_mbuf = at; sp->length += SCTP_BUF_LEN(at); - at = SCTP_BUF_NEXT(at); + } + if (srcv->sinfo_keynumber_valid) { + sp->auth_keyid = srcv->sinfo_keynumber; + } else { + sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + } + if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { + sctp_auth_key_acquire(stcb, sp->auth_keyid); + 
sp->holds_key_ref = 1; } SCTP_TCB_SEND_LOCK(stcb); sctp_snd_sb_alloc(stcb, sp->length); @@ -6478,7 +6769,9 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, memset(ca, 0, sizeof(struct sctp_copy_all)); ca->inp = inp; - memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); + if (srcv) { + memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); + } /* * take off the sendall flag, it would be bad if we failed to do * this :-0 @@ -12229,9 +12522,13 @@ sctp_copy_it_in(struct sctp_tcb *stcb, *error = 0; goto skip_copy; } - sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + if (srcv->sinfo_keynumber_valid) { + sp->auth_keyid = srcv->sinfo_keynumber; + } else { + sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + } if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { - sctp_auth_key_acquire(stcb, stcb->asoc.authinfo.active_keyid); + sctp_auth_key_acquire(stcb, sp->auth_keyid); sp->holds_key_ref = 1; } *error = sctp_copy_one(sp, uio, resv_in_first); @@ -12263,8 +12560,8 @@ sctp_sosend(struct socket *so, struct thread *p ) { - int error, use_rcvinfo = 0; - struct sctp_sndrcvinfo srcv; + int error, use_sndinfo = 0; + struct sctp_sndrcvinfo sndrcvninfo; struct sockaddr *addr_to_use; #if defined(INET) && defined(INET6) @@ -12274,10 +12571,10 @@ sctp_sosend(struct socket *so, if (control) { /* process cmsg snd/rcv info (maybe a assoc-id) */ - if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&srcv, control, - sizeof(srcv))) { + if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&sndrcvninfo, control, + sizeof(sndrcvninfo))) { /* got one */ - use_rcvinfo = 1; + use_sndinfo = 1; } } addr_to_use = addr; @@ -12295,7 +12592,7 @@ sctp_sosend(struct socket *so, error = sctp_lower_sosend(so, addr_to_use, uio, top, control, flags, - use_rcvinfo ? &srcv : NULL + use_sndinfo ? 
&sndrcvninfo : NULL ,p ); return (error); @@ -12500,6 +12797,9 @@ sctp_lower_sosend(struct socket *so, SCTP_INP_WUNLOCK(inp); /* With the lock applied look again */ stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); + if ((stcb == NULL) && (control != NULL) && (port > 0)) { + stcb = sctp_findassociation_cmsgs(&t_inp, port, control, &net, &error); + } if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); @@ -12507,6 +12807,9 @@ sctp_lower_sosend(struct socket *so, } else { hold_tcblock = 1; } + if (error) { + goto out_unlocked; + } if (t_inp != inp) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; @@ -12555,6 +12858,7 @@ sctp_lower_sosend(struct socket *so, /* Error is setup for us in the call */ goto out_unlocked; } + hold_tcblock = 1; if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; @@ -12574,84 +12878,13 @@ sctp_lower_sosend(struct socket *so, sctp_initialize_auth_params(inp, stcb); if (control) { - /* - * see if a init structure exists in cmsg - * headers - */ - struct sctp_initmsg initm; - int i; - - if (sctp_find_cmsg(SCTP_INIT, (void *)&initm, control, - sizeof(initm))) { - /* - * we have an INIT override of the - * default - */ - if (initm.sinit_max_attempts) - asoc->max_init_times = initm.sinit_max_attempts; - if (initm.sinit_num_ostreams) - asoc->pre_open_streams = initm.sinit_num_ostreams; - if (initm.sinit_max_instreams) - asoc->max_inbound_streams = initm.sinit_max_instreams; - if (initm.sinit_max_init_timeo) - asoc->initial_init_rto_max = initm.sinit_max_init_timeo; - if (asoc->streamoutcnt < asoc->pre_open_streams) { - struct sctp_stream_out *tmp_str; - int had_lock = 0; - - /* Default is NOT correct */ - SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, defout:%d pre_open:%d\n", - asoc->streamoutcnt, asoc->pre_open_streams); - /* - * What happens if this - * fails? we panic ... - */ - - if (hold_tcblock) { - had_lock = 1; - SCTP_TCB_UNLOCK(stcb); - } - SCTP_MALLOC(tmp_str, - struct sctp_stream_out *, - (asoc->pre_open_streams * - sizeof(struct sctp_stream_out)), - SCTP_M_STRMO); - if (had_lock) { - SCTP_TCB_LOCK(stcb); - } - if (tmp_str != NULL) { - SCTP_FREE(asoc->strmout, SCTP_M_STRMO); - asoc->strmout = tmp_str; - asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams; - } else { - asoc->pre_open_streams = asoc->streamoutcnt; - } - for (i = 0; i < asoc->streamoutcnt; i++) { - /*- - * inbound side must be set - * to 0xffff, also NOTE when - * we get the INIT-ACK back - * (for INIT sender) we MUST - * reduce the count - * (streamoutcnt) but first - * check if we sent to any - * of the upper streams that - * were dropped (if some - * were). Those that were - * dropped must be notified - * to the upper layer as - * failed to send. 
- */ - asoc->strmout[i].next_sequence_sent = 0x0; - TAILQ_INIT(&asoc->strmout[i].outqueue); - asoc->strmout[i].stream_no = i; - asoc->strmout[i].last_msg_incomplete = 0; - asoc->ss_functions.sctp_ss_init_stream(&asoc->strmout[i], NULL); - } - } + if (sctp_process_cmsgs_for_init(stcb, control, &error)) { + sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_7); + hold_tcblock = 0; + stcb = NULL; + goto out_unlocked; } } - hold_tcblock = 1; /* out with the INIT */ queue_only_for_init = 1; /*- diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index e53e28a..8dc01cd 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -4196,11 +4196,11 @@ try_again: return (0); } /* - * We don't allow assoc id to be 0, this is needed otherwise if the - * id were to wrap we would have issues with some socket options. + * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC, + * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC. */ - if (inp->sctp_associd_counter == 0) { - inp->sctp_associd_counter++; + if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) { + inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1; } id = inp->sctp_associd_counter; inp->sctp_associd_counter++; @@ -4793,7 +4793,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre /* Held for PD-API clear that. */ sq->pdapi_aborted = 1; sq->held_length = 0; - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { + if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { /* * Need to add a PD-API * aborted indication. diff --git a/sys/netinet/sctp_structs.h b/sys/netinet/sctp_structs.h index 250b312..0f9bcaf 100644 --- a/sys/netinet/sctp_structs.h +++ b/sys/netinet/sctp_structs.h @@ -647,6 +647,8 @@ struct sctp_nonpad_sndrcvinfo { uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; + uint16_t sinfo_keynumber; + uint16_t sinfo_keynumber_valid; }; /* @@ -1201,6 +1203,7 @@ struct sctp_association { /* JRS 5/21/07 - CMT PF variable */ uint8_t sctp_cmt_pf; uint8_t use_precise_time; + uint32_t sctp_features; /* * The mapping array is used to track out of order sequences above * last_acked_seq. 
0 indicates packet missing 1 indicates packet diff --git a/sys/netinet/sctp_uio.h b/sys/netinet/sctp_uio.h index 56aef9d..e7f2daf 100644 --- a/sys/netinet/sctp_uio.h +++ b/sys/netinet/sctp_uio.h @@ -47,6 +47,16 @@ __FBSDID("$FreeBSD$"); typedef uint32_t sctp_assoc_t; +#define SCTP_FUTURE_ASSOC 0 +#define SCTP_CURRENT_ASSOC 1 +#define SCTP_ALL_ASSOC 2 + +struct sctp_event { + sctp_assoc_t se_assoc_id; + uint16_t se_type; + uint8_t se_on; +}; + /* Compatibility to previous define's */ #define sctp_stream_reset_events sctp_stream_reset_event @@ -69,6 +79,14 @@ struct sctp_event_subscribe { #define SCTP_INIT 0x0001 #define SCTP_SNDRCV 0x0002 #define SCTP_EXTRCV 0x0003 +#define SCTP_SNDINFO 0x0004 +#define SCTP_RCVINFO 0x0005 +#define SCTP_NXTINFO 0x0006 +#define SCTP_PRINFO 0x0007 +#define SCTP_AUTHINFO 0x0008 +#define SCTP_DSTADDRV4 0x0009 +#define SCTP_DSTADDRV6 0x000a + /* * ancillary data structures */ @@ -93,8 +111,8 @@ struct sctp_initmsg { */ -#define SCTP_ALIGN_RESV_PAD 96 -#define SCTP_ALIGN_RESV_PAD_SHORT 80 +#define SCTP_ALIGN_RESV_PAD 92 +#define SCTP_ALIGN_RESV_PAD_SHORT 76 struct sctp_sndrcvinfo { uint16_t sinfo_stream; @@ -106,6 +124,8 @@ struct sctp_sndrcvinfo { uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; + uint16_t sinfo_keynumber; + uint16_t sinfo_keynumber_valid; uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD]; }; @@ -113,7 +133,6 @@ struct sctp_extrcvinfo { uint16_t sinfo_stream; uint16_t sinfo_ssn; uint16_t sinfo_flags; - uint16_t sinfo_pr_policy; uint32_t sinfo_ppid; uint32_t sinfo_context; uint32_t sinfo_timetolive; @@ -125,15 +144,79 @@ struct sctp_extrcvinfo { uint32_t sreinfo_next_aid; uint32_t sreinfo_next_length; uint32_t sreinfo_next_ppid; + uint16_t sinfo_keynumber; + uint16_t sinfo_keynumber_valid; uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD_SHORT]; }; +struct sctp_sndinfo { + uint16_t snd_sid; + uint16_t snd_flags; + uint32_t snd_ppid; + uint32_t snd_context; + sctp_assoc_t snd_assoc_id; +}; + +struct sctp_prinfo { + uint16_t pr_policy; + uint32_t pr_value; +}; + +struct sctp_authinfo { + uint16_t auth_keyid; +}; + +struct sctp_rcvinfo { + uint16_t rcv_sid; + uint16_t rcv_ssn; + uint16_t rcv_flags; + uint32_t rcv_ppid; + uint32_t rcv_tsn; + uint32_t rcv_cumtsn; + uint32_t rcv_context; + sctp_assoc_t rcv_assoc_id; +}; + +struct sctp_nxtinfo { + uint16_t nxt_sid; + uint16_t nxt_flags; + uint32_t nxt_ppid; + uint32_t nxt_length; + sctp_assoc_t nxt_assoc_id; +}; + #define SCTP_NO_NEXT_MSG 0x0000 #define SCTP_NEXT_MSG_AVAIL 0x0001 #define SCTP_NEXT_MSG_ISCOMPLETE 0x0002 #define SCTP_NEXT_MSG_IS_UNORDERED 0x0004 #define SCTP_NEXT_MSG_IS_NOTIFICATION 0x0008 +struct sctp_recvv_rn { + struct sctp_rcvinfo recvv_rcvinfo; + struct sctp_nxtinfo recvv_nxtinfo; +}; + +#define SCTP_RECVV_NOINFO 0 +#define SCTP_RECVV_RCVINFO 1 +#define SCTP_RECVV_NXTINFO 2 +#define SCTP_RECVV_RN 3 + +#define SCTP_SENDV_SNDINFO 1 +#define SCTP_SENDV_PRINFO 2 +#define SCTP_SENDV_AUTHINFO 3 +#define SCTP_SENDV_SPA 4 + +struct sctp_sendv_spa { + uint32_t sendv_flags; + struct sctp_sndinfo sendv_sndinfo; + struct sctp_prinfo sendv_prinfo; + struct sctp_authinfo sendv_authinfo; +}; + +#define SCTP_SEND_SNDINFO_VALID 0x00000001 +#define SCTP_SEND_PRINFO_VALID 0x00000002 +#define SCTP_SEND_AUTHINFO_VALID 0x00000004 + struct sctp_snd_all_completes { uint16_t sall_stream; uint16_t sall_flags; @@ -144,6 +227,8 @@ struct sctp_snd_all_completes { }; /* Flags that go into the sinfo->sinfo_flags field */ +#define SCTP_NOTIFICATION 0x0010 /* next message is a notification */ +#define 
SCTP_COMPLETE 0x0020 /* next message is complete */ #define SCTP_EOF 0x0100 /* Start shutdown procedures */ #define SCTP_ABORT 0x0200 /* Send an ABORT to peer */ #define SCTP_UNORDERED 0x0400 /* Message is un-ordered */ @@ -152,7 +237,7 @@ struct sctp_snd_all_completes { #define SCTP_EOR 0x2000 /* end of message signal */ #define SCTP_SACK_IMMEDIATELY 0x4000 /* Set I-Bit */ -#define INVALID_SINFO_FLAG(x) (((x) & 0xffffff00 \ +#define INVALID_SINFO_FLAG(x) (((x) & 0xfffffff0 \ & ~(SCTP_EOF | SCTP_ABORT | SCTP_UNORDERED |\ SCTP_ADDR_OVER | SCTP_SENDALL | SCTP_EOR |\ SCTP_SACK_IMMEDIATELY)) != 0) @@ -163,7 +248,7 @@ struct sctp_snd_all_completes { #define SCTP_PR_SCTP_BUF 0x0002/* Buffer based PR-SCTP */ #define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */ -#define PR_SCTP_POLICY(x) ((x) & 0xff) +#define PR_SCTP_POLICY(x) ((x) & 0x0f) #define PR_SCTP_ENABLED(x) (PR_SCTP_POLICY(x) != 0) #define PR_SCTP_TTL_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_TTL) #define PR_SCTP_BUF_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_BUF) @@ -1132,26 +1217,34 @@ int sctp_getladdrs __P((int, sctp_assoc_t, struct sockaddr **)); void sctp_freeladdrs __P((struct sockaddr *)); int sctp_opt_info __P((int, sctp_assoc_t, int, void *, socklen_t *)); +/* deprecated */ ssize_t sctp_sendmsg -__P((int, const void *, size_t, - const struct sockaddr *, +__P((int, const void *, size_t, const struct sockaddr *, socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); - ssize_t sctp_send __P((int sd, const void *msg, size_t len, - const struct sctp_sndrcvinfo *sinfo, int flags)); +/* deprecated */ + ssize_t sctp_send __P((int, const void *, size_t, + const struct sctp_sndrcvinfo *, int)); + +/* deprecated */ + ssize_t sctp_sendx __P((int, const void *, size_t, struct sockaddr *, + int, struct sctp_sndrcvinfo *, int)); + +/* deprecated */ + ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, struct sockaddr *, + int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); - ssize_t sctp_sendx __P((int sd, const void *msg, size_t len, - struct sockaddr *addrs, int addrcnt, - struct sctp_sndrcvinfo *sinfo, int flags)); + sctp_assoc_t sctp_getassocid __P((int, struct sockaddr *)); - ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, - struct sockaddr *, int, - uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); +/* deprecated */ + ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, socklen_t *, + struct sctp_sndrcvinfo *, int *)); - sctp_assoc_t sctp_getassocid __P((int sd, struct sockaddr *sa)); + ssize_t sctp_sendv __P((int, const struct iovec *, int, struct sockaddr *, + int, void *, socklen_t, unsigned int, int)); - ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, - socklen_t *, struct sctp_sndrcvinfo *, int *)); + ssize_t sctp_recvv __P((int, const struct iovec *, int, struct sockaddr *, + socklen_t *, void *, socklen_t *, unsigned int *, int *)); __END_DECLS diff --git a/sys/netinet/sctp_usrreq.c b/sys/netinet/sctp_usrreq.c index b3eb805..2fcf306 100644 --- a/sys/netinet/sctp_usrreq.c +++ b/sys/netinet/sctp_usrreq.c @@ -713,7 +713,7 @@ sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, control = NULL; } error = EDESTADDRREQ; - return EDESTADDRREQ; + return (error); } #endif /* INET6 */ connected_type: @@ -1448,7 +1448,6 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, struct sctp_tcb *stcb = NULL; struct sockaddr *sa; int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; - int added = 0; uint32_t vrf_id; 
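A minimal userland sketch of the sctp_sendv()/struct sctp_sndinfo interface whose declarations are added to sctp_uio.h above. It assumes the usual <netinet/sctp.h> userland header exposes these declarations; the destination, stream number and payload are placeholders and error handling is left to the caller.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <string.h>

ssize_t
send_on_stream(int sd, struct sockaddr_in *dst, char *msg, uint16_t sid)
{
	struct iovec iov;
	struct sctp_sndinfo snd;

	iov.iov_base = msg;
	iov.iov_len = strlen(msg);

	memset(&snd, 0, sizeof(snd));
	snd.snd_sid = sid;	/* outgoing stream number */
	snd.snd_ppid = 0;	/* payload protocol identifier */

	/* One destination address; per-message info passed as SCTP_SENDV_SNDINFO. */
	return (sctp_sendv(sd, &iov, 1, (struct sockaddr *)dst, 1,
	    &snd, (socklen_t)sizeof(snd), SCTP_SENDV_SNDINFO, 0));
}

sctp_recvv() is the receive-side counterpart declared next to it; once the SCTP_RECVRCVINFO option is enabled it can return a struct sctp_rcvinfo (infotype SCTP_RECVV_RCVINFO) instead of the deprecated struct sctp_sndrcvinfo.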
int bad_addresses = 0; sctp_assoc_t *a_id; @@ -1560,7 +1559,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, } error = 0; - added = sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); + sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); /* Fill in the return id */ if (error) { (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6); @@ -1603,7 +1602,7 @@ out_now: SCTP_TCB_LOCK(stcb); \ } \ SCTP_INP_RUNLOCK(inp); \ - } else if (assoc_id != 0) { \ + } else if (assoc_id > SCTP_ALL_ASSOC) { \ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \ if (stcb == NULL) { \ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \ @@ -1691,10 +1690,6 @@ sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } /* end switch (sopt->sopt_name) */ - if (optname != SCTP_AUTOCLOSE) { - /* make it an "on/off" value */ - val = (val != 0); - } if (*optsize < sizeof(val)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; @@ -1734,8 +1729,8 @@ flags_out: SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; @@ -1743,8 +1738,8 @@ flags_out: SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = inp->partial_delivery_point; *optsize = sizeof(uint32_t); + break; } - break; case SCTP_FRAGMENT_INTERLEAVE: { uint32_t *value; @@ -1760,8 +1755,8 @@ flags_out: *value = SCTP_FRAG_LEVEL_0; } *optsize = sizeof(uint32_t); + break; } - break; case SCTP_CMT_ON_OFF: { struct sctp_assoc_value *av; @@ -1772,14 +1767,20 @@ flags_out: av->assoc_value = stcb->asoc.sctp_cmt_on_off; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - av->assoc_value = inp->sctp_cmt_on_off; - SCTP_INP_RUNLOCK(inp); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_cmt_on_off; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(*av); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; - /* JRS - Get socket option for pluggable congestion control */ case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; @@ -1790,11 +1791,20 @@ flags_out: av->assoc_value = stcb->asoc.congestion_control_module; SCTP_TCB_UNLOCK(stcb); } else { - av->assoc_value = inp->sctp_ep.sctp_default_cc_module; + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.sctp_default_cc_module; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); } - *optsize = sizeof(*av); + break; } - break; case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; @@ -1807,15 +1817,13 @@ flags_out: if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; } else { - error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, - cc_opt); - *optsize = sizeof(*cc_opt); + error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, cc_opt); + *optsize = sizeof(struct sctp_cc_option); } SCTP_TCB_UNLOCK(stcb); } + break; } - break; - /* RS - Get socket option for pluggable stream scheduling 
*/ case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; @@ -1826,11 +1834,20 @@ flags_out: av->assoc_value = stcb->asoc.stream_scheduling_module; SCTP_TCB_UNLOCK(stcb); } else { - av->assoc_value = inp->sctp_ep.sctp_default_ss_module; + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.sctp_default_ss_module; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); } - *optsize = sizeof(*av); + break; } - break; case SCTP_SS_VALUE: { struct sctp_stream_value *av; @@ -1843,7 +1860,7 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { - *optsize = sizeof(*av); + *optsize = sizeof(struct sctp_stream_value); } SCTP_TCB_UNLOCK(stcb); } else { @@ -1854,8 +1871,8 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } + break; } - break; case SCTP_GET_ADDR_LEN: { struct sctp_assoc_value *av; @@ -1876,10 +1893,11 @@ flags_out: #endif if (error) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + *optsize = sizeof(struct sctp_assoc_value); } - *optsize = sizeof(*av); + break; } - break; case SCTP_GET_ASSOC_NUMBER: { uint32_t *value, cnt; @@ -1893,9 +1911,8 @@ flags_out: SCTP_INP_RUNLOCK(inp); *value = cnt; *optsize = sizeof(uint32_t); + break; } - break; - case SCTP_GET_ASSOC_ID_LIST: { struct sctp_assoc_ids *ids; @@ -1915,10 +1932,12 @@ flags_out: } } SCTP_INP_RUNLOCK(inp); - ids->gaids_number_of_ids = at; - *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); + if (error == 0) { + ids->gaids_number_of_ids = at; + *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); + } + break; } - break; case SCTP_CONTEXT: { struct sctp_assoc_value *av; @@ -1930,19 +1949,27 @@ flags_out: av->assoc_value = stcb->asoc.context; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - av->assoc_value = inp->sctp_context; - SCTP_INP_RUNLOCK(inp); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_context; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); } - *optsize = sizeof(*av); + break; } - break; case SCTP_VRF_ID: { uint32_t *default_vrfid; SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize); *default_vrfid = inp->def_vrf_id; + *optsize = sizeof(uint32_t); break; } case SCTP_GET_ASOC_VRF: @@ -1954,9 +1981,10 @@ flags_out: if (stcb == NULL) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); - break; + } else { + id->assoc_value = stcb->asoc.vrf_id; + *optsize = sizeof(struct sctp_assoc_value); } - id->assoc_value = stcb->asoc.vrf_id; break; } case SCTP_GET_VRF_IDS: @@ -1976,13 +2004,13 @@ flags_out: gnv->gn_peers_tag = stcb->asoc.peer_vtag; gnv->gn_local_tag = stcb->asoc.my_vtag; SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_get_nonce_values); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } - *optsize = sizeof(*gnv); + break; } - break; case SCTP_DELAYED_SACK: { struct sctp_sack_info *sack; @@ -1994,15 +2022,21 @@ flags_out: sack->sack_freq = stcb->asoc.sack_freq; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - sack->sack_delay = 
TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); - sack->sack_freq = inp->sctp_ep.sctp_sack_freq; - SCTP_INP_RUNLOCK(inp); + if (sack->sack_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + sack->sack_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); + sack->sack_freq = inp->sctp_ep.sctp_sack_freq; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_sack_info); } - *optsize = sizeof(*sack); + break; } - break; - case SCTP_GET_SNDBUF_USE: { struct sctp_sockstat *ss; @@ -2015,13 +2049,13 @@ flags_out: ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue + stcb->asoc.size_on_all_streams); SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_sockstat); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } - *optsize = sizeof(struct sctp_sockstat); + break; } - break; case SCTP_MAX_BURST: { struct sctp_assoc_value *av; @@ -2033,14 +2067,20 @@ flags_out: av->assoc_value = stcb->asoc.max_burst; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - av->assoc_value = inp->sctp_ep.max_burst; - SCTP_INP_RUNLOCK(inp); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.max_burst; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(struct sctp_assoc_value); - + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; case SCTP_MAXSEG: { struct sctp_assoc_value *av; @@ -2053,21 +2093,28 @@ flags_out: av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc); SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - ovh = SCTP_MED_OVERHEAD; + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MED_OVERHEAD; + } else { + ovh = SCTP_MED_V4_OVERHEAD; + } + if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) + av->assoc_value = 0; + else + av->assoc_value = inp->sctp_frag_point - ovh; + SCTP_INP_RUNLOCK(inp); } else { - ovh = SCTP_MED_V4_OVERHEAD; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) - av->assoc_value = 0; - else - av->assoc_value = inp->sctp_frag_point - ovh; - SCTP_INP_RUNLOCK(inp); } - *optsize = sizeof(struct sctp_assoc_value); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; case SCTP_GET_STAT_LOG: error = sctp_fill_stat_log(optval, optsize); break; @@ -2076,7 +2123,7 @@ flags_out: struct sctp_event_subscribe *events; SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize); - memset(events, 0, sizeof(*events)); + memset(events, 0, sizeof(struct sctp_event_subscribe)); SCTP_INP_RLOCK(inp); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) events->sctp_data_io_event = 1; @@ -2112,9 +2159,8 @@ flags_out: events->sctp_stream_reset_event = 1; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(struct sctp_event_subscribe); + break; } - break; - case SCTP_ADAPTATION_LAYER: { uint32_t *value; @@ -2125,8 +2171,8 @@ flags_out: *value = inp->sctp_ep.adaptation_layer_indicator; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_SET_INITIAL_DBG_SEQ: { uint32_t *value; @@ -2136,8 +2182,8 @@ 
flags_out: *value = inp->sctp_ep.initial_sequence_debug; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_GET_LOCAL_ADDR_SIZE: { uint32_t *value; @@ -2147,8 +2193,8 @@ flags_out: *value = sctp_count_max_addresses(inp); SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_GET_REMOTE_ADDR_SIZE: { uint32_t *value; @@ -2184,13 +2230,13 @@ flags_out: } SCTP_TCB_UNLOCK(stcb); *value = (uint32_t) size; + *optsize = sizeof(uint32_t); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } - *optsize = sizeof(uint32_t); + break; } - break; case SCTP_GET_PEER_ADDRESSES: /* * Get the address information, an array is passed in to @@ -2260,8 +2306,8 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } + break; } - break; case SCTP_GET_LOCAL_ADDRESSES: { size_t limit, actual; @@ -2278,8 +2324,8 @@ flags_out: SCTP_TCB_UNLOCK(stcb); } *optsize = sizeof(struct sockaddr_storage) + actual; + break; } - break; case SCTP_PEER_ADDR_PARAMS: { struct sctp_paddrparams *paddrp; @@ -2416,38 +2462,45 @@ flags_out: paddrp->spp_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); } else { - /* Use endpoint defaults */ - SCTP_INP_RLOCK(inp); - paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; - paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); - paddrp->spp_assoc_id = (sctp_assoc_t) 0; - /* get inp's default */ + if (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC) { + /* Use endpoint defaults */ + SCTP_INP_RLOCK(inp); + paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; + paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); + paddrp->spp_assoc_id = SCTP_FUTURE_ASSOC; + /* get inp's default */ #ifdef INET - paddrp->spp_ipv4_tos = inp->ip_inp.inp.inp_ip_tos; - paddrp->spp_flags |= SPP_IPV4_TOS; + paddrp->spp_ipv4_tos = inp->ip_inp.inp.inp_ip_tos; + paddrp->spp_flags |= SPP_IPV4_TOS; #endif #ifdef INET6 - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - paddrp->spp_ipv6_flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo; - paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; - } + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + paddrp->spp_ipv6_flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo; + paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; + } #endif - /* can't return this */ - paddrp->spp_pathmtu = 0; + /* can't return this */ + paddrp->spp_pathmtu = 0; - /* default behavior, no stcb */ - paddrp->spp_flags = SPP_PMTUD_ENABLE; + /* default behavior, no stcb */ + paddrp->spp_flags = SPP_PMTUD_ENABLE; - if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { - paddrp->spp_flags |= SPP_HB_ENABLE; + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { + paddrp->spp_flags |= SPP_HB_ENABLE; + } else { + paddrp->spp_flags |= SPP_HB_DISABLE; + } + SCTP_INP_RUNLOCK(inp); } else { - paddrp->spp_flags |= SPP_HB_DISABLE; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_RUNLOCK(inp); } - *optsize = sizeof(struct sctp_paddrparams); + if (error == 0) { + *optsize = sizeof(struct sctp_paddrparams); + } + break; } - break; case SCTP_GET_PEER_ADDR_INFO: { struct sctp_paddrinfo *paddri; @@ -2491,6 +2544,7 @@ flags_out: paddri->spinfo_assoc_id = sctp_get_associd(stcb); paddri->spinfo_mtu = net->mtu; SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_paddrinfo); } else { if (stcb) { SCTP_TCB_UNLOCK(stcb); @@ -2498,9 +2552,8 @@ 
flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } - *optsize = sizeof(struct sctp_paddrinfo); + break; } - break; case SCTP_PCB_STATUS: { struct sctp_pcbinfo *spcb; @@ -2508,9 +2561,8 @@ flags_out: SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize); sctp_fill_pcbinfo(spcb); *optsize = sizeof(struct sctp_pcbinfo); + break; } - break; - case SCTP_STATUS: { struct sctp_nets *net; @@ -2520,7 +2572,7 @@ flags_out: SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id); if (stcb == NULL) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } @@ -2569,9 +2621,9 @@ flags_out: sstat->sstat_primary.spinfo_mtu = net->mtu; sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); - *optsize = sizeof(*sstat); + *optsize = sizeof(struct sctp_status); + break; } - break; case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; @@ -2585,15 +2637,22 @@ flags_out: srto->srto_min = stcb->asoc.minrto; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - srto->srto_initial = inp->sctp_ep.initial_rto; - srto->srto_max = inp->sctp_ep.sctp_maxrto; - srto->srto_min = inp->sctp_ep.sctp_minrto; - SCTP_INP_RUNLOCK(inp); + if (srto->srto_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + srto->srto_initial = inp->sctp_ep.initial_rto; + srto->srto_max = inp->sctp_ep.sctp_maxrto; + srto->srto_min = inp->sctp_ep.sctp_minrto; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_rtoinfo); } - *optsize = sizeof(*srto); + break; } - break; case SCTP_TIMEOUTS: { struct sctp_timeouts *stimo; @@ -2610,23 +2669,21 @@ flags_out: stimo->stimo_cookie = stcb->asoc.timocookie; stimo->stimo_shutdownack = stcb->asoc.timoshutdownack; SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_timeouts); } else { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - *optsize = sizeof(*stimo); + break; } - break; case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; - uint32_t oldval; SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize); SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); if (stcb) { - oldval = sasoc->sasoc_cookie_life; sasoc->sasoc_cookie_life = TICKS_TO_MSEC(stcb->asoc.cookie_life); sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times; sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; @@ -2634,17 +2691,24 @@ flags_out: sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); - sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; - sasoc->sasoc_number_peer_destinations = 0; - sasoc->sasoc_peer_rwnd = 0; - sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); - SCTP_INP_RUNLOCK(inp); + if (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); + sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; + sasoc->sasoc_number_peer_destinations = 0; + sasoc->sasoc_peer_rwnd = 0; + sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + 
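The SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC wildcards introduced in sctp_uio.h drive the per-option handling rewritten throughout this file: on a one-to-many socket the assoc_id field of an option structure now selects the endpoint default, the existing associations, or both. A small sketch using SCTP_CONTEXT and struct sctp_assoc_value (field names as used in the code above); note that getsockopt accepts only SCTP_FUTURE_ASSOC or a real association id.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <string.h>

int
set_context_everywhere(int sd, uint32_t ctx)
{
	struct sctp_assoc_value av;

	memset(&av, 0, sizeof(av));
	av.assoc_id = SCTP_ALL_ASSOC;	/* endpoint default plus every current association */
	av.assoc_value = ctx;
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_CONTEXT, &av, sizeof(av)));
}

int
get_default_context(int sd, uint32_t *ctx)
{
	struct sctp_assoc_value av;
	socklen_t len = sizeof(av);

	memset(&av, 0, sizeof(av));
	av.assoc_id = SCTP_FUTURE_ASSOC;	/* only the endpoint default is readable here */
	if (getsockopt(sd, IPPROTO_SCTP, SCTP_CONTEXT, &av, &len) < 0)
		return (-1);
	*ctx = av.assoc_value;
	return (0);
}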
} } - *optsize = sizeof(*sasoc); + if (error == 0) { + *optsize = sizeof(struct sctp_assocparams); + } + break; } - break; case SCTP_DEFAULT_SEND_PARAM: { struct sctp_sndrcvinfo *s_info; @@ -2656,13 +2720,20 @@ flags_out: memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send)); SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); - SCTP_INP_RUNLOCK(inp); + if (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_sndrcvinfo); } - *optsize = sizeof(*s_info); + break; } - break; case SCTP_INITMSG: { struct sctp_initmsg *sinit; @@ -2674,9 +2745,9 @@ flags_out: sinit->sinit_max_attempts = inp->sctp_ep.max_init_times; sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max; SCTP_INP_RUNLOCK(inp); - *optsize = sizeof(*sinit); + *optsize = sizeof(struct sctp_initmsg); + break; } - break; case SCTP_PRIMARY_ADDR: /* we allow a "get" operation on this */ { @@ -2697,14 +2768,13 @@ flags_out: &stcb->asoc.primary_destination->ro._l_addr, len); SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_setprim); } else { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - *optsize = sizeof(*ssp); + break; } - break; - case SCTP_HMAC_IDENT: { struct sctp_hmacalgo *shmac; @@ -2726,7 +2796,7 @@ flags_out: size = sizeof(*shmac) + (hmaclist->num_algo * sizeof(shmac->shmac_idents[0])); if ((size_t)(*optsize) < size) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_INP_RUNLOCK(inp); break; @@ -2752,12 +2822,19 @@ flags_out: scact->scact_keynumber = stcb->asoc.authinfo.active_keyid; SCTP_TCB_UNLOCK(stcb); } else { - /* get the endpoint active key */ - SCTP_INP_RLOCK(inp); - scact->scact_keynumber = inp->sctp_ep.default_keyid; - SCTP_INP_RUNLOCK(inp); + if (scact->scact_assoc_id == SCTP_FUTURE_ASSOC) { + /* get the endpoint active key */ + SCTP_INP_RLOCK(inp); + scact->scact_keynumber = inp->sctp_ep.default_keyid; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_authkeyid); } - *optsize = sizeof(*scact); break; } case SCTP_LOCAL_AUTH_CHUNKS: @@ -2780,24 +2857,30 @@ flags_out: } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { - /* get off the endpoint */ - SCTP_INP_RLOCK(inp); - chklist = inp->sctp_ep.local_auth_chunks; - /* is there enough space? */ - size = sctp_auth_get_chklist_size(chklist); - if (*optsize < (sizeof(struct sctp_authchunks) + size)) { - error = EINVAL; - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + if (sac->gauth_assoc_id == SCTP_FUTURE_ASSOC) { + /* get off the endpoint */ + SCTP_INP_RLOCK(inp); + chklist = inp->sctp_ep.local_auth_chunks; + /* is there enough space? 
*/ + size = sctp_auth_get_chklist_size(chklist); + if (*optsize < (sizeof(struct sctp_authchunks) + size)) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + /* copy in the chunks */ + (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + *optsize = sizeof(struct sctp_authchunks) + size; + } + SCTP_INP_RUNLOCK(inp); } else { - /* copy in the chunks */ - (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_RUNLOCK(inp); } - *optsize = sizeof(struct sctp_authchunks) + size; break; } case SCTP_PEER_AUTH_CHUNKS: @@ -2820,23 +2903,162 @@ flags_out: } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } - *optsize = sizeof(struct sctp_authchunks) + size; break; } + case SCTP_EVENT: + { + struct sctp_event *event; + uint32_t event_type; + + SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, *optsize); + SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); + + switch (event->se_type) { + case SCTP_ASSOC_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; + break; + case SCTP_PEER_ADDR_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; + break; + case SCTP_REMOTE_ERROR: + event_type = SCTP_PCB_FLAGS_RECVPEERERR; + break; + case SCTP_SEND_FAILED: + event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; + break; + case SCTP_SHUTDOWN_EVENT: + event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; + break; + case SCTP_ADAPTATION_INDICATION: + event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; + break; + case SCTP_PARTIAL_DELIVERY_EVENT: + event_type = SCTP_PCB_FLAGS_PDAPIEVNT; + break; + case SCTP_AUTHENTICATION_EVENT: + event_type = SCTP_PCB_FLAGS_AUTHEVNT; + break; + case SCTP_STREAM_RESET_EVENT: + event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; + break; + case SCTP_SENDER_DRY_EVENT: + event_type = SCTP_PCB_FLAGS_DRYEVNT; + break; + case SCTP_NOTIFICATIONS_STOPPED_EVENT: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); + error = ENOTSUP; + break; + default: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (event_type > 0) { + if (stcb) { + event->se_on = sctp_stcb_is_feature_on(inp, stcb, event_type); + SCTP_TCB_UNLOCK(stcb); + } else { + if (event->se_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + event->se_on = sctp_is_feature_on(inp, event_type); + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_event); + } + break; + } + case SCTP_RECVRCVINFO: + { + int onoff; + + if (*optsize < sizeof(int)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } else { + SCTP_INP_RUNLOCK(inp); + onoff = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); + SCTP_INP_RUNLOCK(inp); + } + if (error == 0) { + /* return the option value */ + *(int *)optval = onoff; + *optsize = sizeof(int); + } + break; + } + case SCTP_RECVNXTINFO: + { + int onoff; + + if (*optsize < sizeof(int)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } else { + SCTP_INP_RUNLOCK(inp); + onoff = sctp_is_feature_on(inp, 
SCTP_PCB_FLAGS_RECVNXTINFO); + SCTP_INP_RUNLOCK(inp); + } + if (error == 0) { + /* return the option value */ + *(int *)optval = onoff; + *optsize = sizeof(int); + } + break; + } + case SCTP_DEFAULT_SNDINFO: + { + struct sctp_sndinfo *info; + SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, *optsize); + SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); + if (stcb) { + info->snd_sid = stcb->asoc.def_send.sinfo_stream; + info->snd_flags = stcb->asoc.def_send.sinfo_flags; + info->snd_ppid = stcb->asoc.def_send.sinfo_ppid; + info->snd_context = stcb->asoc.def_send.sinfo_context; + SCTP_TCB_UNLOCK(stcb); + } else { + if (info->snd_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + info->snd_sid = inp->def_send.sinfo_stream; + info->snd_flags = inp->def_send.sinfo_flags; + info->snd_ppid = inp->def_send.sinfo_ppid; + info->snd_context = inp->def_send.sinfo_context; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_sndinfo); + } + break; + } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; - *optsize = 0; break; } /* end switch (sopt->sopt_name) */ + if (error) { + *optsize = 0; + } return (error); } @@ -2949,8 +3171,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); else sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE); + break; } - break; case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; @@ -2962,8 +3184,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, break; } inp->partial_delivery_point = *value; + break; } - break; case SCTP_FRAGMENT_INTERLEAVE: /* not yet until we re-write sctp_recvmsg() */ { @@ -2984,83 +3206,95 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } + break; } - break; case SCTP_CMT_ON_OFF: if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + if (av->assoc_value > SCTP_CMT_MAX) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - if (av->assoc_value > SCTP_CMT_MAX) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - } else { - stcb->asoc.sctp_cmt_on_off = av->assoc_value; - } + stcb->asoc.sctp_cmt_on_off = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { - if (av->assoc_value > SCTP_CMT_MAX) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - } else { + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_cmt_on_off = av->assoc_value; SCTP_INP_WUNLOCK(inp); } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.sctp_cmt_on_off = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + } + } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } break; - /* JRS - Set socket option for pluggable congestion control */ case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; struct sctp_nets *net; SCTP_CHECK_AND_CAST(av, optval, 
struct sctp_assoc_value, optsize); + if ((av->assoc_value != SCTP_CC_RFC2581) && + (av->assoc_value != SCTP_CC_HSTCP) && + (av->assoc_value != SCTP_CC_HTCP) && + (av->assoc_value != SCTP_CC_RTCC)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - switch (av->assoc_value) { - case SCTP_CC_RFC2581: - case SCTP_CC_HSTCP: - case SCTP_CC_HTCP: - case SCTP_CC_RTCC: - stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; - stcb->asoc.congestion_control_module = av->assoc_value; - if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { - TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); - } + stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; + stcb->asoc.congestion_control_module = av->assoc_value; + if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); } - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; } SCTP_TCB_UNLOCK(stcb); } else { - switch (av->assoc_value) { - case SCTP_CC_RFC2581: - case SCTP_CC_HSTCP: - case SCTP_CC_HTCP: - case SCTP_CC_RTCC: + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_cc_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; + stcb->asoc.congestion_control_module = av->assoc_value; + if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); + } + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } } + break; } - break; case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; @@ -3068,7 +3302,19 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, optsize); SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id); if (stcb == NULL) { - error = EINVAL; + if (cc_opt->aid_value.assoc_id == SCTP_CURRENT_ASSOC) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (stcb->asoc.cc_functions.sctp_cwnd_socket_option) { + (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt); + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } else { + error = EINVAL; + } } else { if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; @@ -3078,54 +3324,54 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } + break; } - break; - /* RS - Set socket option for pluggable stream scheduling */ case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + if ((av->assoc_value != SCTP_SS_DEFAULT) && + (av->assoc_value != SCTP_SS_DEFAULT) && + (av->assoc_value != SCTP_SS_ROUND_ROBIN) && + (av->assoc_value != SCTP_SS_ROUND_ROBIN_PACKET) && + 
(av->assoc_value != SCTP_SS_PRIORITY) && + (av->assoc_value != SCTP_SS_FAIR_BANDWITH) && + (av->assoc_value != SCTP_SS_FIRST_COME)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - switch (av->assoc_value) { - case SCTP_SS_DEFAULT: - case SCTP_SS_ROUND_ROBIN: - case SCTP_SS_ROUND_ROBIN_PACKET: - case SCTP_SS_PRIORITY: - case SCTP_SS_FAIR_BANDWITH: - case SCTP_SS_FIRST_COME: - stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); - stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; - stcb->asoc.stream_scheduling_module = av->assoc_value; - stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; - } + stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); + stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; + stcb->asoc.stream_scheduling_module = av->assoc_value; + stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); SCTP_TCB_UNLOCK(stcb); } else { - switch (av->assoc_value) { - case SCTP_SS_DEFAULT: - case SCTP_SS_ROUND_ROBIN: - case SCTP_SS_ROUND_ROBIN_PACKET: - case SCTP_SS_PRIORITY: - case SCTP_SS_FAIR_BANDWITH: - case SCTP_SS_FIRST_COME: + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_ss_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); + stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; + stcb->asoc.stream_scheduling_module = av->assoc_value; + stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } } + break; } - break; case SCTP_SS_VALUE: { struct sctp_stream_value *av; @@ -3140,15 +3386,29 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - /* - * Can't set stream value without - * association - */ - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; + if (av->assoc_id == SCTP_CURRENT_ASSOC) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.ss_functions.sctp_ss_set_value(stcb, + &stcb->asoc, + &stcb->asoc.strmout[av->stream_id], + av->stream_value); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + + } else { + /* + * Can't set stream value without + * association + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } + break; } - break; case SCTP_CLR_STAT_LOG: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; @@ -3164,12 +3424,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, stcb->asoc.context = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - inp->sctp_context = av->assoc_value; - SCTP_INP_WUNLOCK(inp); + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->sctp_context = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } + 
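The new struct sctp_event / SCTP_EVENT option (its getsockopt side appears above) replaces the all-or-nothing struct sctp_event_subscribe with one on/off switch per notification type. A sketch of subscribing to association-change notifications; the setsockopt direction is assumed to mirror the getsockopt handler shown here, and error handling is omitted.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <string.h>

int
enable_assoc_change_events(int sd)
{
	struct sctp_event ev;

	memset(&ev, 0, sizeof(ev));
	ev.se_assoc_id = SCTP_FUTURE_ASSOC;	/* apply to associations created later */
	ev.se_type = SCTP_ASSOC_CHANGE;		/* one notification type per call */
	ev.se_on = 1;
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_EVENT, &ev, sizeof(ev)));
}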
if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.context = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } + break; } - break; case SCTP_VRF_ID: { uint32_t *default_vrfid; @@ -3204,12 +3477,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (sack->sack_delay) { if (sack->sack_delay > SCTP_MAX_SACK_DELAY) sack->sack_delay = SCTP_MAX_SACK_DELAY; + if (MSEC_TO_TICKS(sack->sack_delay) < 1) { + sack->sack_delay = TICKS_TO_MSEC(1); + } } if (stcb) { if (sack->sack_delay) { - if (MSEC_TO_TICKS(sack->sack_delay) < 1) { - sack->sack_delay = TICKS_TO_MSEC(1); - } stcb->asoc.delayed_ack = sack->sack_delay; } if (sack->sack_freq) { @@ -3217,17 +3490,32 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sack->sack_delay) { - if (MSEC_TO_TICKS(sack->sack_delay) < 1) { - sack->sack_delay = TICKS_TO_MSEC(1); + if ((sack->sack_assoc_id == SCTP_FUTURE_ASSOC) || + (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sack->sack_delay) { + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); } - inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); + if (sack->sack_freq) { + inp->sctp_ep.sctp_sack_freq = sack->sack_freq; + } + SCTP_INP_WUNLOCK(inp); } - if (sack->sack_freq) { - inp->sctp_ep.sctp_sack_freq = sack->sack_freq; + if ((sack->sack_assoc_id == SCTP_CURRENT_ASSOC) || + (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (sack->sack_delay) { + stcb->asoc.delayed_ack = sack->sack_delay; + } + if (sack->sack_freq) { + stcb->asoc.sack_freq = sack->sack_freq; + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3255,10 +3543,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize); SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id); - size = optsize - sizeof(*sca); + size = optsize - sizeof(struct sctp_authkey); if (stcb) { - /* set it on the assoc */ shared_keys = &stcb->asoc.shared_keys; /* clear the cached keys for this key id */ sctp_clear_cachedkeys(stcb, sca->sca_keynumber); @@ -3288,39 +3575,76 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_TCB_UNLOCK(stcb); } else { - /* set it on the endpoint */ - SCTP_INP_WLOCK(inp); - shared_keys = &inp->sctp_ep.shared_keys; - /* - * clear the cached keys on all assocs for - * this key id - */ - sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); - /* - * create the new shared key and - * insert/replace it - */ - if (size > 0) { - key = sctp_set_key(sca->sca_key, (uint32_t) size); - if (key == NULL) { + if ((sca->sca_assoc_id == SCTP_FUTURE_ASSOC) || + (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + shared_keys = &inp->sctp_ep.shared_keys; + /* + * clear the cached keys on all + * assocs for this key id + */ + sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); + /* + * create the new shared key and + * insert/replace it + */ + if (size > 0) { + key = sctp_set_key(sca->sca_key, (uint32_t) size); + if (key == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, 
SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_INP_WUNLOCK(inp); + break; + } + } + shared_key = sctp_alloc_sharedkey(); + if (shared_key == NULL) { + sctp_free_key(key); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_INP_WUNLOCK(inp); break; } - } - shared_key = sctp_alloc_sharedkey(); - if (shared_key == NULL) { - sctp_free_key(key); - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); - error = ENOMEM; + shared_key->key = key; + shared_key->keyid = sca->sca_keynumber; + error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_INP_WUNLOCK(inp); - break; } - shared_key->key = key; - shared_key->keyid = sca->sca_keynumber; - error = sctp_insert_sharedkey(shared_keys, shared_key); - SCTP_INP_WUNLOCK(inp); + if ((sca->sca_assoc_id == SCTP_CURRENT_ASSOC) || + (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + shared_keys = &stcb->asoc.shared_keys; + /* + * clear the cached keys for + * this key id + */ + sctp_clear_cachedkeys(stcb, sca->sca_keynumber); + /* + * create the new shared key + * and insert/replace it + */ + if (size > 0) { + key = sctp_set_key(sca->sca_key, (uint32_t) size); + if (key == NULL) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + } + shared_key = sctp_alloc_sharedkey(); + if (shared_key == NULL) { + sctp_free_key(key); + SCTP_TCB_UNLOCK(stcb); + continue; + } + shared_key->key = key; + shared_key->keyid = sca->sca_keynumber; + error = sctp_insert_sharedkey(shared_keys, shared_key); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } break; } @@ -3330,7 +3654,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_hmaclist_t *hmaclist; uint16_t hmacid; uint32_t i; - size_t found; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize); @@ -3381,8 +3704,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_authkeyid *scact; - SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, - optsize); + SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); /* set the active key on the right place */ @@ -3397,16 +3719,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - /* set the active key on the endpoint */ - SCTP_INP_WLOCK(inp); - if (sctp_auth_setactivekey_ep(inp, - scact->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); - error = EINVAL; + if ((scact->scact_assoc_id == SCTP_FUTURE_ASSOC) || + (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sctp_auth_setactivekey_ep(inp, scact->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + if ((scact->scact_assoc_id == SCTP_CURRENT_ASSOC) || + (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_auth_setactivekey(stcb, scact->scact_keynumber); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3414,30 +3745,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_authkeyid *scdel; - SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, - optsize); + SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, 
optsize); SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id); /* delete the key from the right place */ if (stcb) { - if (sctp_delete_sharedkey(stcb, - scdel->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); + if (sctp_delete_sharedkey(stcb, scdel->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sctp_delete_sharedkey_ep(inp, - scdel->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); - error = EINVAL; + if ((scdel->scact_assoc_id == SCTP_FUTURE_ASSOC) || + (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sctp_delete_sharedkey_ep(inp, scdel->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + if ((scdel->scact_assoc_id == SCTP_CURRENT_ASSOC) || + (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_delete_sharedkey(stcb, scdel->scact_keynumber); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3445,30 +3782,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_authkeyid *keyid; - SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, - optsize); + SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id); /* deactivate the key from the right place */ if (stcb) { - if (sctp_deact_sharedkey(stcb, - keyid->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); + if (sctp_deact_sharedkey(stcb, keyid->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sctp_deact_sharedkey_ep(inp, - keyid->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); - error = EINVAL; + if ((keyid->scact_assoc_id == SCTP_FUTURE_ASSOC) || + (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sctp_deact_sharedkey_ep(inp, keyid->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + if ((keyid->scact_assoc_id == SCTP_CURRENT_ASSOC) || + (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_deact_sharedkey(stcb, keyid->scact_keynumber); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3632,9 +3975,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); + break; } - break; - case SCTP_CONNECT_X: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3643,7 +3985,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } error = sctp_do_connect_x(so, inp, optval, optsize, p, 0); break; - case SCTP_CONNECT_X_DELAYED: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3652,7 +3993,6 @@ sctp_setopt(struct socket 
*so, int optname, void *optval, size_t optsize, } error = sctp_do_connect_x(so, inp, optval, optsize, p, 1); break; - case SCTP_CONNECT_X_COMPLETE: { struct sockaddr *sa; @@ -3706,8 +4046,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = EALREADY; } SCTP_TCB_UNLOCK(stcb); + break; } - break; case SCTP_MAX_BURST: { struct sctp_assoc_value *av; @@ -3719,12 +4059,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, stcb->asoc.max_burst = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - inp->sctp_ep.max_burst = av->assoc_value; - SCTP_INP_WUNLOCK(inp); + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->sctp_ep.max_burst = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.max_burst = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } + break; } - break; case SCTP_MAXSEG: { struct sctp_assoc_value *av; @@ -3746,20 +4099,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - /* - * FIXME MT: I think this is not in tune - * with the API ID - */ - if (av->assoc_value) { - inp->sctp_frag_point = (av->assoc_value + ovh); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + /* + * FIXME MT: I think this is not in + * tune with the API ID + */ + if (av->assoc_value) { + inp->sctp_frag_point = (av->assoc_value + ovh); + } else { + inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + } + SCTP_INP_WUNLOCK(inp); } else { - inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_EVENTS: { struct sctp_event_subscribe *events; @@ -3823,22 +4181,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (events->sctp_sender_dry_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT); - if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || - (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { - stcb = LIST_FIRST(&inp->sctp_asoc_list); - if (stcb) { - SCTP_TCB_LOCK(stcb); - } - if (stcb && - TAILQ_EMPTY(&stcb->asoc.send_queue) && - TAILQ_EMPTY(&stcb->asoc.sent_queue) && - (stcb->asoc.stream_queue_cnt == 0)) { - sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); - } - if (stcb) { - SCTP_TCB_UNLOCK(stcb); - } - } } else { sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT); } @@ -3849,9 +4191,84 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); } SCTP_INP_WUNLOCK(inp); - } - break; + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (events->sctp_association_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT); + } + if (events->sctp_address_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT); + } + if (events->sctp_send_failure_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, 
SCTP_PCB_FLAGS_RECVSENDFAILEVNT); + } + if (events->sctp_peer_error_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR); + } + if (events->sctp_shutdown_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); + } + if (events->sctp_partial_delivery_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT); + } + if (events->sctp_adaptation_layer_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT); + } + if (events->sctp_authentication_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT); + } + if (events->sctp_sender_dry_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT); + } + if (events->sctp_stream_reset_event) { + sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT); + } else { + sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT); + } + SCTP_TCB_UNLOCK(stcb); + } + /* + * Send up the sender dry event only for 1-to-1 + * style sockets. + */ + if (events->sctp_sender_dry_event) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb) { + SCTP_TCB_LOCK(stcb); + if (TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); + } + SCTP_TCB_UNLOCK(stcb); + } + } + } + SCTP_INP_RUNLOCK(inp); + break; + } case SCTP_ADAPTATION_LAYER: { struct sctp_setadaptation *adap_bits; @@ -3860,8 +4277,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_INP_WLOCK(inp); inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind; SCTP_INP_WUNLOCK(inp); + break; } - break; #ifdef SCTP_DEBUG case SCTP_SET_INITIAL_DBG_SEQ: { @@ -3871,8 +4288,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_INP_WLOCK(inp); inp->sctp_ep.initial_sequence_debug = *vvv; SCTP_INP_WUNLOCK(inp); + break; } - break; #endif case SCTP_DEFAULT_SEND_PARAM: { @@ -3882,7 +4299,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); if (stcb) { - if (s_info->sinfo_stream <= stcb->asoc.streamoutcnt) { + if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3890,12 +4307,27 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); - SCTP_INP_WUNLOCK(inp); + if ((s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) || + (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); + SCTP_INP_WUNLOCK(inp); + } + if ((s_info->sinfo_assoc_id == SCTP_CURRENT_ASSOC) || + (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { + 
SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { + memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } + break; } - break; case SCTP_PEER_ADDR_PARAMS: /* Applys to the specific association */ { @@ -4116,31 +4548,37 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); } else { /************************NO TCB, SET TO default stuff ******************/ - SCTP_INP_WLOCK(inp); - /* - * For the TOS/FLOWLABEL stuff you set it - * with the options on the socket - */ - if (paddrp->spp_pathmaxrxt) { - inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; - } - if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) - inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; - else if (paddrp->spp_hbinterval) { - if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) - paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; - inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); - } - if (paddrp->spp_flags & SPP_HB_ENABLE) { - sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + if (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + /* + * For the TOS/FLOWLABEL stuff you + * set it with the options on the + * socket + */ + if (paddrp->spp_pathmaxrxt) { + inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; + } + if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; + else if (paddrp->spp_hbinterval) { + if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) + paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); + } + if (paddrp->spp_flags & SPP_HB_ENABLE) { + sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); - } else if (paddrp->spp_flags & SPP_HB_DISABLE) { - sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + } else if (paddrp->spp_flags & SPP_HB_DISABLE) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; @@ -4172,31 +4610,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (srto->srto_initial) - new_init = srto->srto_initial; - else - new_init = inp->sctp_ep.initial_rto; - if (srto->srto_max) - new_max = srto->srto_max; - else - new_max = inp->sctp_ep.sctp_maxrto; - if (srto->srto_min) - new_min = srto->srto_min; - else - new_min = inp->sctp_ep.sctp_minrto; - if ((new_min <= new_init) && (new_init <= new_max)) { - inp->sctp_ep.initial_rto = new_init; - inp->sctp_ep.sctp_maxrto = new_max; - inp->sctp_ep.sctp_minrto = new_min; + if (srto->srto_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + if (srto->srto_initial) + new_init = srto->srto_initial; + else + new_init = inp->sctp_ep.initial_rto; + if (srto->srto_max) + new_max = srto->srto_max; + else + new_max = inp->sctp_ep.sctp_maxrto; + if (srto->srto_min) + new_min = srto->srto_min; + else + new_min = inp->sctp_ep.sctp_minrto; + if ((new_min <= new_init) && (new_init <= new_max)) { + inp->sctp_ep.initial_rto = new_init; + inp->sctp_ep.sctp_maxrto = new_max; + inp->sctp_ep.sctp_minrto = 
new_min; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; @@ -4214,27 +4657,26 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (stcb) { if (sasoc->sasoc_asocmaxrxt) stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt; - sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; - sasoc->sasoc_peer_rwnd = 0; - sasoc->sasoc_local_rwnd = 0; if (sasoc->sasoc_cookie_life) { stcb->asoc.cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sasoc->sasoc_asocmaxrxt) - inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; - sasoc->sasoc_number_peer_destinations = 0; - sasoc->sasoc_peer_rwnd = 0; - sasoc->sasoc_local_rwnd = 0; - if (sasoc->sasoc_cookie_life) { - inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); + if (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + if (sasoc->sasoc_asocmaxrxt) + inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; + if (sasoc->sasoc_cookie_life) { + inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_INITMSG: { struct sctp_initmsg *sinit; @@ -4253,12 +4695,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (sinit->sinit_max_init_timeo) inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo; SCTP_INP_WUNLOCK(inp); + break; } - break; case SCTP_PRIMARY_ADDR: { struct sctp_setprim *spa; - struct sctp_nets *net, *lnet; + struct sctp_nets *net; SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize); SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id); @@ -4287,7 +4729,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if ((net != stcb->asoc.primary_destination) && (!(net->dest_state & SCTP_ADDR_UNCONFIRMED))) { /* Ok we need to set it */ - lnet = stcb->asoc.primary_destination; if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { if (net->dest_state & SCTP_ADDR_SWITCH_PRIMARY) { net->dest_state |= SCTP_ADDR_DOUBLE_SWITCH; @@ -4302,8 +4743,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (stcb) { SCTP_TCB_UNLOCK(stcb); } + break; } - break; case SCTP_SET_DYNAMIC_PRIMARY: { union sctp_sockstore *ss; @@ -4316,8 +4757,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize); /* SUPER USER CHECK? 
*/ error = sctp_dynamic_set_primary(&ss->sa, vrf_id); + break; } - break; case SCTP_SET_PEER_PRIMARY_ADDR: { struct sctp_setpeerprim *sspp; @@ -4370,9 +4811,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - + break; } - break; case SCTP_BINDX_ADD_ADDR: { struct sctp_getaddresses *addrs; @@ -4418,8 +4858,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_bindx_add_address(so, inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error, p); + break; } - break; case SCTP_BINDX_REM_ADDR: { struct sctp_getaddresses *addrs; @@ -4465,8 +4905,187 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_bindx_delete_address(so, inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error); + break; + } + case SCTP_EVENT: + { + struct sctp_event *event; + uint32_t event_type; + + SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, optsize); + SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); + switch (event->se_type) { + case SCTP_ASSOC_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; + break; + case SCTP_PEER_ADDR_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; + break; + case SCTP_REMOTE_ERROR: + event_type = SCTP_PCB_FLAGS_RECVPEERERR; + break; + case SCTP_SEND_FAILED: + event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; + break; + case SCTP_SHUTDOWN_EVENT: + event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; + break; + case SCTP_ADAPTATION_INDICATION: + event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; + break; + case SCTP_PARTIAL_DELIVERY_EVENT: + event_type = SCTP_PCB_FLAGS_PDAPIEVNT; + break; + case SCTP_AUTHENTICATION_EVENT: + event_type = SCTP_PCB_FLAGS_AUTHEVNT; + break; + case SCTP_STREAM_RESET_EVENT: + event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; + break; + case SCTP_SENDER_DRY_EVENT: + event_type = SCTP_PCB_FLAGS_DRYEVNT; + break; + case SCTP_NOTIFICATIONS_STOPPED_EVENT: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); + error = ENOTSUP; + break; + default: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (event_type > 0) { + if (stcb) { + if (event->se_on) { + sctp_stcb_feature_on(inp, stcb, event_type); + if (event_type == SCTP_PCB_FLAGS_DRYEVNT) { + if (TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); + } + } + } else { + sctp_stcb_feature_off(inp, stcb, event_type); + } + SCTP_TCB_UNLOCK(stcb); + } else { + /* + * We don't want to send up a storm + * of events, so return an error for + * sender dry events + */ + if ((event_type == SCTP_PCB_FLAGS_DRYEVNT) && + ((event->se_assoc_id == SCTP_ALL_ASSOC) || + (event->se_assoc_id == SCTP_CURRENT_ASSOC))) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); + error = ENOTSUP; + break; + } + if ((event->se_assoc_id == SCTP_FUTURE_ASSOC) || + (event->se_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (event->se_on) { + sctp_feature_on(inp, event_type); + } else { + sctp_feature_off(inp, event_type); + } + SCTP_INP_WUNLOCK(inp); + } + if ((event->se_assoc_id == SCTP_CURRENT_ASSOC) || + (event->se_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (event->se_on) { + sctp_stcb_feature_on(inp, stcb, event_type); + 
} else { + sctp_stcb_feature_off(inp, stcb, event_type); + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + } + } + break; + } + case SCTP_RECVRCVINFO: + { + int *onoff; + + SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); + SCTP_INP_WLOCK(inp); + if (*onoff != 0) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO); + } + SCTP_INP_WUNLOCK(inp); + break; + } + case SCTP_RECVNXTINFO: + { + int *onoff; + + SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); + SCTP_INP_WLOCK(inp); + if (*onoff != 0) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO); + } + SCTP_INP_WUNLOCK(inp); + break; + } + case SCTP_DEFAULT_SNDINFO: + { + struct sctp_sndinfo *info; + + SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, optsize); + SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); + + if (stcb) { + if (info->snd_sid < stcb->asoc.streamoutcnt) { + stcb->asoc.def_send.sinfo_stream = info->snd_sid; + stcb->asoc.def_send.sinfo_flags = info->snd_flags; + stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; + stcb->asoc.def_send.sinfo_context = info->snd_context; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + if ((info->snd_assoc_id == SCTP_FUTURE_ASSOC) || + (info->snd_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->def_send.sinfo_stream = info->snd_sid; + inp->def_send.sinfo_flags = info->snd_flags; + inp->def_send.sinfo_ppid = info->snd_ppid; + inp->def_send.sinfo_context = info->snd_context; + SCTP_INP_WUNLOCK(inp); + } + if ((info->snd_assoc_id == SCTP_CURRENT_ASSOC) || + (info->snd_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (info->snd_sid < stcb->asoc.streamoutcnt) { + stcb->asoc.def_send.sinfo_stream = info->snd_sid; + stcb->asoc.def_send.sinfo_flags = info->snd_flags; + stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; + stcb->asoc.def_send.sinfo_context = info->snd_context; + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + } + break; } - break; default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; diff --git a/sys/netinet/sctp_var.h b/sys/netinet/sctp_var.h index 1e17900..e48dfe4 100644 --- a/sys/netinet/sctp_var.h +++ b/sys/netinet/sctp_var.h @@ -50,6 +50,30 @@ extern struct pr_usrreqs sctp_usrreqs; #define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature) #define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0) +#define sctp_stcb_feature_on(inp, stcb, feature) {\ + if (stcb) { \ + stcb->asoc.sctp_features |= feature; \ + } else { \ + inp->sctp_features |= feature; \ + } \ +} +#define sctp_stcb_feature_off(inp, stcb, feature) {\ + if (stcb) { \ + stcb->asoc.sctp_features &= ~feature; \ + } else { \ + inp->sctp_features &= ~feature; \ + } \ +} +#define sctp_stcb_is_feature_on(inp, stcb, feature) \ + (((stcb != NULL) && \ + ((stcb->asoc.sctp_features & feature) == feature)) || \ + ((stcb == NULL) && \ + ((inp->sctp_features & feature) == feature))) +#define sctp_stcb_is_feature_off(inp, stcb, feature) \ + (((stcb != NULL) && \ + ((stcb->asoc.sctp_features & feature) == 0)) || \ + ((stcb == NULL) && \ + ((inp->sctp_features & feature) == 0))) /* managing mobility_feature in inpcb (by micchie) */ #define sctp_mobility_feature_on(inp, feature) 
(inp->sctp_mobility_features |= feature) diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index 8f04bf1..9a8bd2e 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -923,6 +923,7 @@ sctp_init_asoc(struct sctp_inpcb *m, struct sctp_tcb *stcb, asoc->sctp_nr_sack_on_off = (uint8_t) SCTP_BASE_SYSCTL(sctp_nr_sack_on_off); asoc->sctp_cmt_pf = (uint8_t) SCTP_BASE_SYSCTL(sctp_cmt_pf); asoc->sctp_frag_point = m->sctp_frag_point; + asoc->sctp_features = m->sctp_features; #ifdef INET asoc->default_tos = m->ip_inp.inp.inp_ip_tos; #else @@ -2760,7 +2761,7 @@ sctp_notify_assoc_change(uint32_t event, struct sctp_tcb *stcb, } #endif } - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { /* event not enabled */ return; } @@ -2831,7 +2832,7 @@ sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, struct sctp_paddr_change *spc; struct sctp_queued_to_read *control; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVPADDREVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT)) { /* event not enabled */ return; } @@ -2914,7 +2915,7 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint32_t error, struct sctp_queued_to_read *control; int length; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { /* event not enabled */ return; } @@ -2997,7 +2998,7 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, struct sctp_queued_to_read *control; int length; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { /* event not enabled */ return; } @@ -3067,7 +3068,7 @@ sctp_notify_adaptation_layer(struct sctp_tcb *stcb, struct sctp_adaptation_event *sai; struct sctp_queued_to_read *control; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) { /* event not enabled */ return; } @@ -3118,7 +3119,7 @@ sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error, struct sctp_queued_to_read *control; struct sockbuf *sb; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_PDAPIEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_PDAPIEVNT)) { /* event not enabled */ return; } @@ -3231,7 +3232,7 @@ sctp_notify_shutdown_event(struct sctp_tcb *stcb) SCTP_SOCKET_UNLOCK(so, 1); #endif } - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) { /* event not enabled */ return; } @@ -3278,7 +3279,7 @@ sctp_notify_sender_dry_event(struct sctp_tcb *stcb, struct sctp_sender_dry_event *event; struct sctp_queued_to_read *control; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_DRYEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DRYEVNT)) { /* event not enabled */ return; } @@ -5490,7 +5491,8 @@ found_one: if ((sinfo) && filling_sinfo) { memcpy(sinfo, control, sizeof(struct sctp_nonpad_sndrcvinfo)); nxt = TAILQ_NEXT(control, next); - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || + sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { struct sctp_extrcvinfo 
*s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; @@ -5997,7 +5999,8 @@ out: if (((out_flags & MSG_EOR) == 0) && ((in_flags & MSG_PEEK) == 0) && (sinfo) && - (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO))) { + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || + sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO))) { struct sctp_extrcvinfo *s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; @@ -6147,8 +6150,9 @@ sctp_soreceive(struct socket *so, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); return (EINVAL); } - if ((sctp_is_feature_off(inp, - SCTP_PCB_FLAGS_RECVDATAIOEVNT)) || + if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) || (controlp == NULL)) { /* user does not want the sndrcv ctl */ filling_sinfo = 0; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 4b5fa10..32cb81e 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1331,7 +1331,7 @@ out: * then remember the size of the advertised window. * Any pending ACK has now been sent. */ - if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) + if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index 231ce94..fe3e48a 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -78,6 +78,61 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> +struct mtx nd6sock_mtx; +#define ND6SOCK_LOCK_INIT() \ + mtx_init(&nd6sock_mtx, "nd6sock_mtx", NULL, MTX_DEF) +#define ND6SOCK_LOCK() mtx_lock(&nd6sock_mtx) +#define ND6SOCK_UNLOCK() mtx_unlock(&nd6sock_mtx) +#define ND6SOCK_LOCK_DESTROY() mtx_destroy(&nd6sock_mtx) + +static struct pr_usrreqs nd6_usrreqs = { + .pru_attach = nd6_attach, + .pru_send = nd6_send, + .pru_detach = nd6_close, +}; + +static struct protosw nd6_protosw = { + .pr_type = SOCK_RAW, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_ctlinput = raw_ctlinput, + .pr_init = raw_init, + .pr_protocol = IPPROTO_ND6, + .pr_usrreqs = &nd6_usrreqs, +}; + +static int +nd6_attach(struct socket *so, int proto, struct thread *td) +{ + int error; + + ND6SOCK_LOCK(); + if (V_nd6_so != NULL) { + ND6SOCK_UNLOCK(); + return (EEXIST); + } + + error = priv_check(td, PRIV_NETINET_RAW); + if (error) { + ND6SOCK_UNLOCK(); + return(error); + } + + if (proto != IPPROTO_ND6) { + ND6SOCK_UNLOCK(); + return (EPROTONOSUPPORT); + } + error = soreserve(so, nd6_sendspace, nd6_recvspace); + if (error) { + ND6SOCK_UNLOCK(); + return(error); + } + + V_nd6_so = so; + ND6SOCK_UNLOCK(); + + return (0); +} + #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ diff --git a/sys/netinet6/nd6.h b/sys/netinet6/nd6.h index 6f63192..8ba3723 100644 --- a/sys/netinet6/nd6.h +++ b/sys/netinet6/nd6.h @@ -319,6 +319,16 @@ struct nd_pfxrouter { LIST_HEAD(nd_prhead, nd_prefix); +struct sockaddr_nd6 { + uint8_t snd6_len; /* total length */ + sa_family_t snd6_family; /* address family */ + int snd6_direction; + int snd6_ifidx; + char snd6_zero[8]; +}; + +extern int (*nd6_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); + /* nd6.c */ VNET_DECLARE(int, nd6_prune); VNET_DECLARE(int, nd6_delay); diff --git a/sys/pc98/conf/GENERIC b/sys/pc98/conf/GENERIC index 0fac1a9..e2bed44 100644 --- a/sys/pc98/conf/GENERIC +++ b/sys/pc98/conf/GENERIC @@
-139,12 +139,6 @@ device sc # Add suspend/resume support for the i8254. #device pmtimer -# Audio support -#device sound # Generic sound driver -#device snd_mss # Microsoft Sound System -#device "snd_sb16" # Sound Blaster 16 -#device snd_sbc # Sound Blaster - # PCCARD (PCMCIA) support # PCMCIA and cardbus bridge support device cbb # cardbus (yenta) bridge @@ -288,3 +282,10 @@ device bpf # Berkeley packet filter #device firewire # FireWire bus code #device sbp # SCSI over FireWire (Requires scbus and da) #device fwe # Ethernet over FireWire (non-standard!) + +# Sound support +#device sound # Generic sound driver (required) +#device snd_mss # Microsoft Sound System +#device "snd_sb16" # Sound Blaster 16 +#device snd_sbc # Sound Blaster +#device snd_uaudio # USB Audio diff --git a/sys/powerpc/conf/GENERIC b/sys/powerpc/conf/GENERIC index 54d4639..2950a49 100644 --- a/sys/powerpc/conf/GENERIC +++ b/sys/powerpc/conf/GENERIC @@ -174,7 +174,9 @@ device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) # Misc -device ad7417 # PowerMac7,2 temperature sensor +device iicbus # I2C bus code +device kiic # Keywest I2C +device ad7417 # PowerMac7,2 temperature sensor device ds1775 # PowerMac7,2 temperature sensor device fcu # Apple Fan Control Unit device max6690 # PowerMac7,2 temperature sensor @@ -187,7 +189,9 @@ device adb device cuda device pmu -# Powermac I2C support -device iicbus # I2C bus code -device kiic # Keywest I2C +# Sound support +device sound # Generic sound driver (required) +device snd_ai2s # Apple I2S audio +device snd_davbus # Apple DAVBUS audio +device snd_uaudio # USB Audio diff --git a/sys/powerpc/conf/GENERIC64 b/sys/powerpc/conf/GENERIC64 index ab4a0dc..7e385a1 100644 --- a/sys/powerpc/conf/GENERIC64 +++ b/sys/powerpc/conf/GENERIC64 @@ -175,7 +175,9 @@ device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) # Misc -device ad7417 # PowerMac7,2 temperature sensor +device iicbus # I2C bus code +device kiic # Keywest I2C +device ad7417 # PowerMac7,2 temperature sensor device ds1775 # PowerMac7,2 temperature sensor device fcu # Apple Fan Control Unit device max6690 # PowerMac7,2 temperature sensor @@ -186,7 +188,8 @@ device smu # Apple System Management Unit device adb device pmu -# Powermac I2C support -device iicbus # I2C bus code -device kiic # Keywest I2C +# Sound support +device sound # Generic sound driver (required) +device snd_ai2s # Apple I2S audio +device snd_uaudio # USB Audio diff --git a/sys/sparc64/conf/GENERIC b/sys/sparc64/conf/GENERIC index 406dcf6..11c5d35 100644 --- a/sys/sparc64/conf/GENERIC +++ b/sys/sparc64/conf/GENERIC @@ -61,13 +61,13 @@ options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options MAC # TrustedBSD MAC Framework -options INCLUDE_CONFIG_FILE # Include this file in kernel +options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging for use in -current options KDB # Enable kernel debugger support. options DDB # Support DDB. options GDB # Support remote GDB. 
-options DEADLKRES # Enable the deadlock resolver +options DEADLKRES # Enable the deadlock resolver options INVARIANTS # Enable calls of extra sanity checking options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles @@ -266,3 +266,10 @@ device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (RFC 2734,3146) device dcons # Dumb console driver device dcons_crom # Configuration ROM for dcons + +# Sound support +device sound # Generic sound driver (required) +device snd_audiocs # Crystal Semiconductor CS4231 +device snd_es137x # Ensoniq AudioPCI ES137x +device snd_t4dwave # Acer Labs M5451 +device snd_uaudio # USB Audio diff --git a/sys/sparc64/include/smp.h b/sys/sparc64/include/smp.h index 1ba0d9e..a519e01 100644 --- a/sys/sparc64/include/smp.h +++ b/sys/sparc64/include/smp.h @@ -1,5 +1,6 @@ /*- * Copyright (c) 2001 Jake Burkholder. + * Copyright (c) 2007 - 2011 Marius Strobl <marius@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -136,8 +137,11 @@ extern char tl_ipi_tlb_range_demap[]; static __inline void ipi_all_but_self(u_int ipi) { + cpuset_t cpus; - cpu_ipi_selected(PCPU_GET(other_cpus), 0, (u_long)tl_ipi_level, ipi); + cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &cpus); + cpu_ipi_selected(cpus, 0, (u_long)tl_ipi_level, ipi); } static __inline void @@ -167,8 +171,9 @@ ipi_dcache_page_inval(void *func, vm_paddr_t pa) ica = &ipi_cache_args; mtx_lock_spin(&ipi_mtx); ica->ica_mask = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &ica->ica_mask); ica->ica_pa = pa; - cpu_ipi_selected(PCPU_GET(other_cpus), 0, (u_long)func, (u_long)ica); + cpu_ipi_selected(ica->ica_mask, 0, (u_long)func, (u_long)ica); return (&ica->ica_mask); } @@ -183,8 +188,9 @@ ipi_icache_page_inval(void *func, vm_paddr_t pa) ica = &ipi_cache_args; mtx_lock_spin(&ipi_mtx); ica->ica_mask = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &ica->ica_mask); ica->ica_pa = pa; - cpu_ipi_selected(PCPU_GET(other_cpus), 0, (u_long)func, (u_long)ica); + cpu_ipi_selected(ica->ica_mask, 0, (u_long)func, (u_long)ica); return (&ica->ica_mask); } @@ -198,8 +204,7 @@ ipi_rd(u_int cpu, void *func, u_long *val) sched_pin(); ira = &ipi_rd_args; mtx_lock_spin(&ipi_mtx); - ira->ira_mask = PCPU_GET(cpumask); - CPU_SET(cpu, &ira->ira_mask); + CPU_SETOF(cpu, &ira->ira_mask); ira->ira_val = val; cpu_ipi_single(cpu, 0, (u_long)func, (u_long)ira); return (&ira->ira_mask); @@ -215,7 +220,8 @@ ipi_tlb_context_demap(struct pmap *pm) return (NULL); sched_pin(); cpus = pm->pm_active; - CPU_AND(&cpus, PCPU_PTR(other_cpus)); + CPU_AND(&cpus, &all_cpus); + CPU_CLR(PCPU_GET(cpuid), &cpus); if (CPU_EMPTY(&cpus)) { sched_unpin(); return (NULL); @@ -223,7 +229,6 @@ ipi_tlb_context_demap(struct pmap *pm) ita = &ipi_tlb_args; mtx_lock_spin(&ipi_mtx); ita->ita_mask = cpus; - CPU_OR(&ita->ita_mask, PCPU_PTR(cpumask)); ita->ita_pmap = pm; cpu_ipi_selected(cpus, 0, (u_long)tl_ipi_tlb_context_demap, (u_long)ita); @@ -240,7 +245,8 @@ ipi_tlb_page_demap(struct pmap *pm, vm_offset_t va) return (NULL); sched_pin(); cpus = pm->pm_active; - CPU_AND(&cpus, PCPU_PTR(other_cpus)); + CPU_AND(&cpus, &all_cpus); + CPU_CLR(PCPU_GET(cpuid), &cpus); if (CPU_EMPTY(&cpus)) { sched_unpin(); return (NULL); @@ -248,7 +254,6 @@ ipi_tlb_page_demap(struct pmap *pm, vm_offset_t va) ita = &ipi_tlb_args; mtx_lock_spin(&ipi_mtx); ita->ita_mask = cpus; - CPU_OR(&ita->ita_mask, PCPU_PTR(cpumask)); ita->ita_pmap = pm; ita->ita_va = 
va; cpu_ipi_selected(cpus, 0, (u_long)tl_ipi_tlb_page_demap, (u_long)ita); @@ -265,7 +270,8 @@ ipi_tlb_range_demap(struct pmap *pm, vm_offset_t start, vm_offset_t end) return (NULL); sched_pin(); cpus = pm->pm_active; - CPU_AND(&cpus, PCPU_PTR(other_cpus)); + CPU_AND(&cpus, &all_cpus); + CPU_CLR(PCPU_GET(cpuid), &cpus); if (CPU_EMPTY(&cpus)) { sched_unpin(); return (NULL); @@ -273,7 +279,6 @@ ipi_tlb_range_demap(struct pmap *pm, vm_offset_t start, vm_offset_t end) ita = &ipi_tlb_args; mtx_lock_spin(&ipi_mtx); ita->ita_mask = cpus; - CPU_OR(&ita->ita_mask, PCPU_PTR(cpumask)); ita->ita_pmap = pm; ita->ita_start = start; ita->ita_end = end; @@ -288,7 +293,6 @@ ipi_wait(void *cookie) volatile cpuset_t *mask; if ((mask = cookie) != NULL) { - CPU_NAND_ATOMIC(mask, PCPU_PTR(cpumask)); while (!CPU_EMPTY(mask)) ; mtx_unlock_spin(&ipi_mtx); diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 0c7ed41..08e1582 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -280,6 +280,9 @@ struct cdev *make_dev_alias(struct cdev *_pdev, const char *_fmt, ...) __printflike(2, 3); int make_dev_alias_p(int _flags, struct cdev **_cdev, struct cdev *_pdev, const char *_fmt, ...) __printflike(4, 5); +int make_dev_physpath_alias(int _flags, struct cdev **_cdev, + struct cdev *_pdev, struct cdev *_old_alias, + const char *_physpath); void dev_lock(void); void dev_unlock(void); void setconf(void); diff --git a/sys/sys/disk.h b/sys/sys/disk.h index ba25c89..112eed0 100644 --- a/sys/sys/disk.h +++ b/sys/sys/disk.h @@ -116,4 +116,12 @@ void disk_err(struct bio *bp, const char *what, int blkdone, int nl); * This should be a multiple of the sector size. */ +#define DIOCGPHYSPATH _IOR('d', 141, char[MAXPATHLEN]) + /* + * Get a string defining the physical path for a given provider. + * This has similar rules to ident, but is intended to uniquely + * identify the physical location of the device, not the current + * occupant of that location. + */ + #endif /* _SYS_DISK_H_ */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 4d7b540..c54a956 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -503,6 +503,8 @@ struct proc { /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ + int p_dbg_child; /* (c + e) # of debugged children in + ptrace. */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. 
*/ diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 685b9e1..7d7866c 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -116,7 +116,6 @@ static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int, static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, int, int); -static void ffs_fserr(struct fs *, ino_t, char *); static ufs2_daddr_t ffs_hashalloc (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, @@ -223,7 +222,7 @@ nospace: goto retry; } UFS_UNLOCK(ump); - if (ppsratecheck(&lastfail, &curfail, 1)) { + if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); @@ -391,7 +390,7 @@ retry: bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, - ip->i_number, NULL); + ip->i_number, vp->v_type, NULL); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) @@ -432,7 +431,7 @@ nospace: UFS_UNLOCK(ump); if (bp) brelse(bp); - if (ppsratecheck(&lastfail, &curfail, 1)) { + if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); @@ -671,7 +670,7 @@ ffs_reallocblks_ufs1(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number, NULL); + fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -879,7 +878,7 @@ ffs_reallocblks_ufs2(ap) if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ip->i_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), - fs->fs_bsize, ip->i_number, NULL); + fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, @@ -1881,7 +1880,7 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); - panic("ffs_blkfree: bad size"); + panic("ffs_blkfree_cg: bad size"); } #endif if ((u_int)bno >= fs->fs_size) { @@ -1915,7 +1914,7 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) } printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); - panic("ffs_blkfree: freeing free block"); + panic("ffs_blkfree_cg: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); @@ -1938,7 +1937,7 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)(bno + i), fs->fs_fsmnt); - panic("ffs_blkfree: freeing free frag"); + panic("ffs_blkfree_cg: freeing free frag"); } setbit(blksfree, cgbno + i); } @@ -2014,13 +2013,14 @@ ffs_blkfree_trim_completed(bip) } void -ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd) +ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + enum vtype vtype; struct workhead *dephd; { struct mount *mp; @@ -2035,7 +2035,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd) */ if (devvp->v_type != VREG && (devvp->v_vflag & 
VV_COPYONWRITE) && - ffs_snapblkfree(fs, devvp, bno, size, inum)) { + ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { return; } if (!ump->um_candelete) { @@ -2335,7 +2335,7 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz) * The form of the error message is: * fs: error message */ -static void +void ffs_fserr(fs, inum, cp) struct fs *fs; ino_t inum; @@ -2572,7 +2572,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, ROOTINO, NULL); + blksize * fs->fs_fsize, ROOTINO, VDIR, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 4775768..63a4eba 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -105,6 +105,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; int unwindidx = -1; int saved_inbdflush; + static struct timeval lastfail; + static int curfail; int reclaimed; ip = VTOI(vp); @@ -308,6 +310,11 @@ retry: UFS_UNLOCK(ump); goto retry; } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } goto fail; } nb = newb; @@ -370,6 +377,11 @@ retry: UFS_UNLOCK(ump); goto retry; } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } goto fail; } nb = newb; @@ -494,7 +506,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number, NULL); + ip->i_number, vp->v_type, NULL); } return (error); } @@ -523,6 +535,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, int deallocated, osize, nsize, num, i, error; int unwindidx = -1; int saved_inbdflush; + static struct timeval lastfail; + static int curfail; int reclaimed; ip = VTOI(vp); @@ -836,6 +850,11 @@ retry: UFS_UNLOCK(ump); goto retry; } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } goto fail; } nb = newb; @@ -898,6 +917,11 @@ retry: UFS_UNLOCK(ump); goto retry; } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } goto fail; } nb = newb; @@ -1028,7 +1052,7 @@ fail: */ for (blkp = allociblk; blkp < allocblk; blkp++) { ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize, - ip->i_number, NULL); + ip->i_number, vp->v_type, NULL); } return (error); } diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index 58d6121..fb1b1fb 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -33,6 +33,7 @@ #ifndef _UFS_FFS_EXTERN_H #define _UFS_FFS_EXTERN_H +enum vtype; struct buf; struct cg; struct fid; @@ -57,7 +58,7 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size, struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, - ufs2_daddr_t, long, ino_t, struct workhead *); + ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct 
inode *, ufs_lbn_t, int, ufs2_daddr_t *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); @@ -69,6 +70,7 @@ int ffs_flushfiles(struct mount *, int, struct thread *); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, int, struct workhead *); +void ffs_fserr(struct fs *, ino_t, char *); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); @@ -80,12 +82,14 @@ int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, int, int, struct ucred *, struct buf **); int ffs_sbupdate(struct ufsmount *, int, int); void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); -int ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t); +int ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, + enum vtype, struct workhead *); void ffs_snapremove(struct vnode *vp); int ffs_snapshot(struct mount *mp, char *snapfile); void ffs_snapshot_mount(struct mount *mp); void ffs_snapshot_unmount(struct mount *mp); void process_deferred_inactive(struct mount *mp); +void ffs_sync_snap(struct mount *, int); int ffs_syncvnode(struct vnode *vp, int waitfor); int ffs_truncate(struct vnode *, off_t, int, struct ucred *, struct thread *); int ffs_update(struct vnode *, int); @@ -149,6 +153,9 @@ int softdep_prealloc(struct vnode *, int); int softdep_journal_lookup(struct mount *, struct vnode **); void softdep_journal_freeblocks(struct inode *, struct ucred *, off_t, int); void softdep_journal_fsync(struct inode *); +void softdep_buf_append(struct buf *, struct workhead *); +void softdep_inode_append(struct inode *, struct ucred *, struct workhead *); +void softdep_freework(struct workhead *); /* @@ -161,4 +168,16 @@ void softdep_journal_fsync(struct inode *); int ffs_rdonly(struct inode *); +#ifdef _KERNEL +TAILQ_HEAD(snaphead, inode); + +struct snapdata { + LIST_ENTRY(snapdata) sn_link; + struct snaphead sn_head; + daddr_t sn_listsize; + daddr_t *sn_blklist; + struct lock sn_lock; +}; +#endif /* _KERNEL */ + #endif /* !_UFS_FFS_EXTERN_H */ diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 67e91c7..a7b43e2 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -235,7 +235,8 @@ ffs_truncate(vp, length, flags, cred, td) if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i], - sblksize(fs, osize, i), ip->i_number, NULL); + sblksize(fs, osize, i), ip->i_number, + vp->v_type, NULL); } } } @@ -435,7 +436,8 @@ ffs_truncate(vp, length, flags, cred, td) if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ip->i_devvp, bn, - fs->fs_bsize, ip->i_number, NULL); + fs->fs_bsize, ip->i_number, + vp->v_type, NULL); blocksreleased += nblocks; } } @@ -455,7 +457,7 @@ ffs_truncate(vp, length, flags, cred, td) DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number, - NULL); + vp->v_type, NULL); blocksreleased += btodb(bsize); } if (lastblock < 0) @@ -487,7 +489,7 @@ ffs_truncate(vp, length, flags, cred, td) */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ip->i_devvp, bn, - oldspace - newspace, ip->i_number, NULL); + oldspace - newspace, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(oldspace - newspace); } } @@ -634,7 +636,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) blocksreleased += blkcount; } 
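The FFS hunks in this change thread the owning vnode's type through the block-freeing path: ffs_blkfree() and ffs_snapblkfree() gain an enum vtype argument so the snapshot code can treat directory (metadata) blocks differently from regular-file data when deciding whether copied blocks must be written synchronously. The following is only a restatement of the calling convention visible in this diff, not additional code from the commit; the prototype comes from the ffs_extern.h hunk and the call from the ffs_truncate() hunk, and it is kernel-internal context rather than a standalone example.

/* Updated prototype (sys/ufs/ffs/ffs_extern.h in this patch): */
void	ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
	    ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);

/* Callers now pass the type of the vnode that owned the blocks, e.g.: */
ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
    vp->v_type, NULL);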
ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize, - ip->i_number, NULL); + ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; } diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index 968be8a..8d236bd 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -81,12 +81,14 @@ ffs_snapshot(mp, snapfile) } int -ffs_snapblkfree(fs, devvp, bno, size, inum) +ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + enum vtype vtype; + struct workhead *wkhd; { return (EINVAL); } @@ -123,19 +125,16 @@ ffs_copyonwrite(devvp, bp) return (EINVAL); } +void +ffs_sync_snap(mp, waitfor) + struct mount *mp; + int waitfor; +{ +} + #else FEATURE(ffs_snapshot, "FFS snapshot support"); -TAILQ_HEAD(snaphead, inode); - -struct snapdata { - LIST_ENTRY(snapdata) sn_link; - struct snaphead sn_head; - daddr_t sn_listsize; - daddr_t *sn_blklist; - struct lock sn_lock; -}; - LIST_HEAD(, snapdata) snapfree; static struct mtx snapfree_lock; MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); @@ -176,8 +175,10 @@ static int ffs_bp_snapblk(struct vnode *, struct buf *); * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed - * penalty that this imposes, the following flag allows this - * crash persistence to be disabled. + * penalty that this imposes, the code normally only ensures + * persistence for the filesystem metadata contained within a + * snapshot. Setting the following flag allows this crash + * persistence to be enabled for file contents. */ int dopersistence = 0; @@ -584,7 +585,7 @@ loop: if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, - NULL); + xvp->v_type, NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } @@ -1247,7 +1248,8 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, + vp->v_type, NULL); } return (0); } @@ -1530,7 +1532,8 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); - ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL); + ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, + vp->v_type, NULL); } return (0); } @@ -1635,7 +1638,7 @@ ffs_snapremove(vp) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, - ip->i_number))) { + ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); @@ -1660,7 +1663,8 @@ ffs_snapremove(vp) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, - fs->fs_bsize, ip->i_number))) { + fs->fs_bsize, ip->i_number, vp->v_type, + NULL))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; @@ -1674,7 +1678,7 @@ ffs_snapremove(vp) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, - fs->fs_bsize, ip->i_number))) { + fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { 
ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } @@ -1722,12 +1726,14 @@ ffs_snapremove(vp) * must always have been allocated from a BLK_NOCOPY location. */ int -ffs_snapblkfree(fs, devvp, bno, size, inum) +ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; + enum vtype vtype; + struct workhead *wkhd; { struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; @@ -1825,6 +1831,17 @@ retry: "Grabonremove: snapino", ip->i_number, (intmax_t)lbn, inum); #endif + /* + * If journaling is tracking this write we must add + * the work to the inode or indirect being written. + */ + if (wkhd != NULL) { + if (lbn < NDADDR) + softdep_inode_append(ip, + curthread->td_ucred, wkhd); + else + softdep_buf_append(ibp, wkhd); + } if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (ip->i_ump->um_fstype == UFS1) { @@ -1864,12 +1881,16 @@ retry: * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, - * to ensure their integrity. + * to ensure their integrity. At a minimum we ensure the + * integrity of the filesystem metadata, but use the + * dopersistence sysctl-setable flag to decide on the + * persistence needed for file content data. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); - if (dopersistence && ip->i_effnlink > 0) + if ((vtype == VDIR || dopersistence) && + ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); continue; } @@ -1879,7 +1900,8 @@ retry: if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); - if (dopersistence && ip->i_effnlink > 0) + if ((vtype == VDIR || dopersistence) && + ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); break; } @@ -1888,12 +1910,15 @@ retry: /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after - * a crash, to ensure their integrity. + * a crash, to ensure their integrity. At a minimum we + * ensure the integrity of the filesystem metadata, but + * use the dopersistence sysctl-setable flag to decide on + * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); - if (dopersistence && VTOI(vp)->i_effnlink > 0) + if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); } /* @@ -1902,6 +1927,8 @@ retry: * not be freed. Although space will be lost, the snapshot * will stay consistent. */ + if (error != 0 && wkhd != NULL) + softdep_freework(wkhd); lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (error); } @@ -2346,12 +2373,16 @@ ffs_copyonwrite(devvp, bp) * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, - * to ensure their integrity. + * to ensure their integrity. At a minimum we ensure the + * integrity of the filesystem metadata, but use the + * dopersistence sysctl-setable flag to decide on the + * persistence needed for file content data. 
*/ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); - if (dopersistence && ip->i_effnlink > 0) + if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || + dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; @@ -2363,7 +2394,8 @@ ffs_copyonwrite(devvp, bp) if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); - if (dopersistence && ip->i_effnlink > 0) + if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || + dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; @@ -2374,12 +2406,16 @@ ffs_copyonwrite(devvp, bp) /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after - * a crash, to ensure their integrity. + * a crash, to ensure their integrity. At a minimum we + * ensure the integrity of the filesystem metadata, but + * use the dopersistence sysctl-setable flag to decide on + * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); - if (dopersistence && VTOI(vp)->i_effnlink > 0) + if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || + dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; @@ -2400,6 +2436,42 @@ ffs_copyonwrite(devvp, bp) } /* + * sync snapshots to force freework records waiting on snapshots to claim + * blocks to free. + */ +void +ffs_sync_snap(mp, waitfor) + struct mount *mp; + int waitfor; +{ + struct snapdata *sn; + struct vnode *devvp; + struct vnode *vp; + struct inode *ip; + + devvp = VFSTOUFS(mp)->um_devvp; + if ((devvp->v_vflag & VV_COPYONWRITE) == 0) + return; + for (;;) { + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL) { + VI_UNLOCK(devvp); + return; + } + if (lockmgr(&sn->sn_lock, + LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, + VI_MTX(devvp)) == 0) + break; + } + TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { + vp = ITOV(ip); + ffs_syncvnode(vp, waitfor); + } + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); +} + +/* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). 
*/ diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index c125ee3..3734a5d 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -584,6 +584,33 @@ softdep_get_depcounts(struct mount *mp, *softdepactiveaccp = 0; } +void +softdep_buf_append(bp, wkhd) + struct buf *bp; + struct workhead *wkhd; +{ + + panic("softdep_buf_appendwork called"); +} + +void +softdep_inode_append(ip, cred, wkhd) + struct inode *ip; + struct ucred *cred; + struct workhead *wkhd; +{ + + panic("softdep_inode_appendwork called"); +} + +void +softdep_freework(wkhd) + struct workhead *wkhd; +{ + + panic("softdep_freework called"); +} + #else FEATURE(softupdates, "FFS soft-updates support"); @@ -867,7 +894,7 @@ static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); static void handle_workitem_indirblk(struct freework *); -static void handle_written_freework(struct freework *, int); +static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, struct workhead *); @@ -1632,6 +1659,7 @@ process_truncates(vp) if (cgwait) { FREE_LOCK(&lk); sync_cgs(mp, MNT_WAIT); + ffs_sync_snap(mp, MNT_WAIT); ACQUIRE_LOCK(&lk); continue; } @@ -2386,8 +2414,15 @@ softdep_unmount(mp) struct mount *mp; { - if (mp->mnt_kern_flag & MNTK_SUJ) - journal_unmount(mp); + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_SOFTDEP; + if ((mp->mnt_kern_flag & MNTK_SUJ) == 0) { + MNT_IUNLOCK(mp); + return; + } + mp->mnt_kern_flag &= ~MNTK_SUJ; + MNT_IUNLOCK(mp); + journal_unmount(mp); } struct jblocks { @@ -5137,6 +5172,7 @@ newfreefrag(ip, blkno, size, lbn) freefrag->ff_state = ATTACHED; LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; + freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; @@ -5181,7 +5217,7 @@ handle_workitem_freefrag(freefrag) } FREE_LOCK(&lk); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, - freefrag->ff_fragsize, freefrag->ff_inum, &wkhd); + freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(&lk); @@ -5689,6 +5725,7 @@ newfreeblks(mp, ip) freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_inum = ip->i_number; + freeblks->fb_vtype = ITOV(ip)->v_type; freeblks->fb_modrev = DIP(ip, i_modrev); freeblks->fb_devvp = ip->i_devvp; freeblks->fb_chkcnt = 0; @@ -5915,7 +5952,7 @@ complete_trunc_indir(freework) */ if (bp == NULL) { if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) - handle_written_freework(freework, 0); + handle_written_freework(freework); else WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, &freework->fw_list); @@ -5967,7 +6004,7 @@ out: */ if (totblks > datablocks) return (0); - return (totblks - datablocks); + return (datablocks - totblks); } /* @@ -7221,20 +7258,21 @@ freework_freeblock(freework) cancel_jnewblk(jnewblk, &wkhd); needj = 0; } else if (needj) { + freework->fw_state |= DELAYEDFREE; freeblks->fb_cgwait++; WORKLIST_INSERT(&wkhd, &freework->fw_list); } freeblks->fb_freecnt += btodb(bsize); FREE_LOCK(&lk); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, - freeblks->fb_inum, &wkhd); + freeblks->fb_inum, freeblks->fb_vtype, &wkhd); ACQUIRE_LOCK(&lk); /* * The jnewblk will be discarded and the bits in the map never * 
made it to disk. We can immediately free the freeblk. */ if (needj == 0) - handle_written_freework(freework, 0); + handle_written_freework(freework); } /* @@ -7249,7 +7287,8 @@ freework_enqueue(freework) struct freeblks *freeblks; freeblks = freework->fw_freeblks; - WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); + if ((freework->fw_state & INPROGRESS) == 0) + WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); if ((freeblks->fb_state & (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) @@ -7275,13 +7314,14 @@ handle_workitem_indirblk(freework) ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; if (freework->fw_state & DEPCOMPLETE) { - handle_written_freework(freework, 0); + handle_written_freework(freework); return; } if (freework->fw_off == NINDIR(fs)) { freework_freeblock(freework); return; } + freework->fw_state |= INPROGRESS; FREE_LOCK(&lk); indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), freework->fw_lbn); @@ -7294,16 +7334,16 @@ handle_workitem_indirblk(freework) * the freeblks is added back to the worklist if there is more work to do. */ static void -handle_written_freework(freework, cgwrite) +handle_written_freework(freework) struct freework *freework; - int cgwrite; { struct freeblks *freeblks; struct freework *parent; freeblks = freework->fw_freeblks; parent = freework->fw_parent; - freeblks->fb_cgwait -= cgwrite; + if (freework->fw_state & DELAYEDFREE) + freeblks->fb_cgwait--; freework->fw_state |= COMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); @@ -7545,6 +7585,8 @@ indir_trunc(freework, dbn, lbn) return; } ACQUIRE_LOCK(&lk); + /* Protects against a race with complete_trunc_indir(). */ + freework->fw_state &= ~INPROGRESS; /* * If we have an indirdep we need to enforce the truncation order * and discard it when it is complete. @@ -7629,7 +7671,8 @@ indir_trunc(freework, dbn, lbn) freedeps++; } ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, - fs->fs_bsize, freeblks->fb_inum, &wkhd); + fs->fs_bsize, freeblks->fb_inum, + freeblks->fb_vtype, &wkhd); } } if (goingaway) { @@ -7662,13 +7705,13 @@ indir_trunc(freework, dbn, lbn) fs_pendingblocks += nblocks; dbn = dbtofsb(fs, dbn); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, - freeblks->fb_inum, NULL); + freeblks->fb_inum, freeblks->fb_vtype, NULL); /* Non SUJ softdep does single-threaded truncations. */ freeblks->fb_freecnt += fs_pendingblocks; if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; ACQUIRE_LOCK(&lk); - handle_written_freework(freework, 0); + handle_written_freework(freework); FREE_LOCK(&lk); } return; @@ -10361,8 +10404,7 @@ softdep_disk_write_complete(bp) continue; case D_FREEWORK: - /* Freework on an indirect block, not bmsafemap. 
*/ - handle_written_freework(WK_FREEWORK(wk), 0); + handle_written_freework(WK_FREEWORK(wk)); break; case D_JSEGDEP: @@ -10378,6 +10420,10 @@ softdep_disk_write_complete(bp) WORKLIST_INSERT(&reattach, wk); continue; + case D_FREEDEP: + free_freedep(WK_FREEDEP(wk)); + continue; + default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); @@ -10533,7 +10579,7 @@ handle_jwork(wkhd) free_freedep(WK_FREEDEP(wk)); continue; case D_FREEWORK: - handle_written_freework(WK_FREEWORK(wk), 1); + handle_written_freework(WK_FREEWORK(wk)); continue; default: panic("handle_jwork: Unknown type %s\n", @@ -12731,6 +12777,53 @@ clear_inodedeps(td) } } +void +softdep_buf_append(bp, wkhd) + struct buf *bp; + struct workhead *wkhd; +{ + struct worklist *wk; + + ACQUIRE_LOCK(&lk); + while ((wk = LIST_FIRST(wkhd)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&bp->b_dep, wk); + } + FREE_LOCK(&lk); + +} + +void +softdep_inode_append(ip, cred, wkhd) + struct inode *ip; + struct ucred *cred; + struct workhead *wkhd; +{ + struct buf *bp; + struct fs *fs; + int error; + + fs = ip->i_fs; + error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, cred, &bp); + if (error) { + softdep_freework(wkhd); + return; + } + softdep_buf_append(bp, wkhd); + bqrelse(bp); +} + +void +softdep_freework(wkhd) + struct workhead *wkhd; +{ + + ACQUIRE_LOCK(&lk); + handle_jwork(wkhd); + FREE_LOCK(&lk); +} + /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index c7e0bd1..b0f2d7e 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -269,6 +269,8 @@ ffs_mount(struct mount *mp) vfs_write_resume(mp); return (error); } + if (mp->mnt_flag & MNT_SOFTDEP) + softdep_unmount(mp); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index 9be175c..80c7315 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -511,6 +511,7 @@ struct freefrag { ufs2_daddr_t ff_blkno; /* fragment physical block number */ long ff_fragsize; /* size of fragment being deleted */ ino_t ff_inum; /* owning inode number */ + enum vtype ff_vtype; /* owning inode's file type */ }; /* @@ -538,6 +539,7 @@ struct freeblks { ufs2_daddr_t fb_chkcnt; /* Expected blks released. */ ufs2_daddr_t fb_freecnt; /* Actual blocks released. */ ino_t fb_inum; /* inode owner of blocks */ + enum vtype fb_vtype; /* inode owner's file type */ uid_t fb_uid; /* uid of previous owner of blocks */ int fb_ref; /* Children outstanding. */ int fb_cgwait; /* cg writes outstanding. 
*/ diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 34b1758..733413d 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1838,6 +1838,8 @@ ufs_mkdir(ap) #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); @@ -1850,6 +1852,8 @@ ufs_mkdir(ap) #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); @@ -2608,6 +2612,8 @@ ufs_makeinode(mode, dvp, vpp, cnp) #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); @@ -2620,6 +2626,8 @@ ufs_makeinode(mode, dvp, vpp, cnp) #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index e2758ec..21c7080 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2636,6 +2636,23 @@ vm_page_cowsetup(vm_page_t m) return (0); } +#ifdef INVARIANTS +void +vm_page_object_lock_assert(vm_page_t m) +{ + + /* + * Certain of the page's fields may only be modified by the + * holder of the containing object's lock or the setter of the + * page's VPO_BUSY flag. Unfortunately, the setter of the + * VPO_BUSY flag is not recorded, and thus cannot be checked + * here. + */ + if (m->object != NULL && (m->oflags & VPO_BUSY) == 0) + VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); +} +#endif + #include "opt_ddb.h" #ifdef DDB #include <sys/kernel.h> diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index c34d2f0..898857b 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -383,6 +383,13 @@ void vm_page_cowfault (vm_page_t); int vm_page_cowsetup(vm_page_t); void vm_page_cowclear (vm_page_t); +#ifdef INVARIANTS +void vm_page_object_lock_assert(vm_page_t m); +#define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m) +#else +#define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0 +#endif + /* * vm_page_sleep_if_busy: * @@ -412,6 +419,8 @@ vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg) static __inline void vm_page_undirty(vm_page_t m) { + + VM_PAGE_OBJECT_LOCK_ASSERT(m); m->dirty = 0; } diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 98a5be0..a8eca20 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -1195,8 +1195,13 @@ vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written) { + vm_object_t obj; int i, pos; + if (written == 0) + return; + obj = ma[0]->object; + VM_OBJECT_LOCK(obj); for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) { if (pos < trunc_page(written)) { rtvals[i] = VM_PAGER_OK; @@ -1207,4 +1212,5 @@ vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written) vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } + VM_OBJECT_UNLOCK(obj); } |
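
The remaining hunks are easier to reason about with a few small, self-contained sketches. Everything that follows is illustrative commentary on this merge rather than code from the tree: the programs are userland models, and any identifier that does not appear in the diff itself is hypothetical. First, the ffs_snapshot.c changes thread the owning vnode's type through ffs_blkfree() and ffs_snapblkfree() so the snapshot code can tell filesystem metadata from file contents; the rewritten comments describe the resulting policy, modeled below: a copied snapshot block is written synchronously only when it belongs to a directory or when the dopersistence sysctl is set, and only while the inode is still linked.

/*
 * Simplified, userland model of the persistence rule added in this
 * merge.  snap_must_sync() and the local enum are stand-ins for
 * illustration only; they are not kernel interfaces.
 */
#include <assert.h>
#include <stdbool.h>

enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO };

static int dopersistence = 0;           /* sysctl-settable knob in the kernel */

static bool
snap_must_sync(enum vtype vtype, int effnlink)
{

        return ((vtype == VDIR || dopersistence) && effnlink > 0);
}

int
main(void)
{

        assert(snap_must_sync(VDIR, 1));        /* directory data: always sync */
        assert(!snap_must_sync(VREG, 1));       /* file data: only if knob set */
        dopersistence = 1;
        assert(snap_must_sync(VREG, 1));
        assert(!snap_must_sync(VREG, 0));       /* unlinked snapshot: never */
        return (0);
}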
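
ffs_snapblkfree() also gains a struct workhead argument so that, when soft updates journaling is tracking the free, the pending work items can be attached to the inode block or indirect block that ends up owning the reclaimed block (softdep_inode_append()/softdep_buf_append()), or discarded through softdep_freework() when the copy fails. Below is a minimal userland sketch of that hand-off using the same sys/queue.h list idiom; the structures are reduced stand-ins, not the kernel's. softdep_inode_append() in the diff does the same list move after bread()ing the inode's block, so the work rides out to disk with that buffer.

/*
 * Sketch of moving journal work items onto the buffer that will carry
 * them to disk (success path) or freeing them (error path).
 */
#include <sys/queue.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct worklist {
        LIST_ENTRY(worklist) wk_list;
};
LIST_HEAD(workhead, worklist);

struct buf {
        struct workhead b_dep;          /* dependencies carried by this buf */
};

/* Move every item on wkhd onto the buffer's dependency list. */
static void
buf_append(struct buf *bp, struct workhead *wkhd)
{
        struct worklist *wk;

        while ((wk = LIST_FIRST(wkhd)) != NULL) {
                LIST_REMOVE(wk, wk_list);
                LIST_INSERT_HEAD(&bp->b_dep, wk, wk_list);
        }
}

/* Discard (here: free) every item when the copy-on-write failed. */
static void
free_work(struct workhead *wkhd)
{
        struct worklist *wk;

        while ((wk = LIST_FIRST(wkhd)) != NULL) {
                LIST_REMOVE(wk, wk_list);
                free(wk);
        }
}

int
main(void)
{
        struct workhead wkhd;
        struct buf bp;
        struct worklist *wk;

        LIST_INIT(&wkhd);
        LIST_INIT(&bp.b_dep);
        wk = calloc(1, sizeof(*wk));
        if (wk == NULL)
                return (1);

        LIST_INSERT_HEAD(&wkhd, wk, wk_list);
        buf_append(&bp, &wkhd);                 /* success path */
        assert(LIST_EMPTY(&wkhd) && !LIST_EMPTY(&bp.b_dep));

        LIST_REMOVE(wk, wk_list);               /* hand it back for the demo */
        LIST_INSERT_HEAD(&wkhd, wk, wk_list);
        free_work(&wkhd);                       /* error path */
        assert(LIST_EMPTY(&wkhd));
        printf("ok\n");
        return (0);
}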
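
One ffs_softdep.c hunk is a plain sign fix: with the guard just above it returning 0 whenever totblks exceeds datablocks, the non-zero result of the block-count estimate must be datablocks - totblks, not the reverse. A trivial check of the corrected arithmetic (the surrounding function is not shown in full in the diff, so this models only the final two statements):

#include <assert.h>
#include <stdint.h>

typedef int64_t ufs2_daddr_t;

/* The tail of the estimate as it reads after the fix. */
static ufs2_daddr_t
blocks_released(ufs2_daddr_t totblks, ufs2_daddr_t datablocks)
{

        if (totblks > datablocks)
                return (0);
        return (datablocks - totblks);
}

int
main(void)
{

        assert(blocks_released(10, 25) == 15);  /* old code returned -15 */
        assert(blocks_released(30, 25) == 0);
        return (0);
}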
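
handle_written_freework() loses its cgwrite argument: instead of each caller saying whether the freework had been queued behind a cylinder-group write, freework_freeblock() now sets a DELAYEDFREE state bit when it queues the item, and the completion handler decrements fb_cgwait only when that bit is present. A compilable sketch of the accounting; the flag and field names follow the diff, but the structures are reduced to what the example needs:

#include <assert.h>

#define DELAYEDFREE     0x01
#define COMPLETE        0x02

struct freeblks {
        int fb_cgwait;                  /* cg writes outstanding */
};

struct freework {
        struct freeblks *fw_freeblks;
        int fw_state;
};

/* Queue the freework behind a pending cylinder-group write. */
static void
delay_behind_cgwrite(struct freework *fw)
{

        fw->fw_state |= DELAYEDFREE;
        fw->fw_freeblks->fb_cgwait++;
}

/* Completion: only delayed items account against fb_cgwait. */
static void
handle_written_freework(struct freework *fw)
{

        if (fw->fw_state & DELAYEDFREE)
                fw->fw_freeblks->fb_cgwait--;
        fw->fw_state |= COMPLETE;
}

int
main(void)
{
        struct freeblks fb = { 0 };
        struct freework direct = { &fb, 0 }, delayed = { &fb, 0 };

        delay_behind_cgwrite(&delayed);
        handle_written_freework(&direct);       /* no cgwait accounting */
        handle_written_freework(&delayed);
        assert(fb.fb_cgwait == 0);
        return (0);
}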
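
The ufs_vnops.c hunks add softdep_revert_link() calls to the quota-failure paths of ufs_mkdir() and ufs_makeinode(), apparently so that the link bookkeeping soft updates recorded earlier for the just-allocated inode is undone before UFS_VFREE() releases it. The toy model below only illustrates why such paired setup/revert calls matter on an error path; the counter and the *_model() helpers are stand-ins for the real dependency records, not kernel functions:

#include <assert.h>
#include <stdbool.h>

static int pending_links;

static void softdep_setup_link_model(void)  { pending_links++; }
static void softdep_revert_link_model(void) { pending_links--; }

/* Returns false when the (simulated) quota check rejects the inode. */
static bool
makeinode_model(bool quota_ok)
{

        softdep_setup_link_model();
        if (!quota_ok) {
                softdep_revert_link_model();    /* the added error-path call */
                return (false);
        }
        return (true);
}

int
main(void)
{

        assert(makeinode_model(true));
        assert(!makeinode_model(false));
        assert(pending_links == 1);     /* only the successful create remains */
        return (0);
}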
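
vm_page_object_lock_assert() is a new INVARIANTS-only check that a page's fields are changed only while the containing object's lock is held, with the documented exception of a page whose VPO_BUSY flag is set (the busy owner is not recorded, so it cannot be checked). vm_page_undirty() now runs the assertion before clearing the dirty bits. A userland analogue, with the object lock reduced to a boolean and the kernel's INVARIANTS option replaced by a local PAGE_DEBUG define:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define PAGE_DEBUG      1               /* comment out to compile the check away */
#define VPO_BUSY        0x01

struct vm_object_model {
        bool locked;                    /* stand-in for the object mutex */
};

struct vm_page_model {
        struct vm_object_model *object;
        int oflags;
        int dirty;
};

#ifdef PAGE_DEBUG
#define PAGE_OBJECT_LOCK_ASSERT(m)                                      \
        assert((m)->object == NULL || ((m)->oflags & VPO_BUSY) != 0 ||  \
            (m)->object->locked)
#else
#define PAGE_OBJECT_LOCK_ASSERT(m)      (void)0
#endif

/* Mirrors the shape of the vm_page_undirty() change. */
static void
page_undirty(struct vm_page_model *m)
{

        PAGE_OBJECT_LOCK_ASSERT(m);
        m->dirty = 0;
}

int
main(void)
{
        struct vm_object_model obj = { .locked = true };
        struct vm_page_model m = { .object = &obj, .oflags = 0, .dirty = 0xff };

        page_undirty(&m);               /* lock held: permitted */
        assert(m.dirty == 0);
        return (0);
}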
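
Finally, vnode_pager_undirty_pages() now returns immediately when nothing was written and takes the object lock around the loop that clears dirty bits. The per-page arithmetic itself is unchanged: pages wholly below trunc_page(written) become fully clean (VM_PAGER_OK), and only the written prefix of the last, partially written page is cleared. A standalone model of that arithmetic, with the locking elided:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE       4096
#define PAGE_MASK       (PAGE_SIZE - 1)
#define trunc_page(x)   ((x) & ~PAGE_MASK)

int
main(void)
{
        int written = 10000;            /* 2 full pages + 1808 bytes */
        int i, pos;

        for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
                if (pos < trunc_page(written))
                        printf("page %d: fully clean\n", i);
                else
                        printf("page %d: clear first %d bytes only\n",
                            i, written & PAGE_MASK);
        }
        assert((written & PAGE_MASK) == 1808);
        return (0);
}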