Diffstat (limited to 'sys'): 78 files changed, 8974 insertions, 4324 deletions
diff --git a/sys/boot/common/gpt.c b/sys/boot/common/gpt.c index 62e86dd..9c00980 100644 --- a/sys/boot/common/gpt.c +++ b/sys/boot/common/gpt.c @@ -49,7 +49,7 @@ static int curent, bootonce; /* * Buffer below 64kB passed on gptread(), which can hold at least - * one sector od data (512 bytes). + * one sector of data (512 bytes). */ static char *secbuf; @@ -62,7 +62,7 @@ gptupdate(const char *which, struct dsk *dskp, struct gpt_hdr *hdr, /* * We need to update the following for both primary and backup GPT: - * 1. Sector on disk that contains curent partition. + * 1. Sector on disk that contains current partition. * 2. Partition table checksum. * 3. Header checksum. * 4. Header on disk. diff --git a/sys/boot/i386/boot2/boot2.c b/sys/boot/i386/boot2/boot2.c index f521fd7..307d4c5 100644 --- a/sys/boot/i386/boot2/boot2.c +++ b/sys/boot/i386/boot2/boot2.c @@ -348,7 +348,7 @@ load(void) return; p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); - memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms)); + *(uint32_t*)p = hdr.ex.a_syms; p += sizeof(hdr.ex.a_syms); if (hdr.ex.a_syms) { if (xfsread(ino, p, hdr.ex.a_syms)) @@ -385,7 +385,7 @@ load(void) if (xfsread(ino, &es, sizeof(es))) return; for (i = 0; i < 2; i++) { - memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size)); + *(Elf32_Word *)p = es[i].sh_size; p += sizeof(es[i].sh_size); fs_off = es[i].sh_offset; if (xfsread(ino, p, es[i].sh_size)) diff --git a/sys/conf/files b/sys/conf/files index 75f16e5..74f25c1 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3008,19 +3008,20 @@ xen/gnttab.c optional xen | xenhvm xen/features.c optional xen | xenhvm xen/evtchn/evtchn.c optional xen xen/evtchn/evtchn_dev.c optional xen | xenhvm -xen/reboot.c optional xen -xen/xenbus/xenbus_client.c optional xen | xenhvm -xen/xenbus/xenbus_comms.c optional xen | xenhvm -xen/xenbus/xenbus_dev.c optional xen | xenhvm xen/xenbus/xenbus_if.m optional xen | xenhvm -xen/xenbus/xenbus_probe.c optional xen | xenhvm -#xen/xenbus/xenbus_probe_backend.c optional xen -xen/xenbus/xenbus_xs.c optional xen | xenhvm +xen/xenbus/xenbus.c optional xen | xenhvm +xen/xenbus/xenbusb_if.m optional xen | xenhvm +xen/xenbus/xenbusb.c optional xen | xenhvm +xen/xenbus/xenbusb_front.c optional xen | xenhvm +xen/xenbus/xenbusb_back.c optional xen | xenhvm +xen/xenstore/xenstore.c optional xen | xenhvm +xen/xenstore/xenstore_dev.c optional xen | xenhvm dev/xen/balloon/balloon.c optional xen | xenhvm +dev/xen/blkfront/blkfront.c optional xen | xenhvm +dev/xen/blkback/blkback.c optional xen | xenhvm dev/xen/console/console.c optional xen dev/xen/console/xencons_ring.c optional xen -dev/xen/blkfront/blkfront.c optional xen | xenhvm +dev/xen/control/control.c optional xen | xenhvm dev/xen/netfront/netfront.c optional xen | xenhvm dev/xen/xenpci/xenpci.c optional xenpci dev/xen/xenpci/evtchn.c optional xenpci -dev/xen/xenpci/machine_reboot.c optional xenpci diff --git a/sys/dev/acpica/acpi_pci.c b/sys/dev/acpica/acpi_pci.c index bf7cf2e..a14e0ba 100644 --- a/sys/dev/acpica/acpi_pci.c +++ b/sys/dev/acpica/acpi_pci.c @@ -179,7 +179,7 @@ acpi_pci_set_powerstate_method(device_t dev, device_t child, int state) */ ACPI_SERIAL_BEGIN(pci_powerstate); old_state = pci_get_powerstate(child); - if (old_state < state) { + if (old_state < state && pci_do_power_suspend) { error = pci_set_powerstate_method(dev, child, state); if (error) goto out; diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c index 662713f..95de34c 100644 --- a/sys/dev/bge/if_bge.c +++ 
b/sys/dev/bge/if_bge.c @@ -374,6 +374,7 @@ static void bge_tick(void *); static void bge_stats_clear_regs(struct bge_softc *); static void bge_stats_update(struct bge_softc *); static void bge_stats_update_regs(struct bge_softc *); +static struct mbuf *bge_check_short_dma(struct mbuf *); static struct mbuf *bge_setup_tso(struct bge_softc *, struct mbuf *, uint16_t *); static int bge_encap(struct bge_softc *, struct mbuf **, uint32_t *); @@ -1692,6 +1693,11 @@ bge_blockinit(struct bge_softc *sc) bge_writembx(sc, BGE_MBX_RX_MINI_PROD_LO, 0); } + /* Choose de-pipeline mode for BCM5906 A1. */ + if (sc->bge_asicrev == BGE_ASICREV_BCM5906 && + sc->bge_chiprev == BGE_CHIPID_BCM5906_A1) + CSR_WRITE_4(sc, BGE_ISO_PKT_TX, + (CSR_READ_4(sc, BGE_ISO_PKT_TX) & ~3) | 2); /* * The BD ring replenish thresholds control how often the * hardware fetches new BD's from the producer rings in host @@ -2633,6 +2639,8 @@ bge_attach(device_t dev) case BGE_ASICREV_BCM5752: case BGE_ASICREV_BCM5906: sc->bge_flags |= BGE_FLAG_575X_PLUS; + if (sc->bge_asicrev == BGE_ASICREV_BCM5906) + sc->bge_flags |= BGE_FLAG_SHORT_DMA_BUG; /* FALLTHROUGH */ case BGE_ASICREV_BCM5705: sc->bge_flags |= BGE_FLAG_5705_PLUS; @@ -4060,6 +4068,39 @@ bge_cksum_pad(struct mbuf *m) } static struct mbuf * +bge_check_short_dma(struct mbuf *m) +{ + struct mbuf *n; + int found; + + /* + * If the device receives two back-to-back send BDs with less than + * or equal to 8 total bytes, then the device may hang. The two + * back-to-back send BDs must be in the same frame for this failure + * to occur. Scan the mbuf chain and see whether two back-to-back + * send BDs are there. If this is the case, allocate a new mbuf + * and copy the frame to work around the silicon bug. + */ + for (n = m, found = 0; n != NULL; n = n->m_next) { + if (n->m_len < 8) { + found++; + if (found > 1) + break; + continue; + } + found = 0; + } + + if (found > 1) { + n = m_defrag(m, M_DONTWAIT); + if (n == NULL) + m_freem(m); + } else + n = m; + return (n); +} + +static struct mbuf * bge_setup_tso(struct bge_softc *sc, struct mbuf *m, uint16_t *mss) { struct ip *ip; @@ -4132,6 +4173,13 @@ bge_encap(struct bge_softc *sc, struct mbuf **m_head, uint32_t *txidx) csum_flags = 0; mss = 0; vlan_tag = 0; + if ((sc->bge_flags & BGE_FLAG_SHORT_DMA_BUG) != 0 && + m->m_next != NULL) { + *m_head = bge_check_short_dma(m); + if (*m_head == NULL) + return (ENOBUFS); + m = *m_head; + } if ((m->m_pkthdr.csum_flags & CSUM_TSO) != 0) { *m_head = m = bge_setup_tso(sc, m, &mss); if (*m_head == NULL) @@ -4366,6 +4414,7 @@ bge_init_locked(struct bge_softc *sc) { struct ifnet *ifp; uint16_t *m; + uint32_t mode; BGE_LOCK_ASSERT(sc); @@ -4471,8 +4520,12 @@ bge_init_locked(struct bge_softc *sc) /* Init TX ring. */ bge_init_tx_ring(sc); + /* Enable TX MAC state machine lockup fix. */ + mode = CSR_READ_4(sc, BGE_TX_MODE); + if (BGE_IS_5755_PLUS(sc) || sc->bge_asicrev == BGE_ASICREV_BCM5906) + mode |= BGE_TXMODE_MBUF_LOCKUP_FIX; /* Turn on transmitter. */ - BGE_SETBIT(sc, BGE_TX_MODE, BGE_TXMODE_ENABLE); + CSR_WRITE_4(sc, BGE_TX_MODE, mode | BGE_TXMODE_ENABLE); /* Turn on receiver.
*/ BGE_SETBIT(sc, BGE_RX_MODE, BGE_RXMODE_ENABLE); diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h index 60a39f8..a4f3f83 100644 --- a/sys/dev/bge/if_bgereg.h +++ b/sys/dev/bge/if_bgereg.h @@ -765,6 +765,7 @@ #define BGE_TXMODE_FLOWCTL_ENABLE 0x00000010 #define BGE_TXMODE_BIGBACKOFF_ENABLE 0x00000020 #define BGE_TXMODE_LONGPAUSE_ENABLE 0x00000040 +#define BGE_TXMODE_MBUF_LOCKUP_FIX 0x00000100 /* Transmit MAC status register */ #define BGE_TXSTAT_RX_XOFFED 0x00000001 @@ -879,6 +880,7 @@ #define BGE_SDI_STATS_CTL 0x0C08 #define BGE_SDI_STATS_ENABLE_MASK 0x0C0C #define BGE_SDI_STATS_INCREMENT_MASK 0x0C10 +#define BGE_ISO_PKT_TX 0x0C20 #define BGE_LOCSTATS_COS0 0x0C80 #define BGE_LOCSTATS_COS1 0x0C84 #define BGE_LOCSTATS_COS2 0x0C88 @@ -2727,6 +2729,7 @@ struct bge_softc { #define BGE_FLAG_40BIT_BUG 0x01000000 #define BGE_FLAG_4G_BNDRY_BUG 0x02000000 #define BGE_FLAG_RX_ALIGNBUG 0x04000000 +#define BGE_FLAG_SHORT_DMA_BUG 0x08000000 uint32_t bge_phy_flags; #define BGE_PHY_WIRESPEED 0x00000001 #define BGE_PHY_ADC_BUG 0x00000002 diff --git a/sys/dev/iwi/if_iwi.c b/sys/dev/iwi/if_iwi.c index f5ba34f..62b53be 100644 --- a/sys/dev/iwi/if_iwi.c +++ b/sys/dev/iwi/if_iwi.c @@ -1356,7 +1356,7 @@ iwi_checkforqos(struct ieee80211vap *vap, wme = NULL; while (frm < efrm) { - IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1], return); + IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1], break); switch (*frm) { case IEEE80211_ELEMID_VENDOR: if (iswmeoui(frm)) @@ -1368,7 +1368,7 @@ iwi_checkforqos(struct ieee80211vap *vap, ni = vap->iv_bss; ni->ni_capinfo = capinfo; - ni->ni_associd = associd; + ni->ni_associd = associd & 0x3fff; if (wme != NULL) ni->ni_flags |= IEEE80211_NODE_QOS; else diff --git a/sys/dev/mfi/mfireg.h b/sys/dev/mfi/mfireg.h index 17ab4b3..e08a16d 100644 --- a/sys/dev/mfi/mfireg.h +++ b/sys/dev/mfi/mfireg.h @@ -975,7 +975,9 @@ enum mfi_pd_state { MFI_PD_STATE_OFFLINE = 0x10, MFI_PD_STATE_FAILED = 0x11, MFI_PD_STATE_REBUILD = 0x14, - MFI_PD_STATE_ONLINE = 0x18 + MFI_PD_STATE_ONLINE = 0x18, + MFI_PD_STATE_COPYBACK = 0x20, + MFI_PD_STATE_SYSTEM = 0x40 }; union mfi_ld_ref { diff --git a/sys/dev/mvs/mvs.c b/sys/dev/mvs/mvs.c index 11a8853..9968938 100644 --- a/sys/dev/mvs/mvs.c +++ b/sys/dev/mvs/mvs.c @@ -57,7 +57,8 @@ static int mvs_ch_deinit(device_t dev); static int mvs_ch_suspend(device_t dev); static int mvs_ch_resume(device_t dev); static void mvs_dmainit(device_t dev); -static void mvs_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error); +static void mvs_dmasetupc_cb(void *xsc, + bus_dma_segment_t *segs, int nsegs, int error); static void mvs_dmafini(device_t dev); static void mvs_slotsalloc(device_t dev); static void mvs_slotsfree(device_t dev); @@ -79,7 +80,8 @@ static void mvs_crbq_intr(device_t dev); static void mvs_begin_transaction(device_t dev, union ccb *ccb); static void mvs_legacy_execute_transaction(struct mvs_slot *slot); static void mvs_timeout(struct mvs_slot *slot); -static void mvs_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error); +static void mvs_dmasetprd(void *arg, + bus_dma_segment_t *segs, int nsegs, int error); static void mvs_requeue_frozen(device_t dev); static void mvs_execute_transaction(struct mvs_slot *slot); static void mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et); @@ -314,9 +316,11 @@ mvs_dmainit(device_t dev) if (bus_dmamem_alloc(ch->dma.workrq_tag, (void **)&ch->dma.workrq, 0, &ch->dma.workrq_map)) goto error; - if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map, ch->dma.workrq, - 
MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) || dcba.error) { - bus_dmamem_free(ch->dma.workrq_tag, ch->dma.workrq, ch->dma.workrq_map); + if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map, + ch->dma.workrq, MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) || + dcba.error) { + bus_dmamem_free(ch->dma.workrq_tag, + ch->dma.workrq, ch->dma.workrq_map); goto error; } ch->dma.workrq_bus = dcba.maddr; @@ -329,9 +333,11 @@ mvs_dmainit(device_t dev) if (bus_dmamem_alloc(ch->dma.workrp_tag, (void **)&ch->dma.workrp, 0, &ch->dma.workrp_map)) goto error; - if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map, ch->dma.workrp, - MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) || dcba.error) { - bus_dmamem_free(ch->dma.workrp_tag, ch->dma.workrp, ch->dma.workrp_map); + if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map, + ch->dma.workrp, MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) || + dcba.error) { + bus_dmamem_free(ch->dma.workrp_tag, + ch->dma.workrp, ch->dma.workrp_map); goto error; } ch->dma.workrp_bus = dcba.maddr; @@ -371,7 +377,8 @@ mvs_dmafini(device_t dev) } if (ch->dma.workrp_bus) { bus_dmamap_unload(ch->dma.workrp_tag, ch->dma.workrp_map); - bus_dmamem_free(ch->dma.workrp_tag, ch->dma.workrp, ch->dma.workrp_map); + bus_dmamem_free(ch->dma.workrp_tag, + ch->dma.workrp, ch->dma.workrp_map); ch->dma.workrp_bus = 0; ch->dma.workrp_map = NULL; ch->dma.workrp = NULL; @@ -382,7 +389,8 @@ mvs_dmafini(device_t dev) } if (ch->dma.workrq_bus) { bus_dmamap_unload(ch->dma.workrq_tag, ch->dma.workrq_map); - bus_dmamem_free(ch->dma.workrq_tag, ch->dma.workrq, ch->dma.workrq_map); + bus_dmamem_free(ch->dma.workrq_tag, + ch->dma.workrq, ch->dma.workrq_map); ch->dma.workrq_bus = 0; ch->dma.workrq_map = NULL; ch->dma.workrq = NULL; @@ -444,14 +452,16 @@ mvs_setup_edma_queues(device_t dev) ATA_OUTL(ch->r_mem, EDMA_REQQBAH, work >> 32); ATA_OUTL(ch->r_mem, EDMA_REQQIP, work & 0xffffffff); ATA_OUTL(ch->r_mem, EDMA_REQQOP, work & 0xffffffff); - bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map, BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map, + BUS_DMASYNC_PREWRITE); /* Reponses queue. */ - bzero(ch->dma.workrp, 256); + memset(ch->dma.workrp, 0xff, MVS_WORKRP_SIZE); work = ch->dma.workrp_bus; ATA_OUTL(ch->r_mem, EDMA_RESQBAH, work >> 32); ATA_OUTL(ch->r_mem, EDMA_RESQIP, work & 0xffffffff); ATA_OUTL(ch->r_mem, EDMA_RESQOP, work & 0xffffffff); - bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map, BUS_DMASYNC_PREREAD); + bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map, + BUS_DMASYNC_PREREAD); ch->out_idx = 0; ch->in_idx = 0; } @@ -678,20 +688,15 @@ mvs_ch_intr(void *data) int i, ccs, port = -1, selfdis = 0; int edma = (ch->numtslots != 0 || ch->numdslots != 0); -//device_printf(dev, "irq cause %02x EDMA %d IEC %08x\n", -// arg->cause, edma, ATA_INL(ch->r_mem, EDMA_IEC)); /* New item in response queue. */ if ((arg->cause & 2) && edma) mvs_crbq_intr(dev); /* Some error or special event. */ if (arg->cause & 1) { iec = ATA_INL(ch->r_mem, EDMA_IEC); -//device_printf(dev, "irq cause %02x EDMA %d IEC %08x\n", -// arg->cause, edma, iec); if (iec & EDMA_IE_SERRINT) { serr = ATA_INL(ch->r_mem, SATA_SE); ATA_OUTL(ch->r_mem, SATA_SE, serr); -//device_printf(dev, "SERR %08x\n", serr); } /* EDMA self-disabled due to error. */ if (iec & EDMA_IE_ESELFDIS) @@ -706,7 +711,6 @@ mvs_ch_intr(void *data) fisic = SATA_FISC_FISWAIT4HOSTRDYEN_B1; else /* For Gen-IIe - read FIS interrupt cause. 
*/ fisic = ATA_INL(ch->r_mem, SATA_FISIC); -//device_printf(dev, "FISIC %08x\n", fisic); } if (selfdis) ch->curr_mode = MVS_EDMA_UNKNOWN; @@ -745,7 +749,6 @@ mvs_ch_intr(void *data) } } } -//device_printf(dev, "err slot %d port %d\n", ccs, port); mvs_requeue_frozen(dev); for (i = 0; i < MVS_MAX_SLOTS; i++) { /* XXX: reqests in loading state. */ @@ -771,7 +774,8 @@ mvs_ch_intr(void *data) ch->fatalerr = 1; } } else if (iec & 0xfc1e9000) { - if (ch->numtslots == 0 && i != ccs && port != -2) + if (ch->numtslots == 0 && + i != ccs && port != -2) et = MVS_ERR_INNOCENT; else et = MVS_ERR_SATA; @@ -823,8 +827,6 @@ mvs_legacy_intr(device_t dev) /* Clear interrupt and get status. */ status = mvs_getstatus(dev, 1); -// device_printf(dev, "Legacy intr status %02x\n", -// status); if (slot->state < MVS_SLOT_RUNNING) return; port = ccb->ccb_h.target_id & 0x0f; @@ -867,7 +869,8 @@ mvs_legacy_intr(device_t dev) /* If data write command - put them */ if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) { if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) { - device_printf(dev, "timeout waiting for write DRQ\n"); + device_printf(dev, + "timeout waiting for write DRQ\n"); et = MVS_ERR_TIMEOUT; goto end_finished; } @@ -890,19 +893,18 @@ mvs_legacy_intr(device_t dev) ATA_OUTL(ch->r_mem, DMA_C, 0); goto end_finished; } else { /* ATAPI PIO */ - length = ATA_INB(ch->r_mem,ATA_CYL_LSB) | (ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8); + length = ATA_INB(ch->r_mem,ATA_CYL_LSB) | + (ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8); ireason = ATA_INB(ch->r_mem,ATA_IREASON); -//device_printf(dev, "status %02x, ireason %02x, length %d\n", status, ireason, length); switch ((ireason & (ATA_I_CMD | ATA_I_IN)) | (status & ATA_S_DRQ)) { case ATAPI_P_CMDOUT: -device_printf(dev, "ATAPI CMDOUT\n"); + device_printf(dev, "ATAPI CMDOUT\n"); /* Return wait for interrupt */ return; case ATAPI_P_WRITE: -//device_printf(dev, "ATAPI WRITE\n"); if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { device_printf(dev, "trying to write on read buffer\n"); et = MVS_ERR_TFE; @@ -920,7 +922,6 @@ device_printf(dev, "ATAPI CMDOUT\n"); return; case ATAPI_P_READ: -//device_printf(dev, "ATAPI READ\n"); if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) { device_printf(dev, "trying to read on write buffer\n"); et = MVS_ERR_TFE; @@ -937,7 +938,6 @@ device_printf(dev, "ATAPI CMDOUT\n"); return; case ATAPI_P_DONEDRQ: -device_printf(dev, "ATAPI DONEDRQ\n"); device_printf(dev, "WARNING - DONEDRQ non conformant device\n"); if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { @@ -958,13 +958,13 @@ device_printf(dev, "ATAPI DONEDRQ\n"); case ATAPI_P_ABORT: case ATAPI_P_DONE: -//device_printf(dev, "ATAPI ABORT/DONE\n"); if (status & (ATA_S_ERROR | ATA_S_DWF)) et = MVS_ERR_TFE; goto end_finished; default: - device_printf(dev, "unknown transfer phase (status %02x, ireason %02x)\n", + device_printf(dev, "unknown transfer phase" + " (status %02x, ireason %02x)\n", status, ireason); et = MVS_ERR_TFE; } @@ -980,38 +980,54 @@ mvs_crbq_intr(device_t dev) struct mvs_channel *ch = device_get_softc(dev); struct mvs_crpb *crpb; union ccb *ccb; - int in_idx, cin_idx, slot; + int in_idx, fin_idx, cin_idx, slot; + uint32_t val; uint16_t flags; - in_idx = (ATA_INL(ch->r_mem, EDMA_RESQIP) & EDMA_RESQP_ERPQP_MASK) >> + val = ATA_INL(ch->r_mem, EDMA_RESQIP); + if (val == 0) + val = ATA_INL(ch->r_mem, EDMA_RESQIP); + in_idx = (val & EDMA_RESQP_ERPQP_MASK) >> EDMA_RESQP_ERPQP_SHIFT; bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map, BUS_DMASYNC_POSTREAD); - cin_idx = ch->in_idx; + 
fin_idx = cin_idx = ch->in_idx; ch->in_idx = in_idx; while (in_idx != cin_idx) { crpb = (struct mvs_crpb *) - (ch->dma.workrp + MVS_CRPB_OFFSET + (MVS_CRPB_SIZE * cin_idx)); + (ch->dma.workrp + MVS_CRPB_OFFSET + + (MVS_CRPB_SIZE * cin_idx)); slot = le16toh(crpb->id) & MVS_CRPB_TAG_MASK; flags = le16toh(crpb->rspflg); -//device_printf(dev, "CRPB %d %d %04x\n", cin_idx, slot, flags); /* * Handle only successfull completions here. * Errors will be handled by main intr handler. */ - if (ch->numtslots != 0 || (flags & EDMA_IE_EDEVERR) == 0) { -if ((flags >> 8) & ATA_S_ERROR) -device_printf(dev, "ERROR STATUS CRPB %d %d %04x\n", cin_idx, slot, flags); + if (crpb->id == 0xffff && crpb->rspflg == 0xffff) { + device_printf(dev, "Unfilled CRPB " + "%d (%d->%d) tag %d flags %04x rs %08x\n", + cin_idx, fin_idx, in_idx, slot, flags, ch->rslots); + } else if (ch->numtslots != 0 || + (flags & EDMA_IE_EDEVERR) == 0) { + crpb->id = 0xffff; + crpb->rspflg = 0xffff; if (ch->slot[slot].state >= MVS_SLOT_RUNNING) { ccb = ch->slot[slot].ccb; - ccb->ataio.res.status = (flags & MVS_CRPB_ATASTS_MASK) >> + ccb->ataio.res.status = + (flags & MVS_CRPB_ATASTS_MASK) >> MVS_CRPB_ATASTS_SHIFT; mvs_end_transaction(&ch->slot[slot], MVS_ERR_NONE); - } else -device_printf(dev, "EMPTY CRPB %d (->%d) %d %04x\n", cin_idx, in_idx, slot, flags); - } else -device_printf(dev, "ERROR FLAGS CRPB %d %d %04x\n", cin_idx, slot, flags); - + } else { + device_printf(dev, "Unused tag in CRPB " + "%d (%d->%d) tag %d flags %04x rs %08x\n", + cin_idx, fin_idx, in_idx, slot, flags, + ch->rslots); + } + } else { + device_printf(dev, + "CRPB with error %d tag %d flags %04x\n", + cin_idx, slot, flags); + } cin_idx = (cin_idx + 1) & (MVS_MAX_SLOTS - 1); } bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map, @@ -1266,8 +1282,6 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot) ch->rslots |= (1 << slot->slot); ATA_OUTB(ch->r_mem, SATA_SATAICTL, port << SATA_SATAICTL_PMPTX_SHIFT); if (ccb->ccb_h.func_code == XPT_ATA_IO) { -// device_printf(dev, "%d Legacy command %02x size %d\n", -// port, ccb->ataio.cmd.command, ccb->ataio.dxfer_len); mvs_tfd_write(dev, ccb); /* Device reset doesn't interrupt. 
*/ if (ccb->ataio.cmd.command == ATA_DEVICE_RESET) { @@ -1287,7 +1301,8 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot) /* If data write command - output the data */ if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) { if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) { - device_printf(dev, "timeout waiting for write DRQ\n"); + device_printf(dev, + "timeout waiting for write DRQ\n"); mvs_end_transaction(slot, MVS_ERR_TIMEOUT); return; } @@ -1296,9 +1311,6 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot) ch->transfersize / 2); } } else { -// device_printf(dev, "%d ATAPI command %02x size %d dma %d\n", -// port, ccb->csio.cdb_io.cdb_bytes[0], ccb->csio.dxfer_len, -// ch->basic_dma); ch->donecount = 0; ch->transfersize = min(ccb->csio.dxfer_len, ch->curr[port].bytecount); @@ -1331,7 +1343,8 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot) DELAY(20); } if (timeout <= 0) { - device_printf(dev, "timeout waiting for ATAPI command ready\n"); + device_printf(dev, + "timeout waiting for ATAPI command ready\n"); mvs_end_transaction(slot, MVS_ERR_TIMEOUT); return; } @@ -1371,8 +1384,6 @@ mvs_execute_transaction(struct mvs_slot *slot) int port = ccb->ccb_h.target_id & 0x0f; int i; -// device_printf(dev, "%d EDMA command %02x size %d slot %d tag %d\n", -// port, ccb->ataio.cmd.command, ccb->ataio.dxfer_len, slot->slot, slot->tag); /* Get address of the prepared EPRD */ eprd = ch->dma.workrq_bus + MVS_EPRD_OFFSET + (MVS_EPRD_SIZE * slot->slot); /* Prepare CRQB. Gen IIe uses different CRQB format. */ @@ -1554,7 +1565,6 @@ mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et) union ccb *ccb = slot->ccb; int lastto; -//device_printf(dev, "cmd done status %d\n", et); bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map, BUS_DMASYNC_POSTWRITE); /* Read result registers to the result struct @@ -1792,7 +1802,8 @@ mvs_process_read_log(device_t dev, union ccb *ccb) if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) device_printf(dev, "Error while READ LOG EXT\n"); else if ((data[0] & 0x80) == 0) { - device_printf(dev, "Non-queued command error in READ LOG EXT\n"); + device_printf(dev, + "Non-queued command error in READ LOG EXT\n"); } for (i = 0; i < MVS_MAX_SLOTS; i++) { if (!ch->hold[i]) diff --git a/sys/dev/mvs/mvs_pci.c b/sys/dev/mvs/mvs_pci.c index 6b18e2c..e2e37da 100644 --- a/sys/dev/mvs/mvs_pci.c +++ b/sys/dev/mvs/mvs_pci.c @@ -339,7 +339,6 @@ mvs_intr(void *data) u_int32_t ic, aic; ic = ATA_INL(ctlr->r_mem, CHIP_MIC); -//device_printf(ctlr->dev, "irq MIC:%08x\n", ic); if (ctlr->msi) { /* We have to to mask MSI during processing. */ mtx_lock(&ctlr->mtx); diff --git a/sys/dev/mvs/mvs_soc.c b/sys/dev/mvs/mvs_soc.c index 03029c2..ed861f2 100644 --- a/sys/dev/mvs/mvs_soc.c +++ b/sys/dev/mvs/mvs_soc.c @@ -295,7 +295,6 @@ mvs_intr(void *data) u_int32_t ic, aic; ic = ATA_INL(ctlr->r_mem, CHIP_SOC_MIC); -//device_printf(ctlr->dev, "irq MIC:%08x\n", ic); if ((ic & IC_HC0) == 0) return; /* Acknowledge interrupts of this HC. 
*/ diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index d1b211a..ef80f81 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -182,6 +182,7 @@ struct pci_quirk { int type; #define PCI_QUIRK_MAP_REG 1 /* PCI map register in weird place */ #define PCI_QUIRK_DISABLE_MSI 2 /* MSI/MSI-X doesn't work */ +#define PCI_QUIRK_ENABLE_MSI_VM 3 /* Older chipset in VM where MSI works */ int arg1; int arg2; }; @@ -218,6 +219,12 @@ struct pci_quirk pci_quirks[] = { */ { 0x74501022, PCI_QUIRK_DISABLE_MSI, 0, 0 }, + /* + * Some virtualization environments emulate an older chipset + * but support MSI just fine. QEMU uses the Intel 82440. + */ + { 0x12378086, PCI_QUIRK_ENABLE_MSI_VM, 0, 0 }, + { 0 } }; @@ -257,6 +264,12 @@ SYSCTL_INT(_hw_pci, OID_AUTO, do_power_resume, CTLFLAG_RW, &pci_do_power_resume, 1, "Transition from D3 -> D0 on resume."); +int pci_do_power_suspend = 1; +TUNABLE_INT("hw.pci.do_power_suspend", &pci_do_power_suspend); +SYSCTL_INT(_hw_pci, OID_AUTO, do_power_suspend, CTLFLAG_RW, + &pci_do_power_suspend, 1, + "Transition from D0 -> D3 on suspend."); + static int pci_do_msi = 1; TUNABLE_INT("hw.pci.enable_msi", &pci_do_msi); SYSCTL_INT(_hw_pci, OID_AUTO, enable_msi, CTLFLAG_RW, &pci_do_msi, 1, @@ -594,7 +607,7 @@ pci_read_extcap(device_t pcib, pcicfgregs *cfg) if (cfg->pp.pp_cap == 0) { cfg->pp.pp_cap = REG(ptr + PCIR_POWER_CAP, 2); cfg->pp.pp_status = ptr + PCIR_POWER_STATUS; - cfg->pp.pp_pmcsr = ptr + PCIR_POWER_PMCSR; + cfg->pp.pp_bse = ptr + PCIR_POWER_BSE; if ((nextptr - ptr) > PCIR_POWER_DATA) cfg->pp.pp_data = ptr + PCIR_POWER_DATA; } @@ -1828,6 +1841,23 @@ pci_msi_device_blacklisted(device_t dev) } /* + * Returns true if a specified chipset supports MSI when it is + * emulated hardware in a virtual machine. + */ +static int +pci_msi_vm_chipset(device_t dev) +{ + struct pci_quirk *q; + + for (q = &pci_quirks[0]; q->devid; q++) { + if (q->devid == pci_get_devid(dev) && + q->type == PCI_QUIRK_ENABLE_MSI_VM) + return (1); + } + return (0); +} + +/* * Determine if MSI is blacklisted globally on this sytem. Currently, * we just check for blacklisted chipsets as represented by the * host-PCI bridge at device 0:0:0. In the future, it may become @@ -1843,8 +1873,14 @@ pci_msi_blacklisted(void) return (0); /* Blacklist all non-PCI-express and non-PCI-X chipsets. */ - if (!(pcie_chipset || pcix_chipset)) + if (!(pcie_chipset || pcix_chipset)) { + if (vm_guest != VM_GUEST_NO) { + dev = pci_find_bsf(0, 0, 0); + if (dev != NULL) + return (pci_msi_vm_chipset(dev) == 0); + } return (1); + } dev = pci_find_bsf(0, 0, 0); if (dev != NULL) @@ -2954,7 +2990,9 @@ pci_suspend(device_t dev) free(devlist, M_TEMP); return (error); } - pci_set_power_children(dev, devlist, numdevs, PCI_POWERSTATE_D3); + if (pci_do_power_suspend) + pci_set_power_children(dev, devlist, numdevs, + PCI_POWERSTATE_D3); free(devlist, M_TEMP); return (0); } @@ -3656,9 +3694,15 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid, res = NULL; pci_read_bar(child, *rid, &map, &testval); - /* Ignore a BAR with a base of 0. */ - if ((*rid == PCIR_BIOS && pci_rombase(testval) == 0) || - pci_mapbase(testval) == 0) + /* + * Determine the size of the BAR and ignore BARs with a size + * of 0. Device ROM BARs use a different mask value. 
+ */ + if (*rid == PCIR_BIOS) + mapsize = pci_romsize(testval); + else + mapsize = pci_mapsize(testval); + if (mapsize == 0) goto out; if (PCI_BAR_MEM(testval) || *rid == PCIR_BIOS) { @@ -3687,13 +3731,7 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid, * actually uses and we would otherwise have a * situation where we might allocate the excess to * another driver, which won't work. - * - * Device ROM BARs use a different mask value. */ - if (*rid == PCIR_BIOS) - mapsize = pci_romsize(testval); - else - mapsize = pci_mapsize(testval); count = 1UL << mapsize; if (RF_ALIGNMENT(flags) < mapsize) flags = (flags & ~RF_ALIGNMENT_MASK) | RF_ALIGNMENT_LOG2(mapsize); diff --git a/sys/dev/pci/pci_pci.c b/sys/dev/pci/pci_pci.c index 9992b81..7915818 100644 --- a/sys/dev/pci/pci_pci.c +++ b/sys/dev/pci/pci_pci.c @@ -447,7 +447,7 @@ pcib_suspend(device_t dev) pcib_cfg_save(device_get_softc(dev)); error = bus_generic_suspend(dev); - if (error == 0 && pci_do_power_resume) { + if (error == 0 && pci_do_power_suspend) { dstate = PCI_POWERSTATE_D3; pcib = device_get_parent(device_get_parent(dev)); if (PCIB_POWER_FOR_SLEEP(pcib, dev, &dstate) == 0) diff --git a/sys/dev/pci/pci_private.h b/sys/dev/pci/pci_private.h index 70d887b..90866ef 100644 --- a/sys/dev/pci/pci_private.h +++ b/sys/dev/pci/pci_private.h @@ -39,6 +39,7 @@ DECLARE_CLASS(pci_driver); extern int pci_do_power_resume; +extern int pci_do_power_suspend; void pci_add_children(device_t dev, int domain, int busno, size_t dinfo_size); diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h index a0d12db..02fa7ea 100644 --- a/sys/dev/pci/pcireg.h +++ b/sys/dev/pci/pcireg.h @@ -427,12 +427,16 @@ #define PCIR_POWER_CAP 0x2 #define PCIM_PCAP_SPEC 0x0007 #define PCIM_PCAP_PMEREQCLK 0x0008 -#define PCIM_PCAP_PMEREQPWR 0x0010 #define PCIM_PCAP_DEVSPECINIT 0x0020 -#define PCIM_PCAP_DYNCLOCK 0x0040 -#define PCIM_PCAP_SECCLOCK 0x00c0 -#define PCIM_PCAP_CLOCKMASK 0x00c0 -#define PCIM_PCAP_REQFULLCLOCK 0x0100 +#define PCIM_PCAP_AUXPWR_0 0x0000 +#define PCIM_PCAP_AUXPWR_55 0x0040 +#define PCIM_PCAP_AUXPWR_100 0x0080 +#define PCIM_PCAP_AUXPWR_160 0x00c0 +#define PCIM_PCAP_AUXPWR_220 0x0100 +#define PCIM_PCAP_AUXPWR_270 0x0140 +#define PCIM_PCAP_AUXPWR_320 0x0180 +#define PCIM_PCAP_AUXPWR_375 0x01c0 +#define PCIM_PCAP_AUXPWRMASK 0x01c0 #define PCIM_PCAP_D1SUPP 0x0200 #define PCIM_PCAP_D2SUPP 0x0400 #define PCIM_PCAP_D0PME 0x0800 @@ -447,16 +451,17 @@ #define PCIM_PSTAT_D2 0x0002 #define PCIM_PSTAT_D3 0x0003 #define PCIM_PSTAT_DMASK 0x0003 -#define PCIM_PSTAT_REPENABLE 0x0010 +#define PCIM_PSTAT_NOSOFTRESET 0x0008 #define PCIM_PSTAT_PMEENABLE 0x0100 #define PCIM_PSTAT_D0POWER 0x0000 #define PCIM_PSTAT_D1POWER 0x0200 #define PCIM_PSTAT_D2POWER 0x0400 #define PCIM_PSTAT_D3POWER 0x0600 #define PCIM_PSTAT_D0HEAT 0x0800 -#define PCIM_PSTAT_D1HEAT 0x1000 -#define PCIM_PSTAT_D2HEAT 0x1200 -#define PCIM_PSTAT_D3HEAT 0x1400 +#define PCIM_PSTAT_D1HEAT 0x0a00 +#define PCIM_PSTAT_D2HEAT 0x0c00 +#define PCIM_PSTAT_D3HEAT 0x0e00 +#define PCIM_PSTAT_DATASELMASK 0x1e00 #define PCIM_PSTAT_DATAUNKN 0x0000 #define PCIM_PSTAT_DATADIV10 0x2000 #define PCIM_PSTAT_DATADIV100 0x4000 @@ -464,11 +469,10 @@ #define PCIM_PSTAT_DATADIVMASK 0x6000 #define PCIM_PSTAT_PME 0x8000 -#define PCIR_POWER_PMCSR 0x6 -#define PCIM_PMCSR_DCLOCK 0x10 -#define PCIM_PMCSR_B2SUPP 0x20 -#define PCIM_BMCSR_B3SUPP 0x40 -#define PCIM_BMCSR_BPCE 0x80 +#define PCIR_POWER_BSE 0x6 +#define PCIM_PMCSR_BSE_D3B3 0x00 +#define PCIM_PMCSR_BSE_D3B2 0x40 +#define PCIM_PMCSR_BSE_BPCCE 0x80 #define 
PCIR_POWER_DATA 0x7 diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h index d6a2a0e..aee967a 100644 --- a/sys/dev/pci/pcivar.h +++ b/sys/dev/pci/pcivar.h @@ -42,9 +42,9 @@ typedef uint64_t pci_addr_t; /* Interesting values for PCI power management */ struct pcicfg_pp { uint16_t pp_cap; /* PCI power management capabilities */ - uint8_t pp_status; /* config space address of PCI power status reg */ - uint8_t pp_pmcsr; /* config space address of PMCSR reg */ - uint8_t pp_data; /* config space address of PCI power data reg */ + uint8_t pp_status; /* conf. space addr. of PM control/status reg */ + uint8_t pp_bse; /* conf. space addr. of PM BSE reg */ + uint8_t pp_data; /* conf. space addr. of PM data reg */ }; struct vpd_readonly { diff --git a/sys/dev/sis/if_sis.c b/sys/dev/sis/if_sis.c index 93246d3..0957419 100644 --- a/sys/dev/sis/if_sis.c +++ b/sys/dev/sis/if_sis.c @@ -1795,12 +1795,15 @@ sis_intr(void *arg) if ((status & SIS_INTRS) == 0) { /* Not ours. */ SIS_UNLOCK(sc); + return; } /* Disable interrupts. */ CSR_WRITE_4(sc, SIS_IER, 0); for (;(status & SIS_INTRS) != 0;) { + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + break; if (status & (SIS_ISR_TX_DESC_OK | SIS_ISR_TX_ERR | SIS_ISR_TX_OK | SIS_ISR_TX_IDLE) ) @@ -1825,11 +1828,13 @@ sis_intr(void *arg) status = CSR_READ_4(sc, SIS_ISR); } - /* Re-enable interrupts. */ - CSR_WRITE_4(sc, SIS_IER, 1); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* Re-enable interrupts. */ + CSR_WRITE_4(sc, SIS_IER, 1); - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - sis_startl(ifp); + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + sis_startl(ifp); + } SIS_UNLOCK(sc); } diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c index 6948173..eb55dfc 100644 --- a/sys/dev/xen/balloon/balloon.c +++ b/sys/dev/xen/balloon/balloon.c @@ -44,7 +44,7 @@ __FBSDID("$FreeBSD$"); #include <machine/xen/xenfunc.h> #include <machine/xen/xenvar.h> #include <xen/hypervisor.h> -#include <xen/xenbus/xenbusvar.h> +#include <xen/xenstore/xenstorevar.h> #include <vm/vm.h> #include <vm/vm_page.h> @@ -406,20 +406,20 @@ set_new_target(unsigned long target) wakeup(balloon_process); } -static struct xenbus_watch target_watch = +static struct xs_watch target_watch = { .node = "memory/target" }; /* React to a change in the target key */ static void -watch_target(struct xenbus_watch *watch, +watch_target(struct xs_watch *watch, const char **vec, unsigned int len) { unsigned long long new_target; int err; - err = xenbus_scanf(XBT_NIL, "memory", "target", NULL, + err = xs_scanf(XST_NIL, "memory", "target", NULL, "%llu", &new_target); if (err) { /* This is ok (for domain0 at least) - so just return */ @@ -438,7 +438,7 @@ balloon_init_watcher(void *arg) { int err; - err = register_xenbus_watch(&target_watch); + err = xs_register_watch(&target_watch); if (err) printf("Failed to set balloon watcher\n"); diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c index 259f2f6..72087f5 100644 --- a/sys/dev/xen/blkback/blkback.c +++ b/sys/dev/xen/blkback/blkback.c @@ -1,1055 +1,1919 @@ -/* - * Copyright (c) 2006, Cisco Systems, Inc. +/*- + * Copyright (c) 2009-2010 Spectra Logic Corporation * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * Ken Merry (Spectra Logic Corporation) */ - #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +/** + * \file blkback.c + * + * \brief Device driver supporting the vending of block storage from + * a FreeBSD domain to other domains. 
+ */ + #include <sys/param.h> #include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/malloc.h> #include <sys/kernel.h> -#include <sys/socket.h> -#include <sys/queue.h> -#include <sys/taskqueue.h> +#include <sys/malloc.h> + +#include <sys/bio.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/devicestat.h> +#include <sys/disk.h> +#include <sys/fcntl.h> +#include <sys/filedesc.h> +#include <sys/kdb.h> +#include <sys/module.h> #include <sys/namei.h> #include <sys/proc.h> -#include <sys/filedesc.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/types.h> #include <sys/vnode.h> -#include <sys/fcntl.h> -#include <sys/disk.h> -#include <sys/bio.h> - -#include <sys/module.h> -#include <sys/bus.h> -#include <sys/sysctl.h> +#include <sys/mount.h> #include <geom/geom.h> +#include <machine/_inttypes.h> +#include <machine/xen/xen-os.h> + +#include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_kern.h> -#include <machine/xen-os.h> -#include <machine/hypervisor.h> -#include <machine/hypervisor-ifs.h> -#include <machine/xen_intr.h> -#include <machine/evtchn.h> -#include <machine/xenbus.h> -#include <machine/gnttab.h> -#include <machine/xen-public/memory.h> -#include <dev/xen/xenbus/xenbus_comms.h> +#include <xen/blkif.h> +#include <xen/evtchn.h> +#include <xen/gnttab.h> +#include <xen/xen_intr.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/grant_table.h> -#if XEN_BLKBACK_DEBUG -#define DPRINTF(fmt, args...) \ - printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) -#else -#define DPRINTF(fmt, args...) ((void)0) -#endif - -#define WPRINTF(fmt, args...) \ - printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#include <xen/xenbus/xenbusvar.h> -#define BLKBACK_INVALID_HANDLE (~0) +/*--------------------------- Compile-time Tunables --------------------------*/ +/** + * The maximum number of outstanding request blocks (request headers plus + * additional segment blocks) we will allow in a negotiated block-front/back + * communication channel. + */ +#define XBB_MAX_REQUESTS 256 -struct ring_ref { - vm_offset_t va; - grant_handle_t handle; - uint64_t bus_addr; -}; +/** + * \brief Define to force all I/O to be performed on memory owned by the + * backend device, with a copy-in/out to the remote domain's memory. + * + * \note This option is currently required when this driver's domain is + * operating in HVM mode on a system using an IOMMU. + * + * This driver uses Xen's grant table API to gain access to the memory of + * the remote domains it serves. When our domain is operating in PV mode, + * the grant table mechanism directly updates our domain's page table entries + * to point to the physical pages of the remote domain. This scheme guarantees + * that blkback and the backing devices it uses can safely perform DMA + * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to + * insure that our domain cannot DMA to pages owned by another domain. As + * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant + * table API. For this reason, in HVM mode, we must bounce all requests into + * memory that is mapped into our domain at domain startup and thus has + * valid IOMMU mappings. + */ +#define XBB_USE_BOUNCE_BUFFERS -typedef struct blkback_info { +/** + * \brief Define to enable rudimentary request logging to the console. 
+ */ +#undef XBB_DEBUG - /* Schedule lists */ - STAILQ_ENTRY(blkback_info) next_req; - int on_req_sched_list; +/*---------------------------------- Macros ----------------------------------*/ +/** + * Custom malloc type for all driver allocations. + */ +MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); - struct xenbus_device *xdev; - XenbusState frontend_state; +#ifdef XBB_DEBUG +#define DPRINTF(fmt, args...) \ + printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) do {} while(0) +#endif - domid_t domid; +/** + * The maximum mapped region size per request we will allow in a negotiated + * block-front/back communication channel. + */ +#define XBB_MAX_REQUEST_SIZE \ + MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) - int state; - int ring_connected; - struct ring_ref rr; - blkif_back_ring_t ring; - evtchn_port_t evtchn; - int irq; - void *irq_cookie; +/** + * The maximum number of segments (within a request header and accompanying + * segment blocks) per request we will allow in a negotiated block-front/back + * communication channel. + */ +#define XBB_MAX_SEGMENTS_PER_REQUEST \ + (MIN(UIO_MAXIOV, \ + MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ + (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) + +/** + * The maximum number of shared memory ring pages we will allow in a + * negotiated block-front/back communication channel. Allow enough + * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. + */ +#define XBB_MAX_RING_PAGES \ + BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ + * XBB_MAX_REQUESTS) - int ref_cnt; +/*--------------------------- Forward Declarations ---------------------------*/ +struct xbb_softc; - int handle; - char *mode; - char *type; - char *dev_name; +static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, + ...) __attribute__((format(printf, 3, 4))); +static int xbb_shutdown(struct xbb_softc *xbb); +static int xbb_detach(device_t dev); - struct vnode *vn; - struct cdev *cdev; - struct cdevsw *csw; - u_int sector_size; - int sector_size_shift; - off_t media_size; - u_int media_num_sectors; - int major; - int minor; - int read_only; - - struct mtx blk_ring_lock; - - device_t ndev; - - /* Stats */ - int st_rd_req; - int st_wr_req; - int st_oo_req; - int st_err_req; -} blkif_t; - -/* - * These are rather arbitrary. They are fairly large because adjacent requests - * pulled from a communication ring are quite likely to end up being part of - * the same scatter/gather request at the disc. - * - * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** - * - * This will increase the chances of being able to write whole tracks. - * 64 should be enough to keep us competitive with Linux. +/*------------------------------ Data Structures -----------------------------*/ +/** + * \brief Object tracking an in-flight I/O from a Xen VBD consumer. */ -static int blkif_reqs = 64; -TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs); +struct xbb_xen_req { + /** + * Linked list links used to aggregate idle request in the + * request free pool (xbb->request_free_slist). + */ + SLIST_ENTRY(xbb_xen_req) links; + + /** + * Back reference to the parent block back instance for this + * request. Used during bio_done handling. + */ + struct xbb_softc *xbb; + + /** + * The remote domain's identifier for this I/O request. 
+ */ + uint64_t id; + + /** + * Kernel virtual address space reserved for this request + * structure and used to map the remote domain's pages for + * this I/O into our domain's address space. + */ + uint8_t *kva; + +#ifdef XBB_USE_BOUNCE_BUFFERS + /** + * Pre-allocated domain local memory used to proxy remote + * domain memory during I/O operations. + */ + uint8_t *bounce; +#endif -static int mmap_pages; + /** + * Base, pseudo-physical address, corresponding to the start + * of this request's kva region. + */ + uint64_t gnt_base; + + /** + * The number of pages currently mapped for this request. + */ + int nr_pages; + + /** + * The number of 512 byte sectors comprising this request. + */ + int nr_512b_sectors; + + /** + * The number of struct bio requests still outstanding for this + * request on the backend device. This field is only used for + * device (rather than file) backed I/O. + */ + int pendcnt; + + /** + * BLKIF_OP code for this request. + */ + int operation; + + /** + * BLKIF_RSP status code for this request. + * + * This field allows an error status to be recorded even if the + * delivery of this status must be deferred. Deferred reporting + * is necessary, for example, when an error is detected during + * completion processing of one bio when other bios for this + * request are still outstanding. + */ + int status; + + /** + * Device statistics request ordering type (ordered or simple). + */ + devstat_tag_type ds_tag_type; + + /** + * Device statistics request type (read, write, no_data). + */ + devstat_trans_flags ds_trans_type; + + /** + * The start time for this request. + */ + struct bintime ds_t0; + + /** + * Array of grant handles (one per page) used to map this request. + */ + grant_handle_t *gnt_handles; +}; +SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); -/* - * Each outstanding request that we've passed to the lower device layers has a - * 'pending_req' allocated to it. Each buffer_head that completes decrements - * the pendcnt towards zero. When it hits zero, the specified domain has a - * response queued for it, with the saved 'id' passed back. +/** + * \brief Configuration data for the shared memory request ring + * used to communicate with the front-end client of + * this driver. */ -typedef struct pending_req { - blkif_t *blkif; - uint64_t id; - int nr_pages; - int pendcnt; - unsigned short operation; - int status; - STAILQ_ENTRY(pending_req) free_list; -} pending_req_t; - -static pending_req_t *pending_reqs; -static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free = - STAILQ_HEAD_INITIALIZER(pending_free); -static struct mtx pending_free_lock; - -static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list = - STAILQ_HEAD_INITIALIZER(req_sched_list); -static struct mtx req_sched_list_lock; - -static unsigned long mmap_vstart; -static unsigned long *pending_vaddrs; -static grant_handle_t *pending_grant_handles; - -static struct task blk_req_task; - -/* Protos */ -static void disconnect_ring(blkif_t *blkif); -static int vbd_add_dev(struct xenbus_device *xdev); - -static inline int vaddr_pagenr(pending_req_t *req, int seg) -{ - return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; -} - -static inline unsigned long vaddr(pending_req_t *req, int seg) -{ - return pending_vaddrs[vaddr_pagenr(req, seg)]; -} - -#define pending_handle(_req, _seg) \ - (pending_grant_handles[vaddr_pagenr(_req, _seg)]) +struct xbb_ring_config { + /** KVA address where ring memory is mapped.
*/ + vm_offset_t va; + + /** The pseudo-physical address where ring memory is mapped.*/ + uint64_t gnt_addr; + + /** + * Grant table handles, one per-ring page, returned by the + * hypervisor upon mapping of the ring and required to + * unmap it when a connection is torn down. + */ + grant_handle_t handle[XBB_MAX_RING_PAGES]; + + /** + * The device bus address returned by the hypervisor when + * mapping the ring and required to unmap it when a connection + * is torn down. + */ + uint64_t bus_addr[XBB_MAX_RING_PAGES]; + + /** The number of ring pages mapped for the current connection. */ + u_int ring_pages; + + /** + * The grant references, one per-ring page, supplied by the + * front-end, allowing us to reference the ring pages in the + * front-end's domain and to map these pages into our own domain. + */ + grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; + + /** The interrupt driven event channel used to signal ring events. */ + evtchn_port_t evtchn; +}; -static unsigned long -alloc_empty_page_range(unsigned long nr_pages) +/** + * Per-instance connection state flags. + */ +typedef enum { - void *pages; - int i = 0, j = 0; - multicall_entry_t mcl[17]; - unsigned long mfn_list[16]; - struct xen_memory_reservation reservation = { - .extent_start = mfn_list, - .nr_extents = 0, - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - - pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT); - if (pages == NULL) - return 0; + /** + * The front-end requested a read-only mount of the + * back-end device/file. + */ + XBBF_READ_ONLY = 0x01, + + /** Communication with the front-end has been established. */ + XBBF_RING_CONNECTED = 0x02, + + /** + * Front-end requests exist in the ring and are waiting for + * xbb_xen_req objects to free up. + */ + XBBF_RESOURCE_SHORTAGE = 0x04, + + /** Connection teardown in progress. */ + XBBF_SHUTDOWN = 0x08 +} xbb_flag_t; + +/** Backend device type. */ +typedef enum { + /** Backend type unknown. */ + XBB_TYPE_NONE = 0x00, + + /** + * Backend type disk (access via cdev switch + * strategy routine). + */ + XBB_TYPE_DISK = 0x01, + + /** Backend type file (access via vnode operations). */ + XBB_TYPE_FILE = 0x02 +} xbb_type; + +/** + * \brief Structure used to memoize information about a per-request + * scatter-gather list. + * + * The chief benefit of using this data structure is it avoids having + * to reparse the possibly discontiguous S/G list in the original + * request. Due to the way that the mapping of the memory backing an + * I/O transaction is handled by Xen, a second pass is unavoidable. + * At least this way the second walk is a simple array traversal. + * + * \note A single Scatter/Gather element in the block interface covers + * at most 1 machine page. In this context a sector (blkif + * nomenclature, not what I'd choose) is a 512b aligned unit + * of mapping within the machine page referenced by an S/G + * element. + */ +struct xbb_sg { + /** The number of 512b data chunks mapped in this S/G element. */ + int16_t nsect; + + /** + * The index (0 based) of the first 512b data chunk mapped + * in this S/G element. + */ + uint8_t first_sect; + + /** + * The index (0 based) of the last 512b data chunk mapped + * in this S/G element. + */ + uint8_t last_sect; +}; - memset(mcl, 0, sizeof(mcl)); +/** + * Character device backend specific configuration data. + */ +struct xbb_dev_data { + /** Cdev used for device backend access.
*/ + struct cdev *cdev; - while (i < nr_pages) { - unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE); + /** Cdev switch used for device backend access. */ + struct cdevsw *csw; - mcl[j].op = __HYPERVISOR_update_va_mapping; - mcl[j].args[0] = va; + /** Used to hold a reference on opened cdev backend devices. */ + int dev_ref; +}; - mfn_list[j++] = vtomach(va) >> PAGE_SHIFT; +/** + * File backend specific configuration data. + */ +struct xbb_file_data { + /** Credentials to use for vnode backed (file based) I/O. */ + struct ucred *cred; + + /** + * \brief Array of io vectors used to process file based I/O. + * + * Only a single file based request is outstanding per-xbb instance, + * so we only need one of these. + */ + struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; +#ifdef XBB_USE_BOUNCE_BUFFERS + + /** + * \brief Array of io vectors used to handle bouncing of file reads. + * + * Vnode operations are free to modify uio data during their + * execution. In the case of a read with bounce buffering active, + * we need some of the data from the original uio in order to + * bounce-out the read data. This array serves as the temporary + * storage for this saved data. + */ + struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; + + /** + * \brief Array of memoized bounce buffer kva offsets used + * in the file based backend. + * + * Due to the way that the mapping of the memory backing an + * I/O transaction is handled by Xen, a second pass through + * the request sg elements is unavoidable. We memoize the computed + * bounce address here to reduce the cost of the second walk. + */ + void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST]; +#endif /* XBB_USE_BOUNCE_BUFFERS */ +}; xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY; +/** + * Collection of backend type specific data. + */ +union xbb_backend_data { + struct xbb_dev_data dev; + struct xbb_file_data file; +}; if (j == 16 || i == nr_pages) { - mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL; +/** + * Function signature of backend specific I/O handlers. + */ +typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, int nseg, + int operation, int flags); reservation.nr_extents = j; +/** + * Per-instance configuration data. + */ +struct xbb_softc { + + /** + * Task-queue used to process I/O requests. + */ + struct taskqueue *io_taskqueue; + + /** + * Single "run the request queue" task enqueued + * on io_taskqueue. + */ + struct task io_task; + + /** Device type for this instance. */ + xbb_type device_type; + + /** NewBus device corresponding to this instance. */ + device_t dev; + + /** Backend specific dispatch routine for this instance. */ + xbb_dispatch_t dispatch_io; + + /** The number of requests outstanding on the backend device/file. */ + u_int active_request_count; + + /** Free pool of request tracking structures. */ + struct xbb_xen_req_slist request_free_slist; + + /** Array, sized at connection time, of request tracking structures. */ + struct xbb_xen_req *requests; + + /** + * Global pool of kva used for mapping remote domain ring + * and I/O transaction data. + */ + vm_offset_t kva; + + /** Pseudo-physical address corresponding to kva. */ + uint64_t gnt_base_addr; + + /** The size of the global kva pool. */ + int kva_size; + + /** + * \brief Cached value of the front-end's domain id. + * + * This value is used once for each mapped page in + * a transaction.
We cache it to avoid incurring the + * cost of an ivar access every time this is needed. + */ + domid_t otherend_id; + + /** + * \brief The blkif protocol abi in effect. + * + * There are situations where the back and front ends can + * have a different, native abi (e.g. intel x86_64 and + * 32bit x86 domains on the same machine). The back-end + * always accommodates the front-end's native abi. That + * value is pulled from the XenStore and recorded here. + */ + int abi; + + /** + * \brief The maximum number of requests allowed to be in + * flight at a time. + * + * This value is negotiated via the XenStore. + */ + uint32_t max_requests; + + /** + * \brief The maximum number of segments (1 page per segment) + * that can be mapped by a request. + * + * This value is negotiated via the XenStore. + */ + uint32_t max_request_segments; + + /** + * The maximum size of any request to this back-end + * device. + * + * This value is negotiated via the XenStore. + */ + uint32_t max_request_size; + + /** Various configuration and state bit flags. */ + xbb_flag_t flags; + + /** Ring mapping and interrupt configuration data. */ + struct xbb_ring_config ring_config; + + /** Runtime, cross-abi safe, structures for ring access. */ + blkif_back_rings_t rings; + + /** IRQ mapping for the communication ring event channel. */ + int irq; + + /** + * \brief Backend access mode flags (e.g. write, or read-only). + * + * This value is passed to us by the front-end via the XenStore. + */ + char *dev_mode; + + /** + * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). + * + * This value is passed to us by the front-end via the XenStore. + * Currently unused. + */ + char *dev_type; + + /** + * \brief Backend device/file identifier. + * + * This value is passed to us by the front-end via the XenStore. + * We expect this to be a POSIX path indicating the file or + * device to open. + */ + char *dev_name; + + /** + * Vnode corresponding to the backend device node or file + * we are accessing. + */ + struct vnode *vn; + + union xbb_backend_data backend; + /** The native sector size of the backend. */ + u_int sector_size; + + /** log2 of sector_size. */ + u_int sector_size_shift; + + /** Size in bytes of the backend device or file. */ + off_t media_size; + + /** + * \brief media_size expressed in terms of the backend native + * sector size. + * + * (e.g. xbb->media_size >> xbb->sector_size_shift). + */ + uint64_t media_num_sectors; + + /** + * \brief Array of memoized scatter gather data computed during the + * conversion of blkif ring requests to internal xbb_xen_req + * structures. + * + * Ring processing is serialized so we only need one of these. + */ + struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST]; + + /** Mutex protecting per-instance data. */ + struct mtx lock; + +#ifdef XENHVM + /** + * Resource representing allocated physical address space + * associated with our per-instance kva region. + */ + struct resource *pseudo_phys_res; + + /** Resource id for allocated physical address space. */ + int pseudo_phys_res_id; +#endif - mcl[j].op = __HYPERVISOR_memory_op; - mcl[j].args[0] = XENMEM_decrease_reservation; - mcl[j].args[1] = (unsigned long)&reservation; - - (void)HYPERVISOR_multicall(mcl, j+1); + /** I/O statistics. */ + struct devstat *xbb_stats; +}; - mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0; - j = 0; +/*---------------------------- Request Processing ----------------------------*/ +/** + * Allocate an internal transaction tracking structure from the free pool.
+ * + * \param xbb Per-instance xbb configuration structure. + * + * \return On success, a pointer to the allocated xbb_xen_req structure. + * Otherwise NULL. + */ +static inline struct xbb_xen_req * +xbb_get_req(struct xbb_softc *xbb) +{ + struct xbb_xen_req *req; + + req = NULL; + mtx_lock(&xbb->lock); + + /* + * Do not allow new requests to be allocated while we + * are shutting down. + */ + if ((xbb->flags & XBBF_SHUTDOWN) == 0) { + if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) { + SLIST_REMOVE_HEAD(&xbb->request_free_slist, links); + xbb->active_request_count++; + } else { + xbb->flags |= XBBF_RESOURCE_SHORTAGE; } } - - return (unsigned long)pages; + mtx_unlock(&xbb->lock); + return (req); } -static pending_req_t * -alloc_req(void) -{ - pending_req_t *req; - mtx_lock(&pending_free_lock); - if ((req = STAILQ_FIRST(&pending_free))) { - STAILQ_REMOVE(&pending_free, req, pending_req, free_list); - STAILQ_NEXT(req, free_list) = NULL; - } - mtx_unlock(&pending_free_lock); - return req; -} - -static void -free_req(pending_req_t *req) +/** + * Return an allocated transaction tracking structure to the free pool. + * + * \param xbb Per-instance xbb configuration structure. + * \param req The request structure to free. + */ +static inline void +xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { - int was_empty; - - mtx_lock(&pending_free_lock); - was_empty = STAILQ_EMPTY(&pending_free); - STAILQ_INSERT_TAIL(&pending_free, req, free_list); - mtx_unlock(&pending_free_lock); - if (was_empty) - taskqueue_enqueue(taskqueue_swi, &blk_req_task); -} + int wake_thread; -static void -fast_flush_area(pending_req_t *req) -{ - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int i, invcount = 0; - grant_handle_t handle; - int ret; + mtx_lock(&xbb->lock); + wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE; + xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; + SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); + xbb->active_request_count--; - for (i = 0; i < req->nr_pages; i++) { - handle = pending_handle(req, i); - if (handle == BLKBACK_INVALID_HANDLE) - continue; - unmap[invcount].host_addr = vaddr(req, i); - unmap[invcount].dev_bus_addr = 0; - unmap[invcount].handle = handle; - pending_handle(req, i) = BLKBACK_INVALID_HANDLE; - invcount++; + if ((xbb->flags & XBBF_SHUTDOWN) != 0) { + /* + * Shutdown is in progress. See if we can + * progress further now that one more request + * has completed and been returned to the + * free pool. + */ + xbb_shutdown(xbb); } + mtx_unlock(&xbb->lock); - ret = HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount); - PANIC_IF(ret); + if (wake_thread != 0) + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } -static void -blkif_get(blkif_t *blkif) +/** + * Given a page index and 512b sector offset within that page, + * calculate an offset into a request's kva region. + * + * \param req The request structure whose kva region will be accessed. + * \param pagenr The page index used to compute the kva offset. + * \param sector The 512b sector index used to compute the page relative + * kva offset. + * + * \return The computed global KVA offset. 
+ */ +static inline uint8_t * +xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector) { - atomic_add_int(&blkif->ref_cnt, 1); + return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } -static void -blkif_put(blkif_t *blkif) +#ifdef XBB_USE_BOUNCE_BUFFERS +/** + * Given a page index and 512b sector offset within that page, + * calculate an offset into a request's local bounce memory region. + * + * \param req The request structure whose bounce region will be accessed. + * \param pagenr The page index used to compute the bounce offset. + * \param sector The 512b sector index used to compute the page relative + * bounce offset. + * + * \return The computed global bounce buffer address. + */ +static inline uint8_t * +xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector) { - if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) { - DPRINTF("Removing %x\n", (unsigned int)blkif); - disconnect_ring(blkif); - if (blkif->mode) - free(blkif->mode, M_DEVBUF); - if (blkif->type) - free(blkif->type, M_DEVBUF); - if (blkif->dev_name) - free(blkif->dev_name, M_DEVBUF); - free(blkif, M_DEVBUF); - } + return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } +#endif -static int -blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params) +/** + * Given a page number and 512b sector offset within that page, + * calculate an offset into the request's memory region that the + * underlying backend device/file should use for I/O. + * + * \param req The request structure whose I/O region will be accessed. + * \param pagenr The page index used to compute the I/O offset. + * \param sector The 512b sector index used to compute the page relative + * I/O offset. + * + * \return The computed global I/O address. + * + * Depending on configuration, this will either be a local bounce buffer + * or a pointer to the memory mapped in from the front-end domain for + * this request. + */ +static inline uint8_t * +xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector) { - blkif_t *blkif; - - blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!blkif) - return ENOMEM; - - DPRINTF("Created %x\n", (unsigned int)blkif); - - blkif->ref_cnt = 1; - blkif->domid = xdev->otherend_id; - blkif->handle = handle; - blkif->mode = mode; - blkif->type = type; - blkif->dev_name = params; - blkif->xdev = xdev; - xdev->data = blkif; - - mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF); - - if (strcmp(mode, "w")) - blkif->read_only = 1; - - return 0; +#ifdef XBB_USE_BOUNCE_BUFFERS + return (xbb_req_bounce_addr(req, pagenr, sector)); +#else + return (xbb_req_vaddr(req, pagenr, sector)); +#endif } -static void -add_to_req_schedule_list_tail(blkif_t *blkif) +/** + * Given a page index and 512b sector offset within that page, calculate + * an offset into the local pseudo-physical address space used to map a + * front-end's request data into a request. + * + * \param req The request structure whose pseudo-physical region + * will be accessed. + * \param pagenr The page index used to compute the pseudo-physical offset. + * \param sector The 512b sector index used to compute the page relative + * pseudo-physical offset. + * + * \return The computed global pseudo-physical address. + * + * Depending on configuration, this will either be a local bounce buffer + * or a pointer to the memory mapped in from the front-end domain for + * this request.
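+ * + * These are the addresses handed to the hypervisor in the host_addr + * field of grant map/unmap operations, e.g. + * map->host_addr = xbb_req_gntaddr(req, seg_idx, 0) in + * xbb_dispatch_io(), with the matching unmap in xbb_unmap_req().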
+ */ +static inline uintptr_t +xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector) { - if (!blkif->on_req_sched_list) { - mtx_lock(&req_sched_list_lock); - if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) { - blkif_get(blkif); - STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); - blkif->on_req_sched_list = 1; - taskqueue_enqueue(taskqueue_swi, &blk_req_task); - } - mtx_unlock(&req_sched_list_lock); - } + return ((uintptr_t)(req->gnt_base + + (PAGE_SIZE * pagenr) + (sector << 9))); } -/* This routine does not call blkif_get(), does not schedule the blk_req_task to run, - and assumes that the state is connected */ +/** + * Unmap the front-end pages associated with this I/O request. + * + * \param req The request structure to unmap. + */ static void -add_to_req_schedule_list_tail2(blkif_t *blkif) +xbb_unmap_req(struct xbb_xen_req *req) { - mtx_lock(&req_sched_list_lock); - if (!blkif->on_req_sched_list) { - STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); - blkif->on_req_sched_list = 1; - } - mtx_unlock(&req_sched_list_lock); -} + struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST]; + u_int i; + u_int invcount; + int error; -/* Removes blkif from front of list and does not call blkif_put() (caller must) */ -static blkif_t * -remove_from_req_schedule_list(void) -{ - blkif_t *blkif; + invcount = 0; + for (i = 0; i < req->nr_pages; i++) { - mtx_lock(&req_sched_list_lock); + if (req->gnt_handles[i] == GRANT_REF_INVALID) + continue; - if ((blkif = STAILQ_FIRST(&req_sched_list))) { - STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req); - STAILQ_NEXT(blkif, next_req) = NULL; - blkif->on_req_sched_list = 0; + unmap[invcount].host_addr = xbb_req_gntaddr(req, i, 0); + unmap[invcount].dev_bus_addr = 0; + unmap[invcount].handle = req->gnt_handles[i]; + req->gnt_handles[i] = GRANT_REF_INVALID; + invcount++; } - mtx_unlock(&req_sched_list_lock); - - return blkif; + error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, invcount); + KASSERT(error == 0, ("Grant table operation failed")); } +/** + * Create and transmit a response to a blkif request. + * + * \param xbb Per-instance xbb configuration structure. + * \param req The request structure to which to respond. + * \param status The status code to report. See BLKIF_RSP_* + * in sys/xen/interface/io/blkif.h. + */ static void -make_response(blkif_t *blkif, uint64_t id, - unsigned short op, int st) +xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) { blkif_response_t *resp; - blkif_back_ring_t *blk_ring = &blkif->ring; - int more_to_do = 0; - int notify; + int more_to_do; + int notify; + + more_to_do = 0; + + /* + * Place on the response ring for the relevant domain. + * For now, only the spacing between entries is different + * in the different ABIs, not the response entry layout. 
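+ * (This is why the x86_32 and x86_64 arms below cast the slot + * returned by RING_GET_RESPONSE() to blkif_response_t *: the field + * layout matches across ABIs, only the slot stride differs.)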
+ */ + mtx_lock(&xbb->lock); + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + resp = RING_GET_RESPONSE(&xbb->rings.native, + xbb->rings.native.rsp_prod_pvt); + break; + case BLKIF_PROTOCOL_X86_32: + resp = (blkif_response_t *) + RING_GET_RESPONSE(&xbb->rings.x86_32, + xbb->rings.x86_32.rsp_prod_pvt); + break; + case BLKIF_PROTOCOL_X86_64: + resp = (blkif_response_t *) + RING_GET_RESPONSE(&xbb->rings.x86_64, + xbb->rings.x86_64.rsp_prod_pvt); + break; + default: + panic("Unexpected blkif protocol ABI."); + } - mtx_lock(&blkif->blk_ring_lock); + resp->id = req->id; + resp->operation = req->operation; + resp->status = status; + xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify); - /* Place on the response ring for the relevant domain. */ - resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - blk_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify); + if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { - if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). */ - RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do); + RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); + } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { - } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) more_to_do = 1; + } - mtx_unlock(&blkif->blk_ring_lock); + mtx_unlock(&xbb->lock); if (more_to_do) - add_to_req_schedule_list_tail(blkif); + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(xbb->irq); } +/** + * Completion handler for buffer I/O requests issued by the device + * backend driver. + * + * \param bio The buffer I/O request on which to perform completion + * processing. + */ static void -end_block_io_op(struct bio *bio) +xbb_bio_done(struct bio *bio) { - pending_req_t *pending_req = bio->bio_caller2; + struct xbb_softc *xbb; + struct xbb_xen_req *req; + + req = bio->bio_caller1; + xbb = req->xbb; + /* Only include transferred I/O in stats. */ + req->nr_512b_sectors -= bio->bio_resid >> 9; if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", - bio->bio_error, pending_req->blkif->dev_name); - pending_req->status = BLKIF_RSP_ERROR; - pending_req->blkif->st_err_req++; + bio->bio_error, xbb->dev_name); + req->status = BLKIF_RSP_ERROR; + + if (bio->bio_error == ENXIO + && xenbus_get_state(xbb->dev) == XenbusStateConnected) { + + /* + * Backend device has disappeared. Signal the + * front-end that we (the device proxy) want to + * go away. 
+ */ + xenbus_set_state(xbb->dev, XenbusStateClosing); + } } -#if 0 - printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n", - (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags); -#endif +#ifdef XBB_USE_BOUNCE_BUFFERS + if (bio->bio_cmd == BIO_READ) { + vm_offset_t kva_offset; - if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) { - fast_flush_area(pending_req); - make_response(pending_req->blkif, pending_req->id, - pending_req->operation, pending_req->status); - blkif_put(pending_req->blkif); - free_req(pending_req); + kva_offset = (vm_offset_t)bio->bio_data + - (vm_offset_t)req->bounce; + memcpy((uint8_t *)req->kva + kva_offset, + bio->bio_data, bio->bio_bcount); + } +#endif /* XBB_USE_BOUNCE_BUFFERS */ + + if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) { + xbb_unmap_req(req); + xbb_send_response(xbb, req, req->status); + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/req->nr_512b_sectors << 9, + req->ds_tag_type, + req->ds_trans_type, + /*now*/NULL, + /*then*/&req->ds_t0); + xbb_release_req(xbb, req); } g_destroy_bio(bio); } +/** + * Parse a blkif request into an internal request structure and send + * it to the backend for processing. + * + * \param xbb Per-instance xbb configuration structure. + * \param ring_req Front-end's I/O request as pulled from the shared + * communication ring. + * \param req Allocated internal request structure. + * \param req_ring_idx The location of ring_req within the shared + * communication ring. + * + * This routine performs the backend common aspects of request parsing + * including compiling an internal request structure, parsing the S/G + * list and any secondary ring requests in which they may reside, and + * the mapping of front-end I/O pages into our domain. 
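+ * + * For illustration, a two-segment BLKIF_OP_READ is parsed into + * xbb->xbb_sgs[0] and xbb->xbb_sgs[1] plus two grant-map entries and, + * once the front-end pages are mapped, handed to the backend-specific + * handler (xbb_dispatch_dev() or xbb_dispatch_file()) via + * xbb->dispatch_io.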
+ */ static void -dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req) +xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, RING_IDX req_ring_idx) { - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct { - unsigned long buf; unsigned int nsec; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int nseg = req->nr_segments, nr_sects = 0; - struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int operation, ret, i, nbio = 0; + struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct xbb_sg *xbb_sg; + struct gnttab_map_grant_ref *map; + struct blkif_request_segment *sg; + struct blkif_request_segment *last_block_sg; + u_int nseg; + u_int seg_idx; + u_int block_segs; + int nr_sects; + int operation; + uint8_t bio_flags; + int error; + + nseg = ring_req->nr_segments; + nr_sects = 0; + req->xbb = xbb; + req->id = ring_req->id; + req->operation = ring_req->operation; + req->status = BLKIF_RSP_OKAY; + req->ds_tag_type = DEVSTAT_TAG_SIMPLE; + req->nr_pages = nseg; + req->nr_512b_sectors = 0; + bio_flags = 0; + sg = NULL; + + binuptime(&req->ds_t0); + devstat_start_transaction(xbb->xbb_stats, &req->ds_t0); + + switch (req->operation) { + case BLKIF_OP_WRITE_BARRIER: + bio_flags |= BIO_ORDERED; + req->ds_tag_type = DEVSTAT_TAG_ORDERED; + /* FALLTHROUGH */ + case BLKIF_OP_WRITE: + operation = BIO_WRITE; + req->ds_trans_type = DEVSTAT_WRITE; + if ((xbb->flags & XBBF_READ_ONLY) != 0) { + DPRINTF("Attempt to write to read only device %s\n", + xbb->dev_name); + goto fail_send_response; + } + break; + case BLKIF_OP_READ: + operation = BIO_READ; + req->ds_trans_type = DEVSTAT_READ; + break; + case BLKIF_OP_FLUSH_DISKCACHE: + operation = BIO_FLUSH; + req->ds_tag_type = DEVSTAT_TAG_ORDERED; + req->ds_trans_type = DEVSTAT_NO_DATA; + goto do_dispatch; + /*NOTREACHED*/ + default: + DPRINTF("error: unknown block io operation [%d]\n", + req->operation); + goto fail_send_response; + } /* Check that number of segments is sane. 
*/ - if (unlikely(nseg == 0) || - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + if (unlikely(nseg == 0) + || unlikely(nseg > xbb->max_request_segments)) { DPRINTF("Bad number of segments in request (%d)\n", nseg); - goto fail_response; + goto fail_send_response; } - if (req->operation == BLKIF_OP_WRITE) { - if (blkif->read_only) { - DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name); - goto fail_response; - } - operation = BIO_WRITE; - } else - operation = BIO_READ; - - pending_req->blkif = blkif; - pending_req->id = req->id; - pending_req->operation = req->operation; - pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; + map = maps; + xbb_sg = xbb->xbb_sgs; + block_segs = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); + sg = ring_req->seg; + last_block_sg = sg + block_segs; + seg_idx = 0; + while (1) { - for (i = 0; i < nseg; i++) { - seg[i].nsec = req->seg[i].last_sect - - req->seg[i].first_sect + 1; + while (sg < last_block_sg) { + + xbb_sg->first_sect = sg->first_sect; + xbb_sg->last_sect = sg->last_sect; + xbb_sg->nsect = + (int8_t)(sg->last_sect - sg->first_sect + 1); + + if ((sg->last_sect >= (PAGE_SIZE >> 9)) + || (xbb_sg->nsect <= 0)) + goto fail_send_response; + + nr_sects += xbb_sg->nsect; + map->host_addr = xbb_req_gntaddr(req, seg_idx, + /*sector*/0); + map->flags = GNTMAP_host_map; + map->ref = sg->gref; + map->dom = xbb->otherend_id; + if (operation == BIO_WRITE) + map->flags |= GNTMAP_readonly; + sg++; + map++; + xbb_sg++; + seg_idx++; + } - if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || - (seg[i].nsec <= 0)) - goto fail_response; - nr_sects += seg[i].nsec; + block_segs = MIN(nseg - seg_idx, + BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); + if (block_segs == 0) + break; - map[i].host_addr = vaddr(pending_req, i); - map[i].dom = blkif->domid; - map[i].ref = req->seg[i].gref; - map[i].flags = GNTMAP_host_map; - if (operation == BIO_WRITE) - map[i].flags |= GNTMAP_readonly; + /* + * Fetch the next request block full of SG elements. + * For now, only the spacing between entries is different + * in the different ABIs, not the sg entry layout. 
+ */ + req_ring_idx++; + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native, + req_ring_idx); + break; + case BLKIF_PROTOCOL_X86_32: + { + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32, + req_ring_idx); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64, + req_ring_idx); + break; + } + default: + panic("Unexpected blkif protocol ABI."); + /* NOTREACHED */ + } + last_block_sg = sg + block_segs; } /* Convert to the disk's sector size */ - nr_sects = (nr_sects << 9) >> blkif->sector_size_shift; - - ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); - PANIC_IF(ret); + req->nr_512b_sectors = nr_sects; + nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; + + if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { + device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple " + "of the backing store sector size (%d)\n", + __func__, req->nr_512b_sectors << 9, + xbb->sector_size); + goto fail_send_response; + } - for (i = 0; i < nseg; i++) { - if (unlikely(map[i].status != 0)) { - DPRINTF("invalid buffer -- could not remap it\n"); - goto fail_flush; + error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + maps, req->nr_pages); + if (error != 0) + panic("Grant table operation failed (%d)", error); + + for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) { + + if (unlikely(map->status != 0)) { + DPRINTF("invalid buffer -- could not remap it (%d)\n", + map->status); + DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x " + "ref 0x%x, dom %d\n", seg_idx, + map->host_addr, map->flags, map->ref, + map->dom); + goto fail_unmap_req; } - pending_handle(pending_req, i) = map[i].handle; -#if 0 - /* Can't do this in FreeBSD since vtophys() returns the pfn */ - /* of the remote domain who loaned us the machine page - DPT */ - xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] = - map[i]dev_bus_addr >> PAGE_SHIFT; -#endif - seg[i].buf = map[i].dev_bus_addr | - (req->seg[i].first_sect << 9); + req->gnt_handles[seg_idx] = map->handle; } + if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) { - if (req->sector_number + nr_sects > blkif->media_num_sectors) { - DPRINTF("%s of [%llu,%llu] extends past end of device %s\n", + DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " + "extends past end of device %s\n", operation == BIO_READ ? 
"read" : "write", - req->sector_number, - req->sector_number + nr_sects, blkif->dev_name); - goto fail_flush; + ring_req->sector_number, + ring_req->sector_number + nr_sects, xbb->dev_name); + goto fail_unmap_req; } - for (i = 0; i < nseg; i++) { - struct bio *bio; - - if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) { - DPRINTF("Misaligned I/O request from domain %d", blkif->domid); - goto fail_put_bio; - } - - bio = biolist[nbio++] = g_new_bio(); - if (unlikely(bio == NULL)) - goto fail_put_bio; +do_dispatch: - bio->bio_cmd = operation; - bio->bio_offset = req->sector_number << blkif->sector_size_shift; - bio->bio_length = seg[i].nsec << 9; - bio->bio_bcount = bio->bio_length; - bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK)); - bio->bio_done = end_block_io_op; - bio->bio_caller2 = pending_req; - bio->bio_dev = blkif->cdev; + error = xbb->dispatch_io(xbb, + ring_req, + req, + nseg, + operation, + bio_flags); - req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift; -#if 0 - printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n", - (unsigned int)bio, req->operation, req->sector_number, seg[i].nsec, - blkif->cdev->si_iosize_max, seg[i].buf); -#endif + if (error != 0) { + if (operation == BIO_FLUSH) + goto fail_send_response; + else + goto fail_unmap_req; } - pending_req->pendcnt = nbio; - blkif_get(blkif); + return; - for (i = 0; i < nbio; i++) - (*blkif->csw->d_strategy)(biolist[i]); - return; +fail_unmap_req: + xbb_unmap_req(req); + /* FALLTHROUGH */ - fail_put_bio: - for (i = 0; i < (nbio-1); i++) - g_destroy_bio(biolist[i]); - fail_flush: - fast_flush_area(pending_req); - fail_response: - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); - free_req(pending_req); +fail_send_response: + xbb_send_response(xbb, req, BLKIF_RSP_ERROR); + xbb_release_req(xbb, req); + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/0, + req->ds_tag_type, + req->ds_trans_type, + /*now*/NULL, + /*then*/&req->ds_t0); } +/** + * Process incoming requests from the shared communication ring in response + * to a signal on the ring's event channel. + * + * \param context Callback argument registerd during task initialization - + * the xbb_softc for this instance. + * \param pending The number of taskqueue_enqueue events that have + * occurred since this handler was last run. + */ static void -blk_req_action(void *context, int pending) +xbb_run_queue(void *context, int pending) { - blkif_t *blkif; - - DPRINTF("\n"); - - while (!STAILQ_EMPTY(&req_sched_list)) { - blkif_back_ring_t *blk_ring; - RING_IDX rc, rp; - - blkif = remove_from_req_schedule_list(); - - blk_ring = &blkif->ring; - rc = blk_ring->req_cons; - rp = blk_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ - - while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) { - blkif_request_t *req; - pending_req_t *pending_req; - - pending_req = alloc_req(); - if (pending_req == NULL) - goto out_of_preqs; - - req = RING_GET_REQUEST(blk_ring, rc); - blk_ring->req_cons = ++rc; /* before make_response() */ - - switch (req->operation) { - case BLKIF_OP_READ: - blkif->st_rd_req++; - dispatch_rw_block_io(blkif, req, pending_req); - break; - case BLKIF_OP_WRITE: - blkif->st_wr_req++; - dispatch_rw_block_io(blkif, req, pending_req); - break; - default: - blkif->st_err_req++; - DPRINTF("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, - BLKIF_RSP_ERROR); - free_req(pending_req); - break; - } + struct xbb_softc *xbb; + blkif_back_rings_t *rings; + RING_IDX rp; + + + xbb = (struct xbb_softc *)context; + rings = &xbb->rings; + + /* + * Cache req_prod to avoid accessing a cache line shared + * with the frontend. + */ + rp = rings->common.sring->req_prod; + + /* Ensure we see queued requests up to 'rp'. */ + rmb(); + + /** + * Run so long as there is work to consume and the generation + * of a response will not overflow the ring. + * + * @note There's a 1 to 1 relationship between requests and responses, + * so an overflow should never occur. This test is to protect + * our domain from digesting bogus data. Shouldn't we log this? + */ + while (rings->common.req_cons != rp + && RING_REQUEST_CONS_OVERFLOW(&rings->common, + rings->common.req_cons) == 0) { + blkif_request_t ring_req_storage; + blkif_request_t *ring_req; + struct xbb_xen_req *req; + RING_IDX req_ring_idx; + + req = xbb_get_req(xbb); + if (req == NULL) { + /* + * Resource shortage has been recorded. + * We'll be scheduled to run once a request + * object frees up due to a completion. + */ + break; } - blkif_put(blkif); - } + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + ring_req = RING_GET_REQUEST(&xbb->rings.native, + rings->common.req_cons); + break; + case BLKIF_PROTOCOL_X86_32: + { + struct blkif_x86_32_request *ring_req32; + + ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32, + rings->common.req_cons); + blkif_get_x86_32_req(&ring_req_storage, ring_req32); + ring_req = &ring_req_storage; + break; + } + case BLKIF_PROTOCOL_X86_64: + { + struct blkif_x86_64_request *ring_req64; + + ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64, + rings->common.req_cons); + blkif_get_x86_64_req(&ring_req_storage, ring_req64); + ring_req = &ring_req_storage; + break; + } + default: + panic("Unexpected blkif protocol ABI."); + /* NOTREACHED */ + } - return; + /* + * Signify that we can overwrite this request with a + * response by incrementing our consumer index. The + * response won't be generated until after we've already + * consumed all necessary data out of the version of the + * request in the ring buffer (for native mode). We + * must update the consumer index before issuing back-end + * I/O so there is no possibility that it will complete + * and a response be generated before we make room in + * the queue for that response.
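+ * + * Concretely, a request spanning multiple ring blocks advances + * req_cons by BLKIF_SEGS_TO_BLOCKS(nr_segments) slots here, and + * xbb_send_response() later advances rsp_prod_pvt by the same count, + * so a response can never overwrite ring slots we have yet to + * consume.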
+ */ + req_ring_idx = xbb->rings.common.req_cons; + xbb->rings.common.req_cons += + BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); - out_of_preqs: - /* We ran out of pending req structs */ - /* Just requeue interface and wait to be rescheduled to run when one is freed */ - add_to_req_schedule_list_tail2(blkif); - blkif->st_oo_req++; + xbb_dispatch_io(xbb, ring_req, req, req_ring_idx); + } } -/* Handle interrupt from a frontend */ +/** + * Interrupt handler bound to the shared ring's event channel. + * + * \param arg Callback argument registered during event channel + * binding - the xbb_softc for this instance. + */ static void -blkback_intr(void *arg) +xbb_intr(void *arg) { - blkif_t *blkif = arg; - DPRINTF("%x\n", (unsigned int)blkif); - add_to_req_schedule_list_tail(blkif); + struct xbb_softc *xbb; + + /* Defer to kernel thread. */ + xbb = (struct xbb_softc *)arg; + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } -/* Map grant ref for ring */ +/*----------------------------- Backend Handlers -----------------------------*/ +/** + * Backend handler for character device access. + * + * \param xbb Per-instance xbb configuration structure. + * \param ring_req Front-end's I/O request as pulled from the shared + * communication ring. + * \param req Allocated internal request structure. + * \param nseg The number of valid segments for this request in + * xbb->xbb_sgs. + * \param operation BIO_* I/O operation code. + * \param bio_flags Additional bio_flag data to pass to any generated + * bios (e.g. BIO_ORDERED). + * + * \return 0 for success, errno codes for failure. + */ static int -map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring) +xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, int nseg, int operation, + int bio_flags) { - struct gnttab_map_grant_ref op; + struct xbb_dev_data *dev_data; + struct bio *bios[XBB_MAX_SEGMENTS_PER_REQUEST]; + off_t bio_offset; + struct bio *bio; + struct xbb_sg *xbb_sg; + u_int nbio; + u_int bio_idx; + u_int seg_idx; + int error; + + dev_data = &xbb->backend.dev; + bio_offset = (off_t)ring_req->sector_number + << xbb->sector_size_shift; + error = 0; + nbio = 0; + bio_idx = 0; + + if (operation == BIO_FLUSH) { + bio = g_new_bio(); + if (unlikely(bio == NULL)) { + DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); + error = ENOMEM; + return (error); + } + + bio->bio_cmd = BIO_FLUSH; + bio->bio_flags |= BIO_ORDERED; + bio->bio_dev = dev_data->cdev; + bio->bio_offset = 0; + bio->bio_data = 0; + bio->bio_done = xbb_bio_done; + bio->bio_caller1 = req; + bio->bio_pblkno = 0; - ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); - if (ring->va == 0) - return ENOMEM; + req->pendcnt = 1; - op.host_addr = ring->va; - op.flags = GNTMAP_host_map; - op.ref = ref; - op.dom = dom; - HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1); - if (op.status) { - WPRINTF("grant table op err=%d\n", op.status); - kmem_free(kernel_map, ring->va, PAGE_SIZE); - ring->va = 0; - return EACCES; + (*dev_data->csw->d_strategy)(bio); + + return (0); } - ring->handle = op.handle; - ring->bus_addr = op.dev_bus_addr; + for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs; + seg_idx < nseg; + seg_idx++, xbb_sg++) { - return 0; -} + /* + * KVA will not be contiguous, so any additional + * I/O will need to be represented in a new bio.
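+ * (For example, a segment whose first_sect is 3 starts 1536 bytes + * into its page while the preceding segment's data ended at a page + * boundary, so the two cannot be carried by one contiguous bio.)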
+ */ + if ((bio != NULL) + && (xbb_sg->first_sect != 0)) { + if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { + printf("%s: Discontiguous I/O request from " + "domain %d ends on non-sector " + "boundary\n", __func__, + xbb->otherend_id); + error = EINVAL; + goto fail_free_bios; + } + bio = NULL; + } -/* Unmap grant ref for ring */ -static void -unmap_ring(struct ring_ref *ring) -{ - struct gnttab_unmap_grant_ref op; + if (bio == NULL) { + /* + * Make sure that the start of this bio is aligned + * to a device sector. + */ + if ((bio_offset & (xbb->sector_size - 1)) != 0) { + printf("%s: Misaligned I/O request from " + "domain %d\n", __func__, + xbb->otherend_id); + error = EINVAL; + goto fail_free_bios; + } - op.host_addr = ring->va; - op.dev_bus_addr = ring->bus_addr; - op.handle = ring->handle; - HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1); - if (op.status) - WPRINTF("grant table op err=%d\n", op.status); + bio = bios[nbio++] = g_new_bio(); + if (unlikely(bio == NULL)) { + error = ENOMEM; + goto fail_free_bios; + } + bio->bio_cmd = operation; + bio->bio_flags |= bio_flags; + bio->bio_dev = dev_data->cdev; + bio->bio_offset = bio_offset; + bio->bio_data = xbb_req_ioaddr(req, seg_idx, + xbb_sg->first_sect); + bio->bio_done = xbb_bio_done; + bio->bio_caller1 = req; + bio->bio_pblkno = bio_offset + >> xbb->sector_size_shift; + } - kmem_free(kernel_map, ring->va, PAGE_SIZE); - ring->va = 0; -} + bio->bio_length += xbb_sg->nsect << 9; + bio->bio_bcount = bio->bio_length; + bio_offset += xbb_sg->nsect << 9; -static int -connect_ring(blkif_t *blkif) -{ - struct xenbus_device *xdev = blkif->xdev; - blkif_sring_t *ring; - unsigned long ring_ref; - evtchn_port_t evtchn; - evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; - int err; - - if (blkif->ring_connected) - return 0; - - // Grab FE data and map his memory - err = xenbus_gather(NULL, xdev->otherend, - "ring-ref", "%lu", &ring_ref, - "event-channel", "%u", &evtchn, NULL); - if (err) { - xenbus_dev_fatal(xdev, err, - "reading %s/ring-ref and event-channel", - xdev->otherend); - return err; - } - - err = map_ring(ring_ref, blkif->domid, &blkif->rr); - if (err) { - xenbus_dev_fatal(xdev, err, "mapping ring"); - return err; - } - ring = (blkif_sring_t *)blkif->rr.va; - BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE); - - op.u.bind_interdomain.remote_dom = blkif->domid; - op.u.bind_interdomain.remote_port = evtchn; - err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_ring(&blkif->rr); - xenbus_dev_fatal(xdev, err, "binding event channel"); - return err; - } - blkif->evtchn = op.u.bind_interdomain.local_port; - - /* bind evtchn to irq handler */ - blkif->irq = - bind_evtchn_to_irqhandler(blkif->evtchn, "blkback", - blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie); - - blkif->ring_connected = 1; - - DPRINTF("%x rings connected! evtchn=%d irq=%d\n", - (unsigned int)blkif, blkif->evtchn, blkif->irq); + if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { - return 0; -} + if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { + printf("%s: Discontiguous I/O request from " + "domain %d ends on non-sector " + "boundary\n", __func__, + xbb->otherend_id); + error = EINVAL; + goto fail_free_bios; + } + /* + * KVA will not be contiguous, so any additional + * I/O will need to be represented in a new bio. 
+ */ + bio = NULL; + } + } -static void -disconnect_ring(blkif_t *blkif) -{ - DPRINTF("\n"); + req->pendcnt = nbio; + + for (bio_idx = 0; bio_idx < nbio; bio_idx++) + { +#ifdef XBB_USE_BOUNCE_BUFFERS + vm_offset_t kva_offset; - if (blkif->ring_connected) { - unbind_from_irqhandler(blkif->irq, blkif->irq_cookie); - blkif->irq = 0; - unmap_ring(&blkif->rr); - blkif->ring_connected = 0; + kva_offset = (vm_offset_t)bios[bio_idx]->bio_data + - (vm_offset_t)req->bounce; + if (operation == BIO_WRITE) { + memcpy(bios[bio_idx]->bio_data, + (uint8_t *)req->kva + kva_offset, + bios[bio_idx]->bio_bcount); + } +#endif + (*dev_data->csw->d_strategy)(bios[bio_idx]); } + + return (error); + +fail_free_bios: + for (bio_idx = 0; bio_idx < nbio; bio_idx++) + if (bios[bio_idx] != NULL) + g_destroy_bio(bios[bio_idx]); + + return (error); } -static void -connect(blkif_t *blkif) +/** + * Backend handler for file access. + * + * \param xbb Per-instance xbb configuration structure. + * \param ring_req Front-end's I/O request as pulled from the shared + * communication ring. + * \param req Allocated internal request structure. + * \param nseg The number of valid segments for this request in + * xbb->xbb_sgs. + * \param operation BIO_* I/O operation code. + * \param flags Additional bio_flag data to pass to any generated bios + * (e.g. BIO_ORDERED). + * + * \return 0 for success, errno codes for failure. + */ +static int +xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, int nseg, int operation, + int flags) { - struct xenbus_transaction *xbt; - struct xenbus_device *xdev = blkif->xdev; - int err; + struct xbb_file_data *file_data; + u_int seg_idx; + struct uio xuio; + struct xbb_sg *xbb_sg; + struct iovec *xiovec; +#ifdef XBB_USE_BOUNCE_BUFFERS + void **p_vaddr; + int saved_uio_iovcnt; +#endif /* XBB_USE_BOUNCE_BUFFERS */ + int vfs_is_locked; + int error; + + file_data = &xbb->backend.file; + error = 0; + bzero(&xuio, sizeof(xuio)); + + req->pendcnt = 0; + + switch (operation) { + case BIO_READ: + xuio.uio_rw = UIO_READ; + break; + case BIO_WRITE: + xuio.uio_rw = UIO_WRITE; + break; + case BIO_FLUSH: { + struct mount *mountpoint; - if (!blkif->ring_connected || - blkif->vn == NULL || - blkif->state == XenbusStateConnected) - return; + vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); - DPRINTF("%s\n", xdev->otherend); + (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); - /* Supply the information about the device the frontend needs */ -again: - xbt = xenbus_transaction_start(); - if (IS_ERR(xbt)) { - xenbus_dev_fatal(xdev, PTR_ERR(xbt), - "Error writing configuration for backend " - "(start transaction)"); - return; - } + vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); + VOP_UNLOCK(xbb->vn, 0); - err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u", - blkif->media_num_sectors); - if (err) { - xenbus_dev_fatal(xdev, err, "writing %s/sectors", - xdev->nodename); - goto abort; - } + vn_finished_write(mountpoint); + + VFS_UNLOCK_GIANT(vfs_is_locked); - err = xenbus_printf(xbt, xdev->nodename, "info", "%u", - blkif->read_only ?
VDISK_READONLY : 0); - if (err) { - xenbus_dev_fatal(xdev, err, "writing %s/info", - xdev->nodename); - goto abort; + goto bailout_send_response; + /* NOTREACHED */ } - err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u", - blkif->sector_size); - if (err) { - xenbus_dev_fatal(xdev, err, "writing %s/sector-size", - xdev->nodename); - goto abort; + default: + panic("invalid operation %d", operation); + /* NOTREACHED */ } + xuio.uio_offset = (vm_offset_t)ring_req->sector_number + << xbb->sector_size_shift; - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) - goto again; - if (err) - xenbus_dev_fatal(xdev, err, "ending transaction"); + xuio.uio_segflg = UIO_SYSSPACE; + xuio.uio_iov = file_data->xiovecs; + xuio.uio_iovcnt = 0; - err = xenbus_switch_state(xdev, NULL, XenbusStateConnected); - if (err) - xenbus_dev_fatal(xdev, err, "switching to Connected state", - xdev->nodename); + for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs; + seg_idx < nseg; seg_idx++, xbb_sg++) { - blkif->state = XenbusStateConnected; + /* + * If the first sector is not 0, the KVA will not be + * contiguous and we'll need to go on to another segment. + */ + if (xbb_sg->first_sect != 0) + xiovec = NULL; + + if (xiovec == NULL) { + xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; + xiovec->iov_base = xbb_req_ioaddr(req, seg_idx, + xbb_sg->first_sect); +#ifdef XBB_USE_BOUNCE_BUFFERS + /* + * Store the address of the incoming buffer at this + * particular offset as well, so we can do the copy + * later without having to do more work to + * recalculate this address. + */ + p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; + *p_vaddr = xbb_req_vaddr(req, seg_idx, + xbb_sg->first_sect); +#endif /* XBB_USE_BOUNCE_BUFFERS */ + xiovec->iov_len = 0; + xuio.uio_iovcnt++; + } - return; + xiovec->iov_len += xbb_sg->nsect << 9; - abort: - xenbus_transaction_end(xbt, 1); -} + xuio.uio_resid += xbb_sg->nsect << 9; -static int -blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id) -{ - int err; - char *p, *mode = NULL, *type = NULL, *params = NULL; - long handle; + /* + * If the last sector is not the full page size count, + * the next segment will not be contiguous in KVA and we + * need a new iovec. + */ + if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) + xiovec = NULL; + } - DPRINTF("node=%s\n", xdev->nodename); + xuio.uio_td = curthread; - p = strrchr(xdev->otherend, '/') + 1; - handle = strtoul(p, NULL, 0); +#ifdef XBB_USE_BOUNCE_BUFFERS + saved_uio_iovcnt = xuio.uio_iovcnt; - mode = xenbus_read(NULL, xdev->nodename, "mode", NULL); - if (IS_ERR(mode)) { - xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode"); - err = PTR_ERR(mode); - goto error; - } - - type = xenbus_read(NULL, xdev->nodename, "type", NULL); - if (IS_ERR(type)) { - xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type"); - err = PTR_ERR(type); - goto error; - } - - params = xenbus_read(NULL, xdev->nodename, "params", NULL); - if (IS_ERR(type)) { - xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params"); - err = PTR_ERR(params); - goto error; - } - - err = blkif_create(xdev, handle, mode, type, params); - if (err) { - xenbus_dev_fatal(xdev, err, "creating blkif"); - goto error; - } + if (operation == BIO_WRITE) { + /* Copy the write data to the local buffer. 
*/ + for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, + xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; + seg_idx++, xiovec++, p_vaddr++) { - err = vbd_add_dev(xdev); - if (err) { - blkif_put((blkif_t *)xdev->data); - xenbus_dev_fatal(xdev, err, "adding vbd device"); + memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); + } + } else { + /* + * We only need to save off the iovecs in the case of a + * read, because the copy for the read happens after the + * VOP_READ(). (The uio will get modified in that call + * sequence.) + */ + memcpy(file_data->saved_xiovecs, xuio.uio_iov, + xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); } +#endif /* XBB_USE_BOUNCE_BUFFERS */ - return err; + vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); + switch (operation) { + case BIO_READ: - error: - if (mode) - free(mode, M_DEVBUF); - if (type) - free(type, M_DEVBUF); - if (params) - free(params, M_DEVBUF); - return err; -} + vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); -static int -blkback_remove(struct xenbus_device *xdev) -{ - blkif_t *blkif = xdev->data; - device_t ndev; + /* + * UFS pays attention to IO_DIRECT for reads. If the + * DIRECTIO option is configured into the kernel, it calls + * ffs_rawread(). But that only works for single-segment + * uios with user space addresses. In our case, with a + * kernel uio, it still reads into the buffer cache, but it + * will just try to release the buffer from the cache later + * on in ffs_read(). + * + * ZFS does not pay attention to IO_DIRECT for reads. + * + * UFS does not pay attention to IO_SYNC for reads. + * + * ZFS pays attention to IO_SYNC (which translates into the + * Solaris define FRSYNC for zfs_read()) for reads. It + * attempts to sync the file before reading. + * + * So, to attempt to provide some barrier semantics in the + * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. + */ + error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? + (IO_DIRECT|IO_SYNC) : 0, file_data->cred); - DPRINTF("node=%s\n", xdev->nodename); + VOP_UNLOCK(xbb->vn, 0); + break; + case BIO_WRITE: { + struct mount *mountpoint; - blkif->state = XenbusStateClosing; + (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); - if ((ndev = blkif->ndev)) { - blkif->ndev = NULL; - mtx_lock(&Giant); - device_detach(ndev); - mtx_unlock(&Giant); - } + vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); - xdev->data = NULL; - blkif->xdev = NULL; - blkif_put(blkif); + /* + * UFS pays attention to IO_DIRECT for writes. The write + * is done asynchronously. (Normally the write would just + * get put into cache.) + * + * UFS pays attention to IO_SYNC for writes. It will + * attempt to write the buffer out synchronously if that + * flag is set. + * + * ZFS does not pay attention to IO_DIRECT for writes. + * + * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) + * for writes. It will flush the transaction from the + * cache before returning. + * + * So if we've got the BIO_ORDERED flag set, we want + * IO_SYNC in either the UFS or ZFS case. + */ + error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+ IO_SYNC : 0, file_data->cred); + VOP_UNLOCK(xbb->vn, 0); - return 0; -} + vn_finished_write(mountpoint); -static int -blkback_resume(struct xenbus_device *xdev) -{ - DPRINTF("node=%s\n", xdev->nodename); - return 0; + break; + } + default: + panic("invalid operation %d", operation); + /* NOTREACHED */ + } + VFS_UNLOCK_GIANT(vfs_is_locked); + +#ifdef XBB_USE_BOUNCE_BUFFERS + /* We only need to copy here for read operations */ + if (operation == BIO_READ) { + + for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, + xiovec = file_data->saved_xiovecs; + seg_idx < saved_uio_iovcnt; seg_idx++, + xiovec++, p_vaddr++) { + + /* + * Note that we have to use the copy of the + * io vector we made above. uiomove() modifies + * the uio and its referenced vector as uiomove + * performs the copy, so we can't rely on any + * state from the original uio. + */ + memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); + } + } +#endif /* XBB_USE_BOUNCE_BUFFERS */ + +bailout_send_response: + + /* + * All I/O is already done, send the response. A lock is not + * necessary here because we're single threaded, and therefore the + * only context accessing this request right now. If that changes, + * we may need some locking here. + */ + xbb_unmap_req(req); + xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY : + BLKIF_RSP_ERROR); + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/error == 0 ? req->nr_512b_sectors << 9 + : 0, + req->ds_tag_type, + req->ds_trans_type, + /*now*/NULL, + /*then*/&req->ds_t0); + xbb_release_req(xbb, req); + + return (0); } +/*--------------------------- Backend Configuration --------------------------*/ +/** + * Close and cleanup any backend device/file specific state for this + * block back instance. + * + * \param xbb Per-instance xbb configuration structure. 
+ */ static void -frontend_changed(struct xenbus_device *xdev, - XenbusState frontend_state) +xbb_close_backend(struct xbb_softc *xbb) { - blkif_t *blkif = xdev->data; + DROP_GIANT(); + DPRINTF("closing dev=%s\n", xbb->dev_name); + if (xbb->vn) { + int flags = FREAD; + int vfs_is_locked = 0; - DPRINTF("state=%d\n", frontend_state); + if ((xbb->flags & XBBF_READ_ONLY) == 0) + flags |= FWRITE; - blkif->frontend_state = frontend_state; + switch (xbb->device_type) { + case XBB_TYPE_DISK: + if (xbb->backend.dev.csw) { + dev_relthread(xbb->backend.dev.cdev, + xbb->backend.dev.dev_ref); + xbb->backend.dev.csw = NULL; + xbb->backend.dev.cdev = NULL; + } + break; + case XBB_TYPE_FILE: + vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); + break; + case XBB_TYPE_NONE: + default: + panic("Unexpected backend type."); + break; + } - switch (frontend_state) { - case XenbusStateInitialising: - break; - case XenbusStateInitialised: - case XenbusStateConnected: - connect_ring(blkif); - connect(blkif); - break; - case XenbusStateClosing: - xenbus_switch_state(xdev, NULL, XenbusStateClosing); - break; - case XenbusStateClosed: - xenbus_remove_device(xdev); - break; - case XenbusStateUnknown: - case XenbusStateInitWait: - xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend", - frontend_state); - break; + (void)vn_close(xbb->vn, flags, NOCRED, curthread); + xbb->vn = NULL; + + switch (xbb->device_type) { + case XBB_TYPE_DISK: + break; + case XBB_TYPE_FILE: + VFS_UNLOCK_GIANT(vfs_is_locked); + if (xbb->backend.file.cred != NULL) { + crfree(xbb->backend.file.cred); + xbb->backend.file.cred = NULL; + } + break; + case XBB_TYPE_NONE: + default: + panic("Unexpected backend type."); + break; + } } + PICKUP_GIANT(); } -/* ** Driver registration ** */ - -static struct xenbus_device_id blkback_ids[] = { - { "vbd" }, - { "" } -}; +/** + * Open a character device to be used for backend I/O. + * + * \param xbb Per-instance xbb configuration structure. + * + * \return 0 for success, errno codes for failure. 
+ */ +static int +xbb_open_dev(struct xbb_softc *xbb) +{ + struct vattr vattr; + struct cdev *dev; + struct cdevsw *devsw; + int error; + + xbb->device_type = XBB_TYPE_DISK; + xbb->dispatch_io = xbb_dispatch_dev; + xbb->backend.dev.cdev = xbb->vn->v_rdev; + xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, + &xbb->backend.dev.dev_ref); + if (xbb->backend.dev.csw == NULL) + panic("Unable to retrieve device switch"); + + error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); + if (error) { + xenbus_dev_fatal(xbb->dev, error, "error getting " + "vnode attributes for device %s", + xbb->dev_name); + return (error); + } -static struct xenbus_driver blkback = { - .name = "blkback", - .ids = blkback_ids, - .probe = blkback_probe, - .remove = blkback_remove, - .resume = blkback_resume, - .otherend_changed = frontend_changed, -}; -static void -blkback_init(void *unused) -{ - int i; - - TASK_INIT(&blk_req_task, 0, blk_req_action, NULL); - mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF); - - mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF); - - mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - pending_reqs = malloc(sizeof(pending_reqs[0]) * - blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT); - pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) * - mmap_pages, M_DEVBUF, M_NOWAIT); - pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) * - mmap_pages, M_DEVBUF, M_NOWAIT); - mmap_vstart = alloc_empty_page_range(mmap_pages); - if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) { - if (pending_reqs) - free(pending_reqs, M_DEVBUF); - if (pending_grant_handles) - free(pending_grant_handles, M_DEVBUF); - if (pending_vaddrs) - free(pending_vaddrs, M_DEVBUF); - WPRINTF("out of memory\n"); - return; + dev = xbb->vn->v_rdev; + devsw = dev->si_devsw; + if (!devsw->d_ioctl) { + xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " + "device %s!", xbb->dev_name); + return (ENODEV); } - for (i = 0; i < mmap_pages; i++) { - pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); - pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, + (caddr_t)&xbb->sector_size, FREAD, + curthread); + if (error) { + xenbus_dev_fatal(xbb->dev, error, + "error calling ioctl DIOCGSECTORSIZE " + "for device %s", xbb->dev_name); + return (error); } - for (i = 0; i < blkif_reqs; i++) { - STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list); + error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, + (caddr_t)&xbb->media_size, FREAD, + curthread); + if (error) { + xenbus_dev_fatal(xbb->dev, error, + "error calling ioctl DIOCGMEDIASIZE " + "for device %s", xbb->dev_name); + return (error); } - DPRINTF("registering %s\n", blkback.name); - xenbus_register_backend(&blkback); + return (0); } -SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL) - -static void -close_device(blkif_t *blkif) +/** + * Open a file to be used for backend I/O. + * + * \param xbb Per-instance xbb configuration structure. + * + * \return 0 for success, errno codes for failure. 
+ */ +static int +xbb_open_file(struct xbb_softc *xbb) { - DPRINTF("closing dev=%s\n", blkif->dev_name); - if (blkif->vn) { - int flags = FREAD; - - if (!blkif->read_only) - flags |= FWRITE; + struct xbb_file_data *file_data; + struct vattr vattr; + int error; + + file_data = &xbb->backend.file; + xbb->device_type = XBB_TYPE_FILE; + xbb->dispatch_io = xbb_dispatch_file; + error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); + if (error != 0) { + xenbus_dev_fatal(xbb->dev, error, + "error calling VOP_GETATTR() " + "for file %s", xbb->dev_name); + return (error); + } - if (blkif->csw) { - dev_relthread(blkif->cdev); - blkif->csw = NULL; + /* + * Verify that we have the ability to upgrade to exclusive + * access on this file so we can trap errors at open instead + * of reporting them during first access. + */ + if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { + vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); + if (xbb->vn->v_iflag & VI_DOOMED) { + error = EBADF; + xenbus_dev_fatal(xbb->dev, error, + "error locking file %s", + xbb->dev_name); + + return (error); } + } - (void)vn_close(blkif->vn, flags, NOCRED, curthread); - blkif->vn = NULL; + file_data->cred = crhold(curthread->td_ucred); + xbb->media_size = vattr.va_size; + + /* + * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. + * With ZFS, it is 131072 bytes. Block sizes that large don't work + * with disklabel and UFS on FreeBSD at least. Large block sizes + * may not work with other OSes either. So just export a sector + * size of 512 bytes, which should work with any OS or + * application. Since our backing is a file, any block size will + * work fine for the backing store. + */ +#if 0 + xbb->sector_size = vattr.va_blocksize; +#endif + xbb->sector_size = 512; + + /* + * Sanity check. The media size has to be at least one + * sector long. + */ + if (xbb->media_size < xbb->sector_size) { + error = EINVAL; + xenbus_dev_fatal(xbb->dev, error, + "file %s size %ju < block size %u", + xbb->dev_name, + (uintmax_t)xbb->media_size, + xbb->sector_size); } + return (error); } +/** + * Open the backend provider for this connection. + * + * \param xbb Per-instance xbb configuration structure. + * + * \return 0 for success, errno codes for failure. + */ static int -open_device(blkif_t *blkif) +xbb_open_backend(struct xbb_softc *xbb) { struct nameidata nd; - struct vattr vattr; - struct cdev *dev; - struct cdevsw *devsw; - int flags = FREAD, err = 0; + int flags; + int error; + int vfs_is_locked; - DPRINTF("opening dev=%s\n", blkif->dev_name); + flags = FREAD; + error = 0; - if (!blkif->read_only) + DPRINTF("opening dev=%s\n", xbb->dev_name); + + if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; if (!curthread->td_proc->p_fd->fd_cdir) { @@ -1066,284 +1930,1045 @@ open_device(blkif_t *blkif) } again: - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread); - err = vn_open(&nd, &flags, 0, -1); - if (err) { - if (blkif->dev_name[0] != '/') { + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); + error = vn_open(&nd, &flags, 0, NULL); + if (error) { + /* + * This is the only reasonable guess we can make as far as + * the path goes if the user doesn't give us a fully qualified + * path. If they want to specify a file, they need to specify + * the full path.
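+ * + * For example, a front-end that publishes a params node of "ada0s1" + * is retried below as "/dev/ada0s1".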
+ */ + if (xbb->dev_name[0] != '/') { char *dev_path = "/dev/"; char *dev_name; /* Try adding device path at beginning of name */ - dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT); + dev_name = malloc(strlen(xbb->dev_name) + + strlen(dev_path) + 1, + M_XENBLOCKBACK, M_NOWAIT); if (dev_name) { - sprintf(dev_name, "%s%s", dev_path, blkif->dev_name); - free(blkif->dev_name, M_DEVBUF); - blkif->dev_name = dev_name; + sprintf(dev_name, "%s%s", dev_path, + xbb->dev_name); + free(xbb->dev_name, M_XENBLOCKBACK); + xbb->dev_name = dev_name; goto again; } } - xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name); - return err; + xenbus_dev_fatal(xbb->dev, error, "error opening device %s", + xbb->dev_name); + return (error); } + + vfs_is_locked = NDHASGIANT(&nd); + NDFREE(&nd, NDF_ONLY_PNBUF); - blkif->vn = nd.ni_vp; + xbb->vn = nd.ni_vp; + + /* We only support disks and files. */ + if (vn_isdisk(xbb->vn, &error)) { + error = xbb_open_dev(xbb); + } else if (xbb->vn->v_type == VREG) { + error = xbb_open_file(xbb); + } else { + error = EINVAL; + xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " + "or file", xbb->dev_name); + } + VOP_UNLOCK(xbb->vn, 0); + VFS_UNLOCK_GIANT(vfs_is_locked); - /* We only support disks for now */ - if (!vn_isdisk(blkif->vn, &err)) { - xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name); - VOP_UNLOCK(blkif->vn, 0, curthread); - goto error; + if (error != 0) { + xbb_close_backend(xbb); + return (error); } - blkif->cdev = blkif->vn->v_rdev; - blkif->csw = dev_refthread(blkif->cdev); - PANIC_IF(blkif->csw == NULL); + xbb->sector_size_shift = fls(xbb->sector_size) - 1; + xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; + + DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", + (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", + xbb->dev_name, xbb->sector_size, xbb->media_size); + + return (0); +} - err = VOP_GETATTR(blkif->vn, &vattr, NOCRED); - if (err) { - xenbus_dev_fatal(blkif->xdev, err, - "error getting vnode attributes for device %s", blkif->dev_name); - VOP_UNLOCK(blkif->vn, 0, curthread); - goto error; +/*------------------------ Inter-Domain Communication ------------------------*/ +/** + * Cleanup all inter-domain communication mechanisms. + * + * \param xbb Per-instance xbb configuration structure. 
+ */ +static void +xbb_disconnect(struct xbb_softc *xbb) +{ + struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref *op; + u_int ring_idx; + int error; + + DPRINTF("\n"); + + if ((xbb->flags & XBBF_RING_CONNECTED) == 0) + return; + + if (xbb->irq != 0) { + unbind_from_irqhandler(xbb->irq); + xbb->irq = 0; } - VOP_UNLOCK(blkif->vn, 0, curthread); + for (ring_idx = 0, op = ops; + ring_idx < xbb->ring_config.ring_pages; + ring_idx++, op++) { - dev = blkif->vn->v_rdev; - devsw = dev->si_devsw; - if (!devsw->d_ioctl) { - err = ENODEV; - xenbus_dev_fatal(blkif->xdev, err, - "no d_ioctl for device %s!", blkif->dev_name); - goto error; + op->host_addr = xbb->ring_config.gnt_addr + + (ring_idx * PAGE_SIZE); + op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; + op->handle = xbb->ring_config.handle[ring_idx]; } - err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread); - if (err) { - xenbus_dev_fatal(blkif->xdev, err, - "error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name); - goto error; + error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, + xbb->ring_config.ring_pages); + if (error != 0) + panic("Grant table op failed (%d)", error); + + xbb->flags &= ~XBBF_RING_CONNECTED; +} + +/** + * Map shared memory ring into domain local address space, initialize + * ring control structures, and bind an interrupt to the event channel + * used to notify us of ring changes. + * + * \param xbb Per-instance xbb configuration structure. + */ +static int +xbb_connect_ring(struct xbb_softc *xbb) +{ + struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; + struct gnttab_map_grant_ref *gnt; + u_int ring_idx; + int error; + + if ((xbb->flags & XBBF_RING_CONNECTED) != 0) + return (0); + + /* + * Kva for our ring is at the tail of the region of kva allocated + * by xbb_alloc_communication_mem(). + */ + xbb->ring_config.va = xbb->kva + + (xbb->kva_size + - (xbb->ring_config.ring_pages * PAGE_SIZE)); + xbb->ring_config.gnt_addr = xbb->gnt_base_addr + + (xbb->kva_size + - (xbb->ring_config.ring_pages * PAGE_SIZE)); + + for (ring_idx = 0, gnt = gnts; + ring_idx < xbb->ring_config.ring_pages; + ring_idx++, gnt++) { + + gnt->host_addr = xbb->ring_config.gnt_addr + + (ring_idx * PAGE_SIZE); + gnt->flags = GNTMAP_host_map; + gnt->ref = xbb->ring_config.ring_ref[ring_idx]; + gnt->dom = xbb->otherend_id; } - blkif->sector_size_shift = fls(blkif->sector_size) - 1; - err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread); - if (err) { - xenbus_dev_fatal(blkif->xdev, err, - "error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name); - goto error; + error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, + xbb->ring_config.ring_pages); + if (error) + panic("blkback: Ring page grant table op failed (%d)", error); + + for (ring_idx = 0, gnt = gnts; + ring_idx < xbb->ring_config.ring_pages; + ring_idx++, gnt++) { + if (gnt->status != 0) { + xbb->ring_config.va = 0; + xenbus_dev_fatal(xbb->dev, EACCES, + "Ring shared page mapping failed. " + "Status %d.", gnt->status); + return (EACCES); + } + xbb->ring_config.handle[ring_idx] = gnt->handle; + xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; } - blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift; - blkif->major = major(vattr.va_rdev); - blkif->minor = minor(vattr.va_rdev); + /* Initialize the ring based on ABI. 
*/ + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + { + blkif_sring_t *sring; + sring = (blkif_sring_t *)xbb->ring_config.va; + BACK_RING_INIT(&xbb->rings.native, sring, + xbb->ring_config.ring_pages * PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + blkif_x86_32_sring_t *sring_x86_32; + sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; + BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, + xbb->ring_config.ring_pages * PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + blkif_x86_64_sring_t *sring_x86_64; + sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; + BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, + xbb->ring_config.ring_pages * PAGE_SIZE); + break; + } + default: + panic("Unexpected blkif protocol ABI."); + } - DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n", - blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size); + xbb->flags |= XBBF_RING_CONNECTED; + + error = + bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id, + xbb->ring_config.evtchn, + device_get_nameunit(xbb->dev), + xbb_intr, /*arg*/xbb, + INTR_TYPE_BIO | INTR_MPSAFE, + &xbb->irq); + if (error) { + xbb_disconnect(xbb); + xenbus_dev_fatal(xbb->dev, error, "binding event channel"); + return (error); + } - return 0; + DPRINTF("rings connected!\n"); - error: - close_device(blkif); - return err; + return 0; } +/** + * Size KVA and pseudo-physical address allocations based on negotiated + * values for the size and number of I/O requests, and the size of our + * communication ring. + * + * \param xbb Per-instance xbb configuration structure. + * + * These address spaces are used to dynamically map pages in the + * front-end's domain into our own. + */ static int -vbd_add_dev(struct xenbus_device *xdev) +xbb_alloc_communication_mem(struct xbb_softc *xbb) { - blkif_t *blkif = xdev->data; - device_t nexus, ndev; - devclass_t dc; - int err = 0; + xbb->kva_size = (xbb->ring_config.ring_pages + + (xbb->max_requests * xbb->max_request_segments)) + * PAGE_SIZE; +#ifndef XENHVM + xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size); + if (xbb->kva == 0) + return (ENOMEM); + xbb->gnt_base_addr = xbb->kva; +#else /* XENHVM */ + /* + * Reserve a range of pseudo physical memory that we can map + * into kva. These pages will only be backed by machine + * pages ("real memory") during the lifetime of front-end requests + * via grant table operations. + */ + xbb->pseudo_phys_res_id = 0; + xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, + &xbb->pseudo_phys_res_id, + 0, ~0, xbb->kva_size, + RF_ACTIVE); + if (xbb->pseudo_phys_res == NULL) { + xbb->kva = 0; + return (ENOMEM); + } + xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); + xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); +#endif /* XENHVM */ + return (0); +} - mtx_lock(&Giant); +/** + * Free dynamically allocated KVA or pseudo-physical address allocations. + * + * \param xbb Per-instance xbb configuration structure. 
+ */
+static void
+xbb_free_communication_mem(struct xbb_softc *xbb)
+{
+ if (xbb->kva != 0) {
+#ifndef XENHVM
+ kmem_free(kernel_map, xbb->kva, xbb->kva_size);
+#else
+ if (xbb->pseudo_phys_res != NULL) {
+ bus_release_resource(xbb->dev, SYS_RES_MEMORY,
+ xbb->pseudo_phys_res_id,
+ xbb->pseudo_phys_res);
+ xbb->pseudo_phys_res = NULL;
+ }
+#endif
+ }
+ xbb->kva = 0;
+ xbb->gnt_base_addr = 0;
+}
 
- /* We will add a vbd device as a child of nexus0 (for now) */
- if (!(dc = devclass_find("nexus")) ||
- !(nexus = devclass_get_device(dc, 0))) {
- WPRINTF("could not find nexus0!\n");
- err = ENOENT;
- goto done;
+/**
+ * Collect front-end information from the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_collect_frontend_info(struct xbb_softc *xbb)
+{
+ char protocol_abi[64];
+ const char *otherend_path;
+ int error;
+ u_int ring_idx;
+
+ otherend_path = xenbus_get_otherend_path(xbb->dev);
+
+ /*
+ * Mandatory data (used in all versions of the protocol) first.
+ */
+ error = xs_gather(XST_NIL, otherend_path,
+ "ring-ref", "%" PRIu32,
+ &xbb->ring_config.ring_ref[0],
+ "event-channel", "%" PRIu32,
+ &xbb->ring_config.evtchn,
+ NULL);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to retrieve ring information from "
+ "frontend %s. Unable to connect.",
+ xenbus_get_otherend_path(xbb->dev));
+ return (error);
 }
+ /*
+ * These fields are initialized to legacy protocol defaults
+ * so we only need to fail if reading the updated value succeeds
+ * and the new value is outside of its allowed range.
+ *
+ * \note xs_gather() returns on the first encountered error, so
+ * we must use independent calls in order to guarantee
+ * we don't miss information in a sparsely populated front-end
+ * tree.
+ */
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "ring-pages", NULL, "%" PRIu32,
+ &xbb->ring_config.ring_pages);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-requests", NULL, "%" PRIu32,
+ &xbb->max_requests);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-segments", NULL, "%" PRIu32,
+ &xbb->max_request_segments);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-size", NULL, "%" PRIu32,
+ &xbb->max_request_size);
+
+ if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified ring-pages of %u "
+ "exceeds backend limit of %zu. "
+ "Unable to connect.",
+ xbb->ring_config.ring_pages,
+ XBB_MAX_RING_PAGES);
+ return (EINVAL);
+ } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified max_requests of %u "
+ "exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_requests,
+ XBB_MAX_REQUESTS);
+ return (EINVAL);
+ } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified max_request_segments "
+ "of %u exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_request_segments,
+ XBB_MAX_SEGMENTS_PER_REQUEST);
+ return (EINVAL);
+ } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified max_request_size "
+ "of %u exceeds backend limit of %u. 
" + "Unable to connect.", + xbb->max_request_size, + XBB_MAX_REQUEST_SIZE); + return (EINVAL); + } - /* Create a newbus device representing the vbd */ - ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle); - if (!ndev) { - WPRINTF("could not create newbus device vbd%d!\n", blkif->handle); - err = EFAULT; - goto done; + /* If using a multi-page ring, pull in the remaining references. */ + for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { + char ring_ref_name[]= "ring_refXX"; + + snprintf(ring_ref_name, sizeof(ring_ref_name), + "ring-ref%u", ring_idx); + error = xs_scanf(XST_NIL, otherend_path, + ring_ref_name, NULL, "%" PRIu32, + &xbb->ring_config.ring_ref[ring_idx]); + if (error != 0) { + xenbus_dev_fatal(xbb->dev, error, + "Failed to retriev grant reference " + "for page %u of shared ring. Unable " + "to connect.", ring_idx); + return (error); + } } - - blkif_get(blkif); - device_set_ivars(ndev, blkif); - blkif->ndev = ndev; - device_probe_and_attach(ndev); + error = xs_gather(XST_NIL, otherend_path, + "protocol", "%63s", protocol_abi, + NULL); + if (error != 0 + || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { + /* + * Assume native if the frontend has not + * published ABI data or it has published and + * matches our own ABI. + */ + xbb->abi = BLKIF_PROTOCOL_NATIVE; + } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { - done: + xbb->abi = BLKIF_PROTOCOL_X86_32; + } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { - mtx_unlock(&Giant); + xbb->abi = BLKIF_PROTOCOL_X86_64; + } else { - return err; + xenbus_dev_fatal(xbb->dev, EINVAL, + "Unknown protocol ABI (%s) published by " + "frontend. Unable to connect.", protocol_abi); + return (EINVAL); + } + return (0); } -enum { - VBD_SYSCTL_DOMID, - VBD_SYSCTL_ST_RD_REQ, - VBD_SYSCTL_ST_WR_REQ, - VBD_SYSCTL_ST_OO_REQ, - VBD_SYSCTL_ST_ERR_REQ, - VBD_SYSCTL_RING, -}; - -static char * -vbd_sysctl_ring_info(blkif_t *blkif, int cmd) +/** + * Allocate per-request data structures given request size and number + * information negotiated with the front-end. + * + * \param xbb Per-instance xbb configuration structure. + */ +static int +xbb_alloc_requests(struct xbb_softc *xbb) { - char *buf = malloc(256, M_DEVBUF, M_WAITOK); - if (buf) { - if (!blkif->ring_connected) - sprintf(buf, "ring not connected\n"); - else { - blkif_back_ring_t *ring = &blkif->ring; - sprintf(buf, "nr_ents=%x req_cons=%x" - " req_prod=%x req_event=%x" - " rsp_prod=%x rsp_event=%x", - ring->nr_ents, ring->req_cons, - ring->sring->req_prod, ring->sring->req_event, - ring->sring->rsp_prod, ring->sring->rsp_event); + struct xbb_xen_req *req; + struct xbb_xen_req *last_req; + uint8_t *req_kva; + u_long gnt_base; + + /* + * Allocate request book keeping datastructures. 
+ */ + xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), + M_XENBLOCKBACK, M_NOWAIT|M_ZERO); + if (xbb->requests == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request structures"); + return (ENOMEM); + } + + req_kva = (uint8_t *)xbb->kva; + gnt_base = xbb->gnt_base_addr; + req = xbb->requests; + last_req = &xbb->requests[xbb->max_requests - 1]; + while (req <= last_req) { + int seg; + + req->xbb = xbb; + req->kva = req_kva; + req->gnt_handles = malloc(xbb->max_request_segments + * sizeof(*req->gnt_handles), + M_XENBLOCKBACK, M_NOWAIT|M_ZERO); + if (req->gnt_handles == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request " + "grant references"); + return (ENOMEM); + } +#ifdef XBB_USE_BOUNCE_BUFFERS + req->bounce = malloc(xbb->max_request_size, + M_XENBLOCKBACK, M_NOWAIT); + if (req->bounce == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request " + "bounce buffers"); + return (ENOMEM); } +#endif /* XBB_USE_BOUNCE_BUFFERS */ + req->gnt_base = gnt_base; + req_kva += xbb->max_request_segments * PAGE_SIZE; + gnt_base += xbb->max_request_segments * PAGE_SIZE; + SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); + + for (seg = 0; seg < xbb->max_request_segments; seg++) + req->gnt_handles[seg] = GRANT_REF_INVALID; + + req++; } - return buf; + return (0); } +/** + * Supply information about the physical device to the frontend + * via XenBus. + * + * \param xbb Per-instance xbb configuration structure. + */ static int -vbd_sysctl_handler(SYSCTL_HANDLER_ARGS) +xbb_publish_backend_info(struct xbb_softc *xbb) { - device_t dev = (device_t)arg1; - blkif_t *blkif = (blkif_t *)device_get_ivars(dev); - const char *value; - char *buf = NULL; - int err; - - switch (arg2) { - case VBD_SYSCTL_DOMID: - return sysctl_handle_int(oidp, NULL, blkif->domid, req); - case VBD_SYSCTL_ST_RD_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req); - case VBD_SYSCTL_ST_WR_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req); - case VBD_SYSCTL_ST_OO_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req); - case VBD_SYSCTL_ST_ERR_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req); - case VBD_SYSCTL_RING: - value = buf = vbd_sysctl_ring_info(blkif, arg2); - break; - default: - return (EINVAL); + struct xs_transaction xst; + const char *our_path; + const char *leaf; + int error; + + our_path = xenbus_get_node(xbb->dev); + while (1) { + error = xs_transaction_start(&xst); + if (error != 0) { + xenbus_dev_fatal(xbb->dev, error, + "Error publishing backend info " + "(start transaction)"); + return (error); + } + + leaf = "sectors"; + error = xs_printf(xst, our_path, leaf, + "%"PRIu64, xbb->media_num_sectors); + if (error != 0) + break; + + /* XXX Support all VBD attributes here. */ + leaf = "info"; + error = xs_printf(xst, our_path, leaf, "%u", + xbb->flags & XBBF_READ_ONLY + ? 
VDISK_READONLY : 0);
+ if (error != 0)
+ break;
+
+ leaf = "sector-size";
+ error = xs_printf(xst, our_path, leaf, "%u",
+ xbb->sector_size);
+ if (error != 0)
+ break;
+
+ error = xs_transaction_end(xst, 0);
+ if (error == 0) {
+ return (0);
+ } else if (error != EAGAIN) {
+ xenbus_dev_fatal(xbb->dev, error, "ending transaction");
+ return (error);
+ }
 }
 
- err = SYSCTL_OUT(req, value, strlen(value));
- if (buf != NULL)
- free(buf, M_DEVBUF);
+ xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
+ our_path, leaf);
+ xs_transaction_end(xst, 1);
+ return (error);
+}
+
+/**
+ * Connect to our blkfront peer now that it has completed publishing
+ * its configuration into the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_connect(struct xbb_softc *xbb)
+{
+ int error;
+
+ if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
+ return;
+
+ if (xbb_collect_frontend_info(xbb) != 0)
+ return;
 
- return err;
+ /* Allocate resources whose size depends on front-end configuration. */
+ error = xbb_alloc_communication_mem(xbb);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to allocate communication memory");
+ return;
+ }
+
+ error = xbb_alloc_requests(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_alloc_requests(). */
+ return;
+ }
+
+ /*
+ * Connect communication channel.
+ */
+ error = xbb_connect_ring(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_connect_ring(). */
+ return;
+ }
+
+ if (xbb_publish_backend_info(xbb) != 0) {
+ /*
+ * If we can't publish our data, we cannot participate
+ * in this connection, and waiting for a front-end state
+ * change will not help the situation.
+ */
+ xbb_disconnect(xbb);
+ return;
+ }
+
+ /* Ready for I/O. */
+ xenbus_set_state(xbb->dev, XenbusStateConnected);
 }
 
-/* Newbus vbd device driver probe */
+/*-------------------------- Device Teardown Support -------------------------*/
+/**
+ * Perform device shutdown functions.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * Mark this instance as shutting down, wait for any active I/O on the
+ * backend device/file to drain, disconnect from the front-end, and notify
+ * any waiters (e.g. a thread invoking our detach method) that detach can
+ * now proceed.
+ */
 static int
-vbd_probe(device_t dev)
+xbb_shutdown(struct xbb_softc *xbb)
 {
- DPRINTF("vbd%d\n", device_get_unit(dev));
- return 0;
+ static int in_shutdown;
+
+ DPRINTF("\n");
+
+ /*
+ * Due to the need to drop our mutex during some
+ * xenbus operations, it is possible for two threads
+ * to attempt to close out shutdown processing at
+ * the same time. Tell the caller that hits this
+ * race to try back later.
+ */
+ if (in_shutdown != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Indicate shutdown is in progress. */
+ xbb->flags |= XBBF_SHUTDOWN;
+
+ /* Wait for requests to complete. */
+ if (xbb->active_request_count != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Disconnect from the front-end. */
+ xbb_disconnect(xbb);
+
+ in_shutdown = 1;
+ mtx_unlock(&xbb->lock);
+ xenbus_set_state(xbb->dev, XenbusStateClosed);
+ mtx_lock(&xbb->lock);
+ in_shutdown = 0;
+
+ /* Indicate to xbb_detach() that it is safe to proceed. */
+ wakeup(xbb);
+
+ return (0);
+}
+
+/**
+ * Report an attach time error to the console and Xen, and clean up
+ * this instance by forcing immediate detach processing.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param err Errno describing the error.
+ * \param fmt Printf style format and arguments
+ */
+static void
+xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
+{
+ va_list ap;
+ va_list ap_hotplug;
+
+ va_start(ap, fmt);
+ va_copy(ap_hotplug, ap);
+ xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-error", fmt, ap_hotplug);
+ va_end(ap_hotplug);
+ xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "error");
+
+ xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
+ va_end(ap);
+
+ xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "online", "0");
+ xbb_detach(xbb->dev);
 }
 
-/* Newbus vbd device driver attach */
+/*---------------------------- NewBus Entrypoints ----------------------------*/
+/**
+ * Inspect a XenBus device and claim it if it is of the appropriate type.
+ *
+ * \param dev NewBus device object representing a candidate XenBus device.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
 static int
-vbd_attach(device_t dev)
+xbb_probe(device_t dev)
 {
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
-
- DPRINTF("%s\n", blkif->dev_name);
-
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I",
- "domid of frontend");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I",
- "number of read reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I",
- "number of write reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I",
- "number of deferred reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I",
- "number of reqs that returned error");
-#if XEN_BLKBACK_DEBUG
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "ring", CTLFLAG_RD,
- dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A",
- "req ring info");
-#endif
+
+ if (!strcmp(xenbus_get_type(dev), "vbd")) {
+ device_set_desc(dev, "Backend Virtual Block Device");
+ device_quiet(dev);
+ return (0);
+ }
+
+ return (ENXIO);
+}
 
- if (!open_device(blkif))
- connect(blkif);
+/**
+ * Attach to a XenBus device that has been claimed by our probe routine.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_attach(device_t dev)
+{
+ struct xbb_softc *xbb;
+ int error;
+
+ DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
+
+ /*
+ * Basic initialization.
+ * After this block it is safe to call xbb_detach()
+ * to clean up any allocated data for this instance.
+ */
+ xbb = device_get_softc(dev);
+ xbb->dev = dev;
+ xbb->otherend_id = xenbus_get_otherend_id(dev);
+ TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
+ mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
+ SLIST_INIT(&xbb->request_free_slist);
+
+ /*
+ * Protocol defaults valid even if all negotiation fails.
+ */ + xbb->ring_config.ring_pages = 1; + xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE); + xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; + xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; + + /* + * Publish protocol capabilities for consumption by the + * front-end. + */ + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "feature-barrier", "1"); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/feature-barrier", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "feature-flush-cache", "1"); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-requests", "%u", XBB_MAX_REQUESTS); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-requests", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-request-segments", "%u", + XBB_MAX_SEGMENTS_PER_REQUEST); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-request-segments", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-request-size", "%u", + XBB_MAX_REQUEST_SIZE); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-request-size", + xenbus_get_node(xbb->dev)); + return (error); + } + + /* Collect physical device information. */ + error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), + "device-type", NULL, &xbb->dev_type, + NULL); + if (error != 0) + xbb->dev_type = NULL; + + error = xs_gather(XST_NIL, xenbus_get_node(dev), + "mode", NULL, &xbb->dev_mode, + "params", NULL, &xbb->dev_name, + NULL); + if (error != 0) { + xbb_attach_failed(xbb, error, "reading backend fields at %s", + xenbus_get_node(dev)); + return (ENXIO); + } + + /* Parse fopen style mode flags. */ + if (strchr(xbb->dev_mode, 'w') == NULL) + xbb->flags |= XBBF_READ_ONLY; + + /* + * Verify the physical device is present and can support + * the desired I/O mode. + */ + DROP_GIANT(); + error = xbb_open_backend(xbb); + PICKUP_GIANT(); + if (error != 0) { + xbb_attach_failed(xbb, error, "Unable to open %s", + xbb->dev_name); + return (ENXIO); + } - return bus_generic_attach(dev); + /* Use devstat(9) for recording statistics. */ + xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), + xbb->sector_size, + DEVSTAT_ALL_SUPPORTED, + DEVSTAT_TYPE_DIRECT + | DEVSTAT_TYPE_IF_OTHER, + DEVSTAT_PRIORITY_OTHER); + /* + * Create a taskqueue for doing work that must occur from a + * thread context. + */ + xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT, + taskqueue_thread_enqueue, + /*context*/&xbb->io_taskqueue); + if (xbb->io_taskqueue == NULL) { + xbb_attach_failed(xbb, error, "Unable to create taskqueue"); + return (ENOMEM); + } + + taskqueue_start_threads(&xbb->io_taskqueue, + /*num threads*/1, + /*priority*/PWAIT, + /*thread name*/ + "%s taskq", device_get_nameunit(dev)); + + /* Update hot-plug status to satisfy xend. 
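The capability writes above all share one shape; a hedged sketch of a table-driven equivalent follows (the helper and table are hypothetical, not part of this change, and cover only the fixed-string nodes; the max-* nodes use numeric formats as shown above).

/* Illustrative only: publish fixed capability nodes from a table. */
struct xbb_cap { const char *node; const char *val; };

static int
xbb_publish_fixed_caps(device_t dev)
{
	static const struct xbb_cap caps[] = {
		{ "feature-barrier",     "1" },
		{ "feature-flush-cache", "1" },
	};
	u_int i;
	int error;

	for (i = 0; i < sizeof(caps) / sizeof(caps[0]); i++) {
		error = xs_printf(XST_NIL, xenbus_get_node(dev),
		    caps[i].node, "%s", caps[i].val);
		if (error != 0)
			return (error);
	}
	return (0);
}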
 */
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "connected");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ /* Tell the front end that we are ready to connect. */
+ xenbus_set_state(dev, XenbusStateInitWait);
+
+ return (0);
 }
 
-/* Newbus vbd device driver detach */
+/**
+ * Detach from a block back device instance.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ *
+ * \note A block back device may be detached at any time in its life-cycle,
+ * including part way through the attach process. For this reason,
+ * initialization order and the initialization state checks in this
+ * routine must be carefully coupled so that attach time failures
+ * are gracefully handled.
+ */
 static int
-vbd_detach(device_t dev)
+xbb_detach(device_t dev)
 {
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
+ struct xbb_softc *xbb;
 
- DPRINTF("%s\n", blkif->dev_name);
+ DPRINTF("\n");
 
- close_device(blkif);
+ xbb = device_get_softc(dev);
+ mtx_lock(&xbb->lock);
+ while (xbb_shutdown(xbb) == EAGAIN) {
+ msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
+ "xbb_shutdown", 0);
+ }
+ mtx_unlock(&xbb->lock);
+ mtx_destroy(&xbb->lock);
 
- bus_generic_detach(dev);
+ DPRINTF("\n");
 
- blkif_put(blkif);
+ taskqueue_free(xbb->io_taskqueue);
+ devstat_remove_entry(xbb->xbb_stats);
 
- return 0;
+ xbb_close_backend(xbb);
+ xbb_free_communication_mem(xbb);
+
+ if (xbb->dev_mode != NULL) {
+ free(xbb->dev_mode, M_XENBUS);
+ xbb->dev_mode = NULL;
+ }
+
+ if (xbb->dev_type != NULL) {
+ free(xbb->dev_type, M_XENBUS);
+ xbb->dev_type = NULL;
+ }
+
+ if (xbb->dev_name != NULL) {
+ free(xbb->dev_name, M_XENBUS);
+ xbb->dev_name = NULL;
+ }
+
+ if (xbb->requests != NULL) {
+ struct xbb_xen_req *req;
+ struct xbb_xen_req *last_req;
+
+ req = xbb->requests;
+ last_req = &xbb->requests[xbb->max_requests - 1];
+ while (req <= last_req) {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ if (req->bounce != NULL) {
+ free(req->bounce, M_XENBLOCKBACK);
+ req->bounce = NULL;
+ }
+#endif
+ if (req->gnt_handles != NULL) {
+ free(req->gnt_handles, M_XENBLOCKBACK);
+ req->gnt_handles = NULL;
+ }
+ req++;
+ }
+ free(xbb->requests, M_XENBLOCKBACK);
+ xbb->requests = NULL;
+ }
+
+ return (0);
 }
 
-static device_method_t vbd_methods[] = {
+/**
+ * Prepare this block back device for suspension of this VM.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_suspend(device_t dev)
+{
+#ifdef NOT_YET
+ struct xbb_softc *sc = device_get_softc(dev);
+
+ /* Prevent new requests being issued until we fix things up. */
+ mtx_lock(&sc->xb_io_lock);
+ sc->connected = BLKIF_STATE_SUSPENDED;
+ mtx_unlock(&sc->xb_io_lock);
+#endif
+
+ return (0);
+}
+
+/**
+ * Perform any processing required to recover from a suspended state.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_resume(device_t dev)
+{
+ return (0);
+}
+
+/**
+ * Handle state changes expressed via the XenStore by our front-end peer.
+ *
+ * \param dev NewBus device object representing this Xen
+ * Block Back instance.
+ * \param frontend_state The new state of the front-end.
+ *
+ * \return 0 for success, errno codes for failure.
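A design note, summarizing the code above rather than adding new logic:

/*
 * xbb_shutdown() returns EAGAIN while requests are still draining, so
 * xbb_detach() loops in msleep() on the softc address until
 * xbb_shutdown() finally succeeds and calls wakeup(xbb). Every free in
 * xbb_detach() is NULL-guarded, which is what makes it safe for
 * xbb_attach_failed() to force a detach of a partially initialized
 * instance.
 */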
+ */ +static int +xbb_frontend_changed(device_t dev, XenbusState frontend_state) +{ + struct xbb_softc *xbb = device_get_softc(dev); + + DPRINTF("state=%s\n", xenbus_strstate(frontend_state)); + + switch (frontend_state) { + case XenbusStateInitialising: + case XenbusStateClosing: + break; + case XenbusStateInitialised: + case XenbusStateConnected: + xbb_connect(xbb); + break; + case XenbusStateClosed: + case XenbusStateInitWait: + + mtx_lock(&xbb->lock); + xbb_shutdown(xbb); + mtx_unlock(&xbb->lock); + break; + default: + xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", + frontend_state); + break; + } + return (0); +} + +/*---------------------------- NewBus Registration ---------------------------*/ +static device_method_t xbb_methods[] = { /* Device interface */ - DEVMETHOD(device_probe, vbd_probe), - DEVMETHOD(device_attach, vbd_attach), - DEVMETHOD(device_detach, vbd_detach), + DEVMETHOD(device_probe, xbb_probe), + DEVMETHOD(device_attach, xbb_attach), + DEVMETHOD(device_detach, xbb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), - DEVMETHOD(device_suspend, bus_generic_suspend), - DEVMETHOD(device_resume, bus_generic_resume), - {0, 0} -}; + DEVMETHOD(device_suspend, xbb_suspend), + DEVMETHOD(device_resume, xbb_resume), -static devclass_t vbd_devclass; + /* Xenbus interface */ + DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), -static driver_t vbd_driver = { - "vbd", - vbd_methods, - 0, + { 0, 0 } }; -DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0); +static driver_t xbb_driver = { + "xbbd", + xbb_methods, + sizeof(struct xbb_softc), +}; +devclass_t xbb_devclass; -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: t - * End: - */ +DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c index 6c222ea..8ff8757 100644 --- a/sys/dev/xen/blkfront/blkfront.c +++ b/sys/dev/xen/blkfront/blkfront.c @@ -49,8 +49,10 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> #include <sys/bus_dma.h> +#include <machine/_inttypes.h> #include <machine/xen/xen-os.h> #include <machine/xen/xenfunc.h> + #include <xen/hypervisor.h> #include <xen/xen_intr.h> #include <xen/evtchn.h> @@ -68,17 +70,21 @@ __FBSDID("$FreeBSD$"); /* prototypes */ static void xb_free_command(struct xb_command *cm); static void xb_startio(struct xb_softc *sc); -static void connect(struct xb_softc *); +static void blkfront_connect(struct xb_softc *); static void blkfront_closing(device_t); static int blkfront_detach(device_t); -static int talk_to_backend(struct xb_softc *); static int setup_blkring(struct xb_softc *); static void blkif_int(void *); +static void blkfront_initialize(struct xb_softc *); +#if 0 static void blkif_recover(struct xb_softc *); -static void blkif_completion(struct xb_command *); +#endif +static int blkif_completion(struct xb_command *); static void blkif_free(struct xb_softc *, int); static void blkif_queue_cb(void *, bus_dma_segment_t *, int, int); +MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); + #define GRANT_INVALID_REF 0 /* Control whether runtime update of vbds is enabled. */ @@ -113,11 +119,6 @@ static char * blkif_status_name[] = { #define DPRINTK(fmt, args...) 
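The switch above is the entire back-end state machine; a compact, descriptive-only summary:

/*
 * Front-end state handling in xbb_frontend_changed():
 *
 *	Initialising, Closing   -> ignored
 *	Initialised, Connected  -> xbb_connect(); idempotent, since it
 *	                           returns early once already Connected
 *	Closed, InitWait        -> xbb_shutdown(), called with xbb->lock held
 *	anything else           -> xenbus_dev_fatal(..., EINVAL, ...)
 */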
#endif -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) - -#define BLKIF_MAXIO (32 * 1024) - static int blkif_open(struct disk *dp); static int blkif_close(struct disk *dp); static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td); @@ -202,8 +203,8 @@ blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name) } int -xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity, - int vdevice, uint16_t vdisk_info, uint16_t sector_size) +xlvbd_add(struct xb_softc *sc, blkif_sector_t sectors, + int vdevice, uint16_t vdisk_info, unsigned long sector_size) { int unit, error = 0; const char *name; @@ -215,7 +216,6 @@ xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity, if (strcmp(name, "xbd")) device_printf(sc->xb_dev, "attaching as %s%d\n", name, unit); - memset(&sc->xb_disk, 0, sizeof(sc->xb_disk)); sc->xb_disk = disk_alloc(); sc->xb_disk->d_unit = sc->xb_unit; sc->xb_disk->d_open = blkif_open; @@ -227,20 +227,14 @@ xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity, sc->xb_disk->d_drv1 = sc; sc->xb_disk->d_sectorsize = sector_size; - sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT; - sc->xb_disk->d_maxsize = BLKIF_MAXIO; + sc->xb_disk->d_mediasize = sectors * sector_size; + sc->xb_disk->d_maxsize = sc->max_request_size; sc->xb_disk->d_flags = 0; disk_create(sc->xb_disk, DISK_VERSION_00); return error; } -void -xlvbd_del(struct xb_softc *sc) -{ - - disk_destroy(sc->xb_disk); -} /************************ end VBD support *****************/ /* @@ -357,15 +351,16 @@ xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, return (EBUSY); } - if (gnttab_alloc_grant_references( - BLKIF_MAX_SEGMENTS_PER_REQUEST, &cm->gref_head) < 0) { + if (gnttab_alloc_grant_references(sc->max_request_segments, + &cm->gref_head) != 0) { xb_free_command(cm); mtx_unlock(&sc->xb_io_lock); device_printf(sc->xb_dev, "no more grant allocs?\n"); return (EBUSY); } - chunk = length > BLKIF_MAXIO ? BLKIF_MAXIO : length; + chunk = length > sc->max_request_size + ? sc->max_request_size : length; cm->data = virtual; cm->datalen = chunk; cm->operation = BLKIF_OP_WRITE; @@ -423,16 +418,18 @@ static int blkfront_attach(device_t dev) { struct xb_softc *sc; - struct xb_command *cm; const char *name; - int error, vdevice, i, unit; + int error; + int vdevice; + int i; + int unit; /* FIXME: Use dynamic device id if this is not set. 
*/ - error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev), + error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device", NULL, "%i", &vdevice); if (error) { xenbus_dev_fatal(dev, error, "reading virtual-device"); - printf("couldn't find virtual device"); + device_printf(dev, "Couldn't determine virtual device.\n"); return (error); } @@ -447,51 +444,18 @@ blkfront_attach(device_t dev) xb_initq_ready(sc); xb_initq_complete(sc); xb_initq_bio(sc); - - /* Allocate parent DMA tag */ - if (bus_dma_tag_create( NULL, /* parent */ - 512, 4096, /* algnmnt, boundary */ - BUS_SPACE_MAXADDR, /* lowaddr */ - BUS_SPACE_MAXADDR, /* highaddr */ - NULL, NULL, /* filter, filterarg */ - BLKIF_MAXIO, /* maxsize */ - BLKIF_MAX_SEGMENTS_PER_REQUEST, /* nsegments */ - PAGE_SIZE, /* maxsegsize */ - BUS_DMA_ALLOCNOW, /* flags */ - busdma_lock_mutex, /* lockfunc */ - &sc->xb_io_lock, /* lockarg */ - &sc->xb_io_dmat)) { - device_printf(dev, "Cannot allocate parent DMA tag\n"); - return (ENOMEM); - } -#ifdef notyet - if (bus_dma_tag_set(sc->xb_io_dmat, BUS_DMA_SET_MINSEGSZ, - XBD_SECTOR_SIZE)) { - device_printf(dev, "Cannot set sector size\n"); - return (EINVAL); - } -#endif + for (i = 0; i < XBF_MAX_RING_PAGES; i++) + sc->ring_ref[i] = GRANT_INVALID_REF; sc->xb_dev = dev; sc->vdevice = vdevice; sc->connected = BLKIF_STATE_DISCONNECTED; - /* work queue needed ? */ - for (i = 0; i < BLK_RING_SIZE; i++) { - cm = &sc->shadow[i]; - cm->req.id = i; - cm->cm_sc = sc; - if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0) - break; - xb_free_command(cm); - } - /* Front end dir is a number, which is used as the id. */ sc->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0); - error = talk_to_backend(sc); - if (error) - return (error); + /* Wait for backend device to publish its protocol capabilities. */ + xenbus_set_state(dev, XenbusStateInitialising); return (0); } @@ -512,121 +476,265 @@ blkfront_suspend(device_t dev) static int blkfront_resume(device_t dev) { +#if 0 struct xb_softc *sc = device_get_softc(dev); - int err; DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev)); +/* XXX This can't work!!! */ blkif_free(sc, 1); - err = talk_to_backend(sc); - if (sc->connected == BLKIF_STATE_SUSPENDED && !err) + blkfront_initialize(sc); + if (sc->connected == BLKIF_STATE_SUSPENDED) blkif_recover(sc); - - return (err); +#endif + return (0); } -/* Common code used when first setting up, and when resuming. */ -static int -talk_to_backend(struct xb_softc *sc) +static void +blkfront_initialize(struct xb_softc *sc) { - device_t dev; - struct xenbus_transaction xbt; - const char *message = NULL; - int err; + const char *otherend_path; + const char *node_path; + int error; + int i; - /* Create shared ring, alloc event channel. */ - dev = sc->xb_dev; - err = setup_blkring(sc); - if (err) - goto out; + if (xenbus_get_state(sc->xb_dev) != XenbusStateInitialising) + return; - again: - err = xenbus_transaction_start(&xbt); - if (err) { - xenbus_dev_fatal(dev, err, "starting transaction"); - goto destroy_blkring; + /* + * Protocol defaults valid even if negotiation for a + * setting fails. + */ + sc->ring_pages = 1; + sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE); + sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; + sc->max_request_size = sc->max_request_segments * PAGE_SIZE; + sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments); + + /* + * Protocol negotiation. 
+ *
+ * \note xs_gather() returns on the first encountered error, so
+ * we must use independent calls in order to guarantee
+ * we don't miss information in a sparsely populated back-end
+ * tree.
+ */
+ otherend_path = xenbus_get_otherend_path(sc->xb_dev);
+ node_path = xenbus_get_node(sc->xb_dev);
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-ring-pages", NULL, "%" PRIu32,
+ &sc->ring_pages);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-requests", NULL, "%" PRIu32,
+ &sc->max_requests);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-segments", NULL, "%" PRIu32,
+ &sc->max_request_segments);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-size", NULL, "%" PRIu32,
+ &sc->max_request_size);
+
+ if (sc->ring_pages > XBF_MAX_RING_PAGES) {
+ device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+ "%u limited to front-end limit of %zu.\n",
+ sc->ring_pages, XBF_MAX_RING_PAGES);
+ sc->ring_pages = XBF_MAX_RING_PAGES;
 }
- err = xenbus_printf(xbt, xenbus_get_node(dev),
- "ring-ref","%u", sc->ring_ref);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
+ if (sc->max_requests > XBF_MAX_REQUESTS) {
+ device_printf(sc->xb_dev, "Back-end specified max_requests of "
+ "%u limited to front-end limit of %u.\n",
+ sc->max_requests, XBF_MAX_REQUESTS);
+ sc->max_requests = XBF_MAX_REQUESTS;
 }
- err = xenbus_printf(xbt, xenbus_get_node(dev),
- "event-channel", "%u", irq_to_evtchn_port(sc->irq));
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
+
+ if (sc->max_request_segments > XBF_MAX_SEGMENTS_PER_REQUEST) {
+ device_printf(sc->xb_dev, "Back-end specified "
+ "max_request_segments of %u limited to "
+ "front-end limit of %u.\n",
+ sc->max_request_segments,
+ XBF_MAX_SEGMENTS_PER_REQUEST);
+ sc->max_request_segments = XBF_MAX_SEGMENTS_PER_REQUEST;
 }
- err = xenbus_printf(xbt, xenbus_get_node(dev),
- "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
- if (err) {
- message = "writing protocol";
- goto abort_transaction;
+
+ if (sc->max_request_size > XBF_MAX_REQUEST_SIZE) {
+ device_printf(sc->xb_dev, "Back-end specified "
+ "max_request_size of %u limited to front-end "
+ "limit of %u.\n", sc->max_request_size,
+ XBF_MAX_REQUEST_SIZE);
+ sc->max_request_size = XBF_MAX_REQUEST_SIZE;
+ }
+ sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
+
+ /* Allocate data structures based on negotiated values. */
+ error = bus_dma_tag_create(NULL, /* parent */
+ 512, PAGE_SIZE, /* algnmnt, boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ sc->max_request_size,
+ sc->max_request_segments,
+ PAGE_SIZE, /* maxsegsize */
+ BUS_DMA_ALLOCNOW, /* flags */
+ busdma_lock_mutex, /* lockfunc */
+ &sc->xb_io_lock, /* lockarg */
+ &sc->xb_io_dmat);
+ if (error != 0) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "Cannot allocate parent DMA tag\n");
+ return;
 }
- err = xenbus_transaction_end(xbt, 0);
- if (err) {
- if (err == EAGAIN)
- goto again;
- xenbus_dev_fatal(dev, err, "completing transaction");
- goto destroy_blkring;
+ /* Per-transaction data allocation.
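Each clamp above follows one pattern; a hedged sketch of it as a helper (hypothetical, not in this change). Note the contrast with the back-end, which rejects out-of-range values with EINVAL instead of clamping.

static inline uint32_t
xbf_clamp(device_t dev, const char *what, uint32_t val, uint32_t limit)
{
	/* Illustrative sketch only. */
	if (val > limit) {
		device_printf(dev, "Back-end specified %s of %u limited to "
		    "front-end limit of %u.\n", what, val, limit);
		val = limit;
	}
	return (val);
}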
*/ + sc->shadow = malloc(sizeof(*sc->shadow) * sc->max_requests, + M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); + if (sc->shadow == NULL) { + xenbus_dev_fatal(sc->xb_dev, error, + "Cannot allocate request structures\n"); } - xenbus_set_state(dev, XenbusStateInitialised); - - return 0; - abort_transaction: - xenbus_transaction_end(xbt, 1); - if (message) - xenbus_dev_fatal(dev, err, "%s", message); - destroy_blkring: - blkif_free(sc, 0); - out: - return err; + for (i = 0; i < sc->max_requests; i++) { + struct xb_command *cm; + + cm = &sc->shadow[i]; + cm->sg_refs = malloc(sizeof(grant_ref_t) + * sc->max_request_segments, + M_XENBLOCKFRONT, M_NOWAIT); + if (cm->sg_refs == NULL) + break; + cm->id = i; + cm->cm_sc = sc; + if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0) + break; + xb_free_command(cm); + } + + if (setup_blkring(sc) != 0) + return; + + error = xs_printf(XST_NIL, node_path, + "ring-pages","%u", sc->ring_pages); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "writing %s/ring-pages", + node_path); + return; + } + + error = xs_printf(XST_NIL, node_path, + "max-requests","%u", sc->max_requests); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "writing %s/max-requests", + node_path); + return; + } + + error = xs_printf(XST_NIL, node_path, + "max-request-segments","%u", sc->max_request_segments); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "writing %s/max-request-segments", + node_path); + return; + } + + error = xs_printf(XST_NIL, node_path, + "max-request-size","%u", sc->max_request_size); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "writing %s/max-request-size", + node_path); + return; + } + + error = xs_printf(XST_NIL, node_path, "event-channel", + "%u", irq_to_evtchn_port(sc->irq)); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "writing %s/event-channel", + node_path); + return; + } + + error = xs_printf(XST_NIL, node_path, + "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "writing %s/protocol", + node_path); + return; + } + + xenbus_set_state(sc->xb_dev, XenbusStateInitialised); } static int setup_blkring(struct xb_softc *sc) { blkif_sring_t *sring; + uintptr_t sring_page_addr; int error; + int i; - sc->ring_ref = GRANT_INVALID_REF; - - sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); + sring = malloc(sc->ring_pages * PAGE_SIZE, M_XENBLOCKFRONT, + M_NOWAIT|M_ZERO); if (sring == NULL) { xenbus_dev_fatal(sc->xb_dev, ENOMEM, "allocating shared ring"); - return ENOMEM; + return (ENOMEM); } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&sc->ring, sring, PAGE_SIZE); - - error = xenbus_grant_ring(sc->xb_dev, - (vtomach(sc->ring.sring) >> PAGE_SHIFT), &sc->ring_ref); + FRONT_RING_INIT(&sc->ring, sring, sc->ring_pages * PAGE_SIZE); + + for (i = 0, sring_page_addr = (uintptr_t)sring; + i < sc->ring_pages; + i++, sring_page_addr += PAGE_SIZE) { + + error = xenbus_grant_ring(sc->xb_dev, + (vtomach(sring_page_addr) >> PAGE_SHIFT), &sc->ring_ref[i]); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, + "granting ring_ref(%d)", i); + return (error); + } + } + error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev), + "ring-ref","%u", sc->ring_ref[0]); if (error) { - free(sring, M_DEVBUF); - sc->ring.sring = NULL; - goto fail; + xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref", + xenbus_get_node(sc->xb_dev)); + return (error); } - - error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(sc->xb_dev), + for (i = 1; i < sc->ring_pages; i++) { + char ring_ref_name[]= 
"ring_refXX"; + + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i); + error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev), + ring_ref_name, "%u", sc->ring_ref[i]); + if (error) { + xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s", + xenbus_get_node(sc->xb_dev), + ring_ref_name); + return (error); + } + } + + error = bind_listening_port_to_irqhandler( + xenbus_get_otherend_id(sc->xb_dev), "xbd", (driver_intr_t *)blkif_int, sc, INTR_TYPE_BIO | INTR_MPSAFE, &sc->irq); if (error) { xenbus_dev_fatal(sc->xb_dev, error, "bind_evtchn_to_irqhandler failed"); - goto fail; + return (error); } return (0); - fail: - blkif_free(sc, 0); - return (error); } - /** * Callback received when the backend's state changes. */ @@ -640,15 +748,19 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state) switch (backend_state) { case XenbusStateUnknown: case XenbusStateInitialising: - case XenbusStateInitWait: - case XenbusStateInitialised: - case XenbusStateClosed: case XenbusStateReconfigured: case XenbusStateReconfiguring: + case XenbusStateClosed: break; + case XenbusStateInitWait: + blkfront_initialize(sc); + break; + + case XenbusStateInitialised: case XenbusStateConnected: - connect(sc); + blkfront_initialize(sc); + blkfront_connect(sc); break; case XenbusStateClosing: @@ -657,20 +769,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state) "Device in use; refusing to close"); else blkfront_closing(dev); -#ifdef notyet - bd = bdget(sc->dev); - if (bd == NULL) - xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); - - down(&bd->bd_sem); - if (sc->users > 0) - xenbus_dev_error(dev, -EBUSY, - "Device in use; refusing to close"); - else - blkfront_closing(dev); - up(&bd->bd_sem); - bdput(bd); -#endif + break; } return (0); @@ -681,7 +780,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state) ** the details about the physical device - #sectors, size, etc). 
 */
 static void
-connect(struct xb_softc *sc)
+blkfront_connect(struct xb_softc *sc)
 {
 	device_t dev = sc->xb_dev;
 	unsigned long sectors, sector_size;
@@ -694,20 +793,20 @@ connect(struct xb_softc *sc)
 	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
 
- err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
- "sectors", "%lu", &sectors,
- "info", "%u", &binfo,
- "sector-size", "%lu", &sector_size,
- NULL);
+ err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+ "sectors", "%lu", &sectors,
+ "info", "%u", &binfo,
+ "sector-size", "%lu", &sector_size,
+ NULL);
 	if (err) {
 		xenbus_dev_fatal(dev, err,
 		    "reading backend fields at %s",
 		    xenbus_get_otherend_path(dev));
 		return;
 	}
- err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
- "feature-barrier", "%lu", &feature_barrier,
- NULL);
+ err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+ "feature-barrier", "%lu", &feature_barrier,
+ NULL);
 	if (!err || feature_barrier)
 		sc->xb_flags |= XB_BARRIER;
 
@@ -741,15 +840,16 @@ blkfront_closing(device_t dev)
 {
 	struct xb_softc *sc = device_get_softc(dev);
 
+ xenbus_set_state(dev, XenbusStateClosing);
+
 	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
 
- if (sc->mi) {
- DPRINTK("Calling xlvbd_del\n");
- xlvbd_del(sc);
- sc->mi = NULL;
+ if (sc->xb_disk != NULL) {
+ disk_destroy(sc->xb_disk);
+ sc->xb_disk = NULL;
 	}
 
- xenbus_set_state(dev, XenbusStateClosed);
+ xenbus_set_state(dev, XenbusStateClosed);
 }
 
@@ -778,11 +878,16 @@ flush_requests(struct xb_softc *sc)
 	notify_remote_via_irq(sc->irq);
 }
 
-static void blkif_restart_queue_callback(void *arg)
+static void
+blkif_restart_queue_callback(void *arg)
 {
 	struct xb_softc *sc = arg;
 
+ mtx_lock(&sc->xb_io_lock);
+
 	xb_startio(sc);
+
+ mtx_unlock(&sc->xb_io_lock);
 }
 
 static int
@@ -874,20 +979,17 @@ xb_bio_command(struct xb_softc *sc)
 		return (NULL);
 	}
 
- if (gnttab_alloc_grant_references(BLKIF_MAX_SEGMENTS_PER_REQUEST,
- &cm->gref_head) < 0) {
+ if (gnttab_alloc_grant_references(sc->max_request_segments,
+ &cm->gref_head) != 0) {
 		gnttab_request_free_callback(&sc->callback,
 			blkif_restart_queue_callback, sc,
- BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ sc->max_request_segments);
 		xb_requeue_bio(sc, bp);
 		xb_enqueue_free(cm);
 		sc->xb_flags |= XB_FROZEN;
 		return (NULL);
 	}
 
- /* XXX Can we grab refs before doing the load so that the ref can
- * be filled out here?
- */
 	cm->bp = bp;
 	cm->data = bp->bio_data;
 	cm->datalen = bp->bio_bcount;
@@ -921,13 +1023,19 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 	struct xb_softc *sc;
 	struct xb_command *cm;
 	blkif_request_t *ring_req;
+ struct blkif_request_segment *sg;
+ struct blkif_request_segment *last_block_sg;
+ grant_ref_t *sg_ref;
 	vm_paddr_t buffer_ma;
 	uint64_t fsect, lsect;
- int ref, i, op;
+ int ref;
+ int op;
+ int block_segs;
 
 	cm = arg;
 	sc = cm->cm_sc;
 
+//printf("%s: Start\n", __func__);
 	if (error) {
 		printf("error %d in blkif_queue_cb\n", error);
 		cm->bp->bio_error = EIO;
@@ -938,43 +1046,62 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
- if (ring_req == NULL) {
- /* XXX Is this possible? 
*/ - printf("ring_req NULL, requeuing\n"); - xb_enqueue_ready(cm); - return; - } - ring_req->id = cm->req.id; + sc->ring.req_prod_pvt++; + ring_req->id = cm->id; ring_req->operation = cm->operation; ring_req->sector_number = cm->sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk; ring_req->nr_segments = nsegs; + cm->nseg = nsegs; + + block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); + sg = ring_req->seg; + last_block_sg = sg + block_segs; + sg_ref = cm->sg_refs; + + while (1) { - for (i = 0; i < nsegs; i++) { - buffer_ma = segs[i].ds_addr; - fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; - lsect = fsect + (segs[i].ds_len >> XBD_SECTOR_SHFT) - 1; + while (sg < last_block_sg) { + buffer_ma = segs->ds_addr; + fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; + lsect = fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1; - KASSERT(lsect <= 7, - ("XEN disk driver data cannot cross a page boundary")); + KASSERT(lsect <= 7, ("XEN disk driver data cannot " + "cross a page boundary")); - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&cm->gref_head); - KASSERT( ref >= 0, ("grant_reference failed") ); + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&cm->gref_head); + + /* + * GNTTAB_LIST_END == 0xffffffff, but it is private + * to gnttab.c. + */ + KASSERT(ref != ~0, ("grant_reference failed")); - gnttab_grant_foreign_access_ref( - ref, - xenbus_get_otherend_id(sc->xb_dev), - buffer_ma >> PAGE_SHIFT, - ring_req->operation & 1 ); /* ??? */ + gnttab_grant_foreign_access_ref( + ref, + xenbus_get_otherend_id(sc->xb_dev), + buffer_ma >> PAGE_SHIFT, + ring_req->operation == BLKIF_OP_WRITE); - ring_req->seg[i] = - (struct blkif_request_segment) { + *sg_ref = ref; + *sg = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; - } + sg++; + sg_ref++; + segs++; + nsegs--; + } + block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); + if (block_segs == 0) + break; + sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt); + sc->ring.req_prod_pvt++; + last_block_sg = sg + block_segs; + } if (cm->operation == BLKIF_OP_READ) op = BUS_DMASYNC_PREREAD; @@ -984,15 +1111,10 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) op = 0; bus_dmamap_sync(sc->xb_io_dmat, cm->map, op); - sc->ring.req_prod_pvt++; - - /* Keep a private copy so we can reissue requests when recovering. */ - cm->req = *ring_req; + gnttab_free_grant_references(cm->gref_head); xb_enqueue_busy(cm); - gnttab_free_grant_references(cm->gref_head); - /* * This flag means that we're probably executing in the busdma swi * instead of in the startio context, so an explicit flush is needed. @@ -1000,6 +1122,7 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) if (cm->cm_flags & XB_CMD_FROZEN) flush_requests(sc); +//printf("%s: Done\n", __func__); return; } @@ -1018,7 +1141,7 @@ xb_startio(struct xb_softc *sc) mtx_assert(&sc->xb_io_lock, MA_OWNED); - while (!RING_FULL(&sc->ring)) { + while (RING_FREE_REQUESTS(&sc->ring) >= sc->max_request_blocks) { if (sc->xb_flags & XB_FROZEN) break; @@ -1061,12 +1184,12 @@ blkif_int(void *xsc) rp = sc->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. 
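The chained-segment construction above is why completion must consume a variable number of ring slots: an nseg-segment request occupies one header block plus however many trailing segment blocks the remainder needs, and that count is what blkif_completion() returns so the response index advances correctly. A sketch of the arithmetic BLKIF_SEGS_TO_BLOCKS() is assumed to encode (hypothetical mirror of the real macro, using FreeBSD's howmany()):

static inline u_int
segs_to_blocks(u_int nseg)	/* illustrative mirror, not the real macro */
{
	u_int extra;

	extra = (nseg > BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK)
	      ? nseg - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK : 0;
	return (1 + howmany(extra, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK));
}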
*/ - for (i = sc->ring.rsp_cons; i != rp; i++) { + for (i = sc->ring.rsp_cons; i != rp;) { bret = RING_GET_RESPONSE(&sc->ring, i); cm = &sc->shadow[bret->id]; xb_remove_busy(cm); - blkif_completion(cm); + i += blkif_completion(cm); if (cm->operation == BLKIF_OP_READ) op = BUS_DMASYNC_POSTREAD; @@ -1116,35 +1239,61 @@ blkif_int(void *xsc) static void blkif_free(struct xb_softc *sc, int suspend) { + uint8_t *sring_page_ptr; + int i; -/* Prevent new requests being issued until we fix things up. */ + /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xb_io_lock); sc->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; mtx_unlock(&sc->xb_io_lock); /* Free resources associated with old device channel. */ - if (sc->ring_ref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(sc->ring_ref, - sc->ring.sring); - sc->ring_ref = GRANT_INVALID_REF; + if (sc->ring.sring != NULL) { + sring_page_ptr = (uint8_t *)sc->ring.sring; + for (i = 0; i < sc->ring_pages; i++) { + if (sc->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access_ref(sc->ring_ref[i]); + sc->ring_ref[i] = GRANT_INVALID_REF; + } + sring_page_ptr += PAGE_SIZE; + } + free(sc->ring.sring, M_XENBLOCKFRONT); sc->ring.sring = NULL; } - if (sc->irq) - unbind_from_irqhandler(sc->irq); - sc->irq = 0; + if (sc->shadow) { + + for (i = 0; i < sc->max_requests; i++) { + struct xb_command *cm; + + cm = &sc->shadow[i]; + if (cm->sg_refs != NULL) { + free(cm->sg_refs, M_XENBLOCKFRONT); + cm->sg_refs = NULL; + } + + bus_dmamap_destroy(sc->xb_io_dmat, cm->map); + } + free(sc->shadow, M_XENBLOCKFRONT); + sc->shadow = NULL; + } + + if (sc->irq) { + unbind_from_irqhandler(sc->irq); + sc->irq = 0; + } } -static void +static int blkif_completion(struct xb_command *s) { - int i; - - for (i = 0; i < s->req.nr_segments; i++) - gnttab_end_foreign_access(s->req.seg[i].gref, 0UL); +//printf("%s: Req %p(%d)\n", __func__, s, s->nseg); + gnttab_end_foreign_access_references(s->nseg, s->sg_refs); + return (BLKIF_SEGS_TO_BLOCKS(s->nseg)); } +#if 0 static void blkif_recover(struct xb_softc *sc) { @@ -1157,6 +1306,7 @@ blkif_recover(struct xb_softc *sc) * has been removed until further notice. */ } +#endif /* ** Driver registration ** */ static device_method_t blkfront_methods[] = { @@ -1169,7 +1319,7 @@ static device_method_t blkfront_methods[] = { DEVMETHOD(device_resume, blkfront_resume), /* Xenbus interface */ - DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed), + DEVMETHOD(xenbus_otherend_changed, blkfront_backend_changed), { 0, 0 } }; @@ -1181,4 +1331,4 @@ static driver_t blkfront_driver = { }; devclass_t blkfront_devclass; -DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0); +DRIVER_MODULE(xbd, xenbusb_front, blkfront_driver, blkfront_devclass, 0, 0); diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h index 32bfc96..6235e51 100644 --- a/sys/dev/xen/blkfront/block.h +++ b/sys/dev/xen/blkfront/block.h @@ -32,7 +32,43 @@ #ifndef __XEN_DRIVERS_BLOCK_H__ #define __XEN_DRIVERS_BLOCK_H__ -#include <xen/interface/io/blkif.h> +#include <xen/blkif.h> + +/** + * The maximum number of outstanding requests blocks (request headers plus + * additional segment blocks) we will allow in a negotiated block-front/back + * communication channel. + */ +#define XBF_MAX_REQUESTS 256 + +/** + * The maximum mapped region size per request we will allow in a negotiated + * block-front/back communication channel. 
+ *
+ * \note We reserve a segment from the maximum supported by the transport to
+ * guarantee we can handle an unaligned transfer without the need to
+ * use a bounce buffer.
+ */
+#define XBF_MAX_REQUEST_SIZE \
+ MIN(MAXPHYS, (BLKIF_MAX_SEGMENTS_PER_REQUEST - 1) * PAGE_SIZE)
+
+/**
+ * The maximum number of segments (within a request header and accompanying
+ * segment blocks) per request we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBF_MAX_SEGMENTS_PER_REQUEST \
+ (MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
+ (XBF_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))
+
+/**
+ * The maximum number of shared memory ring pages we will allow in a
+ * negotiated block-front/back communication channel. Allow enough
+ * ring space for all requests to be XBF_MAX_REQUEST_SIZE'd.
+ */
+#define XBF_MAX_RING_PAGES \
+ BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBF_MAX_SEGMENTS_PER_REQUEST) \
+ * XBF_MAX_REQUESTS)
 
 struct xlbd_type_info
 {
@@ -62,19 +98,19 @@ struct xb_command {
 #define XB_ON_XBQ_COMPLETE (1<<5)
 #define XB_ON_XBQ_MASK ((1<<2)|(1<<3)|(1<<4)|(1<<5))
 	bus_dmamap_t map;
- blkif_request_t req;
+ uint64_t id;
+ grant_ref_t *sg_refs;
 	struct bio *bp;
 	grant_ref_t gref_head;
 	void *data;
 	size_t datalen;
+ u_int nseg;
 	int operation;
 	blkif_sector_t sector_number;
 	int status;
 	void (* cm_complete)(struct xb_command *);
 };
 
-#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
-
 #define XBQ_FREE 0
 #define XBQ_BIO 1
 #define XBQ_READY 2
@@ -108,10 +144,14 @@ struct xb_softc {
 	int vdevice;
 	blkif_vdev_t handle;
 	int connected;
- int ring_ref;
+ u_int ring_pages;
+ uint32_t max_requests;
+ uint32_t max_request_segments;
+ uint32_t max_request_blocks;
+ uint32_t max_request_size;
+ grant_ref_t ring_ref[XBF_MAX_RING_PAGES];
 	blkif_front_ring_t ring;
 	unsigned int irq;
- struct xlbd_major_info *mi;
 	struct gnttab_free_callback callback;
 	TAILQ_HEAD(,xb_command) cm_free;
 	TAILQ_HEAD(,xb_command) cm_ready;
@@ -126,11 +166,12 @@ struct xb_softc {
 	 */
 	int users;
 	struct mtx xb_io_lock;
- struct xb_command shadow[BLK_RING_SIZE];
+
+ struct xb_command *shadow;
 };
 
-int xlvbd_add(struct xb_softc *, blkif_sector_t capacity, int device,
- uint16_t vdisk_info, uint16_t sector_size);
+int xlvbd_add(struct xb_softc *, blkif_sector_t sectors, int device,
+ uint16_t vdisk_info, unsigned long sector_size);
 void xlvbd_del(struct xb_softc *);
 
 #define XBQ_ADD(sc, qname) \
@@ -188,7 +229,8 @@ void xlvbd_del(struct xb_softc *);
 	struct xb_command *cm; \
 \
 	if ((cm = TAILQ_FIRST(&sc->cm_ ## name)) != NULL) { \
- if ((cm->cm_flags & XB_ON_ ## index) == 0) { \
+ if ((cm->cm_flags & XB_ON_XBQ_MASK) != \
+ XB_ON_ ## index) { \
 	printf("command %p not in queue, " \
 	"flags = %#x, bit = %#x\n", cm, \
 	cm->cm_flags, XB_ON_ ## index); \
@@ -203,7 +245,7 @@ void xlvbd_del(struct xb_softc *);
 static __inline void \
 xb_remove_ ## name (struct xb_command *cm) \
 { \
- if ((cm->cm_flags & XB_ON_ ## index) == 0) { \
+ if ((cm->cm_flags & XB_ON_XBQ_MASK) != XB_ON_ ## index){\
 	printf("command %p not in queue, flags = %#x, " \
 	"bit = %#x\n", cm, cm->cm_flags, \
 	XB_ON_ ## index); \
diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c
new file mode 100644
index 0000000..c03d536
--- /dev/null
+++ b/sys/dev/xen/control/control.c
@@ -0,0 +1,493 @@
+/*-
+ * Copyright (c) 2010 Justin T. Gibbs, Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + */ + +/*- + * PV suspend/resume support: + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004-2006,2008 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * HVM suspend/resume support: + * + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/** + * \file control.c + * + * \brief Device driver to respond to control domain events that impact + * this VM. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <sys/bio.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/disk.h> +#include <sys/fcntl.h> +#include <sys/filedesc.h> +#include <sys/kdb.h> +#include <sys/module.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/types.h> +#include <sys/vnode.h> + +#ifndef XENHVM +#include <sys/sched.h> +#include <sys/smp.h> +#endif + + +#include <geom/geom.h> + +#include <machine/_inttypes.h> +#include <machine/xen/xen-os.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> + +#include <xen/blkif.h> +#include <xen/evtchn.h> +#include <xen/gnttab.h> +#include <xen/xen_intr.h> + +#include <xen/interface/event_channel.h> +#include <xen/interface/grant_table.h> + +#include <xen/xenbus/xenbusvar.h> + +#define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*(x))) + +/*--------------------------- Forward Declarations --------------------------*/ /** Function signature for shutdown event handlers. */ +typedef void (xctrl_shutdown_handler_t)(void); + +static xctrl_shutdown_handler_t xctrl_poweroff; +static xctrl_shutdown_handler_t xctrl_reboot; +static xctrl_shutdown_handler_t xctrl_suspend; +static xctrl_shutdown_handler_t xctrl_crash; +static xctrl_shutdown_handler_t xctrl_halt; + +/*-------------------------- Private Data Structures -------------------------*/ /** Element type for lookup table of event name to handler. */ +struct xctrl_shutdown_reason { + const char *name; + xctrl_shutdown_handler_t *handler; +}; + +/** Lookup table for shutdown event name to handler. */ +static struct xctrl_shutdown_reason xctrl_shutdown_reasons[] = { + { "poweroff", xctrl_poweroff }, + { "reboot", xctrl_reboot }, + { "suspend", xctrl_suspend }, + { "crash", xctrl_crash }, + { "halt", xctrl_halt }, +}; + +struct xctrl_softc { + + /** Must be first */ + struct xs_watch xctrl_watch; +}; + +/*------------------------------ Event Handlers ------------------------------*/ +static void +xctrl_poweroff() +{ + shutdown_nice(RB_POWEROFF|RB_HALT); +} + +static void +xctrl_reboot() +{ + shutdown_nice(0); +} + +#ifndef XENHVM +extern void xencons_suspend(void); +extern void xencons_resume(void); + +/* Full PV mode suspension.
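 + * + * A descriptive summary of the code below: the sequence binds the current
 + * thread to CPU 0, stops the other VCPUs, suspends devices and the grant
 + * tables, converts the store and console MFNs to PFNs, and then issues
 + * HYPERVISOR_suspend(). When the hypercall returns after restore, the
 + * shared info mapping and the pfn-to-mfn frame lists are rebuilt before
 + * devices are resumed.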
 */ +static void +xctrl_suspend() +{ + int i, j, k, fpp; + unsigned long max_pfn, start_info_mfn; + +#ifdef SMP + cpumask_t map; + /* + * Bind us to CPU 0 and stop any other VCPUs. + */ + thread_lock(curthread); + sched_bind(curthread, 0); + thread_unlock(curthread); + KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0")); + + map = PCPU_GET(other_cpus) & ~stopped_cpus; + if (map) + stop_cpus(map); +#endif + + if (DEVICE_SUSPEND(root_bus) != 0) { + printf("xen_suspend: device_suspend failed\n"); +#ifdef SMP + if (map) + restart_cpus(map); +#endif + return; + } + + local_irq_disable(); + + xencons_suspend(); + gnttab_suspend(); + + max_pfn = HYPERVISOR_shared_info->arch.max_pfn; + + void *shared_info = HYPERVISOR_shared_info; + HYPERVISOR_shared_info = NULL; + pmap_kremove((vm_offset_t) shared_info); + PT_UPDATES_FLUSH(); + + xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn); + + /* + * We'll stop somewhere inside this hypercall. When it returns, + * we'll start resuming after the restore. + */ + start_info_mfn = VTOMFN(xen_start_info); + pmap_suspend(); + HYPERVISOR_suspend(start_info_mfn); + pmap_resume(); + + pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info); + HYPERVISOR_shared_info = shared_info; + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + VTOMFN(xen_pfn_to_mfn_frame_list_list); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { + if ((j % fpp) == 0) { + k++; + xen_pfn_to_mfn_frame_list_list[k] = + VTOMFN(xen_pfn_to_mfn_frame_list[k]); + j = 0; + } + xen_pfn_to_mfn_frame_list[k][j] = + VTOMFN(&xen_phys_machine[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; + + gnttab_resume(); + irq_resume(); + local_irq_enable(); + xencons_resume(); + +#ifdef CONFIG_SMP + for_each_cpu(i) + vcpu_prepare(i); + +#endif + /* + * Only resume xenbus /after/ we've prepared our VCPUs; otherwise + * the VCPU hotplug callback can race with our vcpu_prepare. + */ + DEVICE_RESUME(root_bus); + +#ifdef SMP + thread_lock(curthread); + sched_unbind(curthread); + thread_unlock(curthread); + if (map) + restart_cpus(map); +#endif +} + +static void +xen_pv_shutdown_final(void *arg, int howto) +{ + /* + * Inform the hypervisor that shutdown is complete. + * This is not necessary in HVM domains since Xen + * emulates ACPI in that mode and FreeBSD's ACPI + * support will request this transition. + */ + if (howto & (RB_HALT | RB_POWEROFF)) + HYPERVISOR_shutdown(SHUTDOWN_poweroff); + else + HYPERVISOR_shutdown(SHUTDOWN_reboot); +} + +#else +extern void xenpci_resume(void); + +/* HVM mode suspension. */ +static void +xctrl_suspend() +{ + int suspend_cancelled; + + if (DEVICE_SUSPEND(root_bus)) { + printf("xen_suspend: device_suspend failed\n"); + return; + } + + /* + * Make sure we don't change cpus or switch to some other + * thread for the duration. + */ + critical_enter(); + + /* + * Prevent any races with evtchn_interrupt() handler. + */ + irq_suspend(); + disable_intr(); + + suspend_cancelled = HYPERVISOR_suspend(0); + if (!suspend_cancelled) + xenpci_resume(); + + /* + * Re-enable interrupts and put the scheduler back to normal. + */ + enable_intr(); + critical_exit(); + + /* + * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or + * similar.
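 + * Without it there is no way to notify drivers that a suspend was
 + * cancelled and that their hardware state survived, so DEVICE_RESUME
 + * is only issued when the suspend actually took place.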
+ */ + if (!suspend_cancelled) + DEVICE_RESUME(root_bus); +} +#endif + +static void +xctrl_crash() +{ + panic("Xen directed crash"); +} + +static void +xctrl_halt() +{ + shutdown_nice(RB_HALT); +} + +/*------------------------------ Event Reception -----------------------------*/ +static void +xctrl_on_watch_event(struct xs_watch *watch, const char **vec, unsigned int len) +{ + struct xctrl_shutdown_reason *reason; + struct xctrl_shutdown_reason *last_reason; + char *result; + int error; + int result_len; + + error = xs_read(XST_NIL, "control", "shutdown", + &result_len, (void **)&result); + if (error != 0) + return; + + reason = xctrl_shutdown_reasons; + last_reason = reason + NUM_ELEMENTS(xctrl_shutdown_reasons); + while (reason < last_reason) { + + if (!strcmp(result, reason->name)) { + reason->handler(); + break; + } + reason++; + } + + free(result, M_XENSTORE); +} + +/*------------------ Private Device Attachment Functions --------------------*/ /** + * \brief Identify instances of this device type in the system. + * + * \param driver The driver performing this identify action. + * \param parent The NewBus parent device for any devices this method adds. + */ +static void +xctrl_identify(driver_t *driver __unused, device_t parent) +{ + /* + * A single device instance for our driver is always present + * in a system operating under Xen. + */ + BUS_ADD_CHILD(parent, 0, driver->name, 0); +} + +/** + * \brief Probe for the existence of the Xen Control device. + * + * \param dev NewBus device_t for this Xen control instance. + * + * \return Always returns 0 indicating success. + */ +static int +xctrl_probe(device_t dev) +{ + device_set_desc(dev, "Xen Control Device"); + + return (0); +} + +/** + * \brief Attach the Xen control device. + * + * \param dev NewBus device_t for this Xen control instance. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +static int +xctrl_attach(device_t dev) +{ + struct xctrl_softc *xctrl; + + xctrl = device_get_softc(dev); + + /* Activate watch */ + xctrl->xctrl_watch.node = "control/shutdown"; + xctrl->xctrl_watch.callback = xctrl_on_watch_event; + xs_register_watch(&xctrl->xctrl_watch); + +#ifndef XENHVM + EVENTHANDLER_REGISTER(shutdown_final, xen_pv_shutdown_final, NULL, + SHUTDOWN_PRI_LAST); +#endif + + return (0); +} + +/** + * \brief Detach the Xen control device. + * + * \param dev NewBus device_t for this Xen control device instance. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure.
+ */ +static int +xctrl_detach(device_t dev) +{ + struct xctrl_softc *xctrl; + + xctrl = device_get_softc(dev); + + /* Release watch */ + xs_unregister_watch(&xctrl->xctrl_watch); + + return (0); +} + +/*-------------------- Private Device Attachment Data -----------------------*/ +static device_method_t xctrl_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, xctrl_identify), + DEVMETHOD(device_probe, xctrl_probe), + DEVMETHOD(device_attach, xctrl_attach), + DEVMETHOD(device_detach, xctrl_detach), + + { 0, 0 } +}; + +DEFINE_CLASS_0(xctrl, xctrl_driver, xctrl_methods, sizeof(struct xctrl_softc)); +devclass_t xctrl_devclass; + +DRIVER_MODULE(xctrl, xenstore, xctrl_driver, xctrl_devclass, 0, 0); diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c index a6fd9ea..423df97 100644 --- a/sys/dev/xen/netfront/netfront.c +++ b/sys/dev/xen/netfront/netfront.c @@ -91,8 +91,6 @@ __FBSDID("$FreeBSD$"); #define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO) -#define GRANT_INVALID_REF 0 - #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) @@ -373,7 +371,8 @@ xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri) { int i = xennet_rxidx(ri); grant_ref_t ref = np->grant_rx_ref[i]; - np->grant_rx_ref[i] = GRANT_INVALID_REF; + KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n")); + np->grant_rx_ref[i] = GRANT_REF_INVALID; return ref; } @@ -404,7 +403,7 @@ xen_net_read_mac(device_t dev, uint8_t mac[]) int error, i; char *s, *e, *macstr; - error = xenbus_read(XBT_NIL, xenbus_get_node(dev), "mac", NULL, + error = xs_read(XST_NIL, xenbus_get_node(dev), "mac", NULL, (void **) &macstr); if (error) return (error); @@ -413,12 +412,12 @@ xen_net_read_mac(device_t dev, uint8_t mac[]) for (i = 0; i < ETHER_ADDR_LEN; i++) { mac[i] = strtoul(s, &e, 16); if (s == e || (e[0] != ':' && e[0] != 0)) { - free(macstr, M_DEVBUF); + free(macstr, M_XENBUS); return (ENOENT); } s = &e[1]; } - free(macstr, M_DEVBUF); + free(macstr, M_XENBUS); return (0); } @@ -483,7 +482,7 @@ static int talk_to_backend(device_t dev, struct netfront_info *info) { const char *message; - struct xenbus_transaction xbt; + struct xs_transaction xst; const char *node = xenbus_get_node(dev); int err; @@ -499,54 +498,54 @@ talk_to_backend(device_t dev, struct netfront_info *info) goto out; again: - err = xenbus_transaction_start(&xbt); + err = xs_transaction_start(&xst); if (err) { xenbus_dev_fatal(dev, err, "starting transaction"); goto destroy_ring; } - err = xenbus_printf(xbt, node, "tx-ring-ref","%u", + err = xs_printf(xst, node, "tx-ring-ref","%u", info->tx_ring_ref); if (err) { message = "writing tx ring-ref"; goto abort_transaction; } - err = xenbus_printf(xbt, node, "rx-ring-ref","%u", + err = xs_printf(xst, node, "rx-ring-ref","%u", info->rx_ring_ref); if (err) { message = "writing rx ring-ref"; goto abort_transaction; } - err = xenbus_printf(xbt, node, + err = xs_printf(xst, node, "event-channel", "%u", irq_to_evtchn_port(info->irq)); if (err) { message = "writing event-channel"; goto abort_transaction; } - err = xenbus_printf(xbt, node, "request-rx-copy", "%u", + err = xs_printf(xst, node, "request-rx-copy", "%u", info->copying_receiver); if (err) { message = "writing request-rx-copy"; goto abort_transaction; } - err = xenbus_printf(xbt, node, "feature-rx-notify", "%d", 1); + err = xs_printf(xst, node, "feature-rx-notify", "%d", 1); if (err) { message = "writing feature-rx-notify"; goto 
abort_transaction; } - err = xenbus_printf(xbt, node, "feature-sg", "%d", 1); + err = xs_printf(xst, node, "feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } #if __FreeBSD_version >= 700000 - err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1); + err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; goto abort_transaction; } #endif - err = xenbus_transaction_end(xbt, 0); + err = xs_transaction_end(xst, 0); if (err) { if (err == EAGAIN) goto again; @@ -557,7 +556,7 @@ talk_to_backend(device_t dev, struct netfront_info *info) return 0; abort_transaction: - xenbus_transaction_end(xbt, 1); + xs_transaction_end(xst, 1); xenbus_dev_fatal(dev, err, "%s", message); destroy_ring: netif_free(info); @@ -576,8 +575,8 @@ setup_device(device_t dev, struct netfront_info *info) ifp = info->xn_ifp; - info->tx_ring_ref = GRANT_INVALID_REF; - info->rx_ring_ref = GRANT_INVALID_REF; + info->tx_ring_ref = GRANT_REF_INVALID; + info->rx_ring_ref = GRANT_REF_INVALID; info->rx.sring = NULL; info->tx.sring = NULL; info->irq = 0; @@ -750,7 +749,7 @@ netif_release_tx_bufs(struct netfront_info *np) GNTMAP_readonly); gnttab_release_grant_reference(&np->gref_tx_head, np->grant_tx_ref[i]); - np->grant_tx_ref[i] = GRANT_INVALID_REF; + np->grant_tx_ref[i] = GRANT_REF_INVALID; add_id_to_freelist(np->tx_mbufs, i); np->xn_cdata.xn_tx_chain_cnt--; if (np->xn_cdata.xn_tx_chain_cnt < 0) { @@ -854,7 +853,8 @@ refill: sc->rx_mbufs[id] = m_new; ref = gnttab_claim_grant_reference(&sc->gref_rx_head); - KASSERT((short)ref >= 0, ("negative ref")); + KASSERT(ref != GNTTAB_LIST_END, + ("reserved grant references exhausted")); sc->grant_rx_ref[id] = ref; vaddr = mtod(m_new, vm_offset_t); @@ -1135,7 +1135,7 @@ xn_txeof(struct netfront_info *np) np->grant_tx_ref[id]); gnttab_release_grant_reference( &np->gref_tx_head, np->grant_tx_ref[id]); - np->grant_tx_ref[id] = GRANT_INVALID_REF; + np->grant_tx_ref[id] = GRANT_REF_INVALID; np->tx_mbufs[id] = NULL; add_id_to_freelist(np->tx_mbufs, id); @@ -1318,12 +1318,13 @@ xennet_get_responses(struct netfront_info *np, * the backend driver. In future this should flag the bad * situation to the system controller to reboot the backend. */ - if (ref == GRANT_INVALID_REF) { + if (ref == GRANT_REF_INVALID) { #if 0 if (net_ratelimit()) WPRINTK("Bad rx response id %d.\n", rx->id); #endif + printf("%s: Bad rx response id %d.\n", __func__, rx->id); err = EINVAL; goto next; } @@ -1384,7 +1385,7 @@ next_skip_queue: err = ENOENT; printf("%s: cons %u frags %u rp %u, not enough frags\n", __func__, *cons, frags, rp); - break; + break; } /* * Note that m can be NULL, if rx->status < 0 or if @@ -1526,6 +1527,11 @@ xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head) * tell the TCP stack to generate a shorter chain of packets.
*/ if (nfrags > MAX_TX_REQ_FRAGS) { +#ifdef DEBUG + printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback " + "won't be able to handle it, dropping\n", + __func__, nfrags, MAX_TX_REQ_FRAGS); +#endif m_freem(m_head); return (EMSGSIZE); } @@ -1881,11 +1887,11 @@ network_connect(struct netfront_info *np) netif_rx_request_t *req; u_int feature_rx_copy, feature_rx_flip; - error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev), + error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-copy", NULL, "%u", &feature_rx_copy); if (error) feature_rx_copy = 0; - error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev), + error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-flip", NULL, "%u", &feature_rx_flip); if (error) feature_rx_flip = 1; @@ -1999,14 +2005,14 @@ create_netdev(device_t dev) /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ for (i = 0; i <= NET_TX_RING_SIZE; i++) { np->tx_mbufs[i] = (void *) ((u_long) i+1); - np->grant_tx_ref[i] = GRANT_INVALID_REF; + np->grant_tx_ref[i] = GRANT_REF_INVALID; } np->tx_mbufs[NET_TX_RING_SIZE] = (void *)0; for (i = 0; i <= NET_RX_RING_SIZE; i++) { np->rx_mbufs[i] = NULL; - np->grant_rx_ref[i] = GRANT_INVALID_REF; + np->grant_rx_ref[i] = GRANT_REF_INVALID; } /* A grant for every tx ring slot */ if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, @@ -2128,8 +2134,8 @@ netif_disconnect_backend(struct netfront_info *info) end_access(info->tx_ring_ref, info->tx.sring); end_access(info->rx_ring_ref, info->rx.sring); - info->tx_ring_ref = GRANT_INVALID_REF; - info->rx_ring_ref = GRANT_INVALID_REF; + info->tx_ring_ref = GRANT_REF_INVALID; + info->rx_ring_ref = GRANT_REF_INVALID; info->tx.sring = NULL; info->rx.sring = NULL; @@ -2143,7 +2149,7 @@ netif_disconnect_backend(struct netfront_info *info) static void end_access(int ref, void *page) { - if (ref != GRANT_INVALID_REF) + if (ref != GRANT_REF_INVALID) gnttab_end_foreign_access(ref, page); } @@ -2171,7 +2177,7 @@ static device_method_t netfront_methods[] = { DEVMETHOD(device_resume, netfront_resume), /* Xenbus interface */ - DEVMETHOD(xenbus_backend_changed, netfront_backend_changed), + DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed), { 0, 0 } }; @@ -2183,4 +2189,4 @@ static driver_t netfront_driver = { }; devclass_t netfront_devclass; -DRIVER_MODULE(xe, xenbus, netfront_driver, netfront_devclass, 0, 0); +DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, 0, 0); diff --git a/sys/dev/xen/xenpci/evtchn.c b/sys/dev/xen/xenpci/evtchn.c index bdf3ad1..ea53a7e 100644 --- a/sys/dev/xen/xenpci/evtchn.c +++ b/sys/dev/xen/xenpci/evtchn.c @@ -181,6 +181,49 @@ bind_listening_port_to_irqhandler(unsigned int remote_domain, return (0); } +int +bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, const char *devname, driver_intr_t handler, + void *arg, unsigned long irqflags, unsigned int *irqp) +{ + struct evtchn_bind_interdomain bind_interdomain; + int irq; + int error; + + irq = alloc_xen_irq(); + if (irq < 0) + return (irq); + + mtx_lock(&irq_evtchn[irq].lock); + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + if (error) { + mtx_unlock(&irq_evtchn[irq].lock); + free_xen_irq(irq); + return (-error); + } + + irq_evtchn[irq].handler = handler; + irq_evtchn[irq].arg = arg; + irq_evtchn[irq].evtchn = bind_interdomain.local_port; +
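+ /* Close this channel on unbind and honor the caller's INTR_MPSAFE. */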
irq_evtchn[irq].close = 1; + irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0; + + evtchn_to_irq[bind_interdomain.local_port] = irq; + + unmask_evtchn(bind_interdomain.local_port); + + mtx_unlock(&irq_evtchn[irq].lock); + + if (irqp) + *irqp = irq; + return (0); +} + + int bind_caller_port_to_irqhandler(unsigned int caller_port, const char *devname, driver_intr_t handler, void *arg, diff --git a/sys/dev/xen/xenpci/xenpci.c b/sys/dev/xen/xenpci/xenpci.c index 2f2a79f..f4c9f73 100644 --- a/sys/dev/xen/xenpci/xenpci.c +++ b/sys/dev/xen/xenpci/xenpci.c @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$"); char *hypercall_stubs; shared_info_t *HYPERVISOR_shared_info; static vm_paddr_t shared_info_pa; +static device_t nexus; /* * This is used to find our platform device instance. @@ -80,7 +81,7 @@ xenpci_cpuid_base(void) { uint32_t base, regs[4]; - for (base = 0x40000000; base < 0x40001000; base += 0x100) { + for (base = 0x40000000; base < 0x40010000; base += 0x100) { do_cpuid(base, regs); if (!memcmp("XenVMMXenVMM", &regs[1], 12) && (regs[0] - base) >= 2) @@ -204,14 +205,21 @@ xenpci_allocate_resources(device_t dev) scp->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &scp->rid_irq, RF_SHAREABLE|RF_ACTIVE); - if (scp->res_irq == NULL) + if (scp->res_irq == NULL) { + printf("xenpci: Could not allocate irq.\n"); goto errexit; + } scp->rid_memory = PCIR_BAR(1); scp->res_memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &scp->rid_memory, RF_ACTIVE); - if (scp->res_memory == NULL) + if (scp->res_memory == NULL) { + printf("xenpci: Could not allocate memory bar.\n"); goto errexit; + } + + scp->phys_next = rman_get_start(scp->res_memory); + return (0); errexit: @@ -254,6 +262,36 @@ xenpci_alloc_space(size_t sz, vm_paddr_t *pa) } } +static struct resource * +xenpci_alloc_resource(device_t dev, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + return (BUS_ALLOC_RESOURCE(nexus, child, type, rid, start, + end, count, flags)); +} + + +static int +xenpci_release_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + return (BUS_RELEASE_RESOURCE(nexus, child, type, rid, r)); +} + +static int +xenpci_activate_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + return (BUS_ACTIVATE_RESOURCE(nexus, child, type, rid, r)); +} + +static int +xenpci_deactivate_resource(device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + return (BUS_DEACTIVATE_RESOURCE(nexus, child, type, rid, r)); +} + /* * Called very early in the resume sequence - reinitialise the various * bits of Xen machinery including the hypercall page and the shared @@ -303,20 +341,36 @@ xenpci_probe(device_t dev) static int xenpci_attach(device_t dev) { - int error; + int error; struct xenpci_softc *scp = device_get_softc(dev); struct xen_add_to_physmap xatp; vm_offset_t shared_va; + devclass_t dc; + + /* + * Find and record nexus0. Since we are not really on the + * PCI bus, all resource operations are directed to nexus + * instead of through our parent.
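 + * The xenpci bus_*_resource methods above are thin wrappers that
 + * forward every request to the nexus device recorded here.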
+ */ + if ((dc = devclass_find("nexus")) == 0 + || (nexus = devclass_get_device(dc, 0)) == 0) { + device_printf(dev, "unable to find nexus.\n"); + return (ENOENT); + } error = xenpci_allocate_resources(dev); - if (error) + if (error) { + device_printf(dev, "xenpci_allocate_resources failed(%d).\n", + error); goto errexit; - - scp->phys_next = rman_get_start(scp->res_memory); + } error = xenpci_init_hypercall_stubs(dev, scp); - if (error) + if (error) { + device_printf(dev, "xenpci_init_hypercall_stubs failed(%d).\n", + error); goto errexit; + } setup_xen_features(); @@ -346,7 +400,7 @@ errexit: * Undo anything we may have done. */ xenpci_deallocate_resources(dev); - return (error); + return (error); } /* @@ -364,8 +418,9 @@ xenpci_detach(device_t dev) */ if (scp->intr_cookie != NULL) { if (BUS_TEARDOWN_INTR(parent, dev, - scp->res_irq, scp->intr_cookie) != 0) - printf("intr teardown failed.. continuing\n"); + scp->res_irq, scp->intr_cookie) != 0) + device_printf(dev, + "intr teardown failed.. continuing\n"); scp->intr_cookie = NULL; } @@ -386,6 +441,10 @@ static device_method_t xenpci_methods[] = { /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), + DEVMETHOD(bus_alloc_resource, xenpci_alloc_resource), + DEVMETHOD(bus_release_resource, xenpci_release_resource), + DEVMETHOD(bus_activate_resource, xenpci_activate_resource), + DEVMETHOD(bus_deactivate_resource, xenpci_deactivate_resource), { 0, 0 } }; diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 5e50d3e..0d35d1d 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -1933,7 +1933,15 @@ again: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - r = VOP_LOOKUP(vp, &nvp, &cn); + if ((vp->v_vflag & VV_ROOT) != 0 + && (cn.cn_flags & ISDOTDOT) + != 0) { + vref(vp); + nvp = vp; + r = 0; + } else + r = VOP_LOOKUP(vp, &nvp, + &cn); } } if (!r) { diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c index 05402ce..3b9ffdd81b 100644 --- a/sys/geom/eli/g_eli.c +++ b/sys/geom/eli/g_eli.c @@ -106,7 +106,7 @@ struct g_class g_eli_class = { /* * Code paths: * BIO_READ: - * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver + * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ @@ -148,7 +148,7 @@ g_eli_crypto_rerun(struct cryptop *crp) /* * The function is called after reading encrypted data from the provider.
* - * g_eli_start -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver + * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver */ void g_eli_read_done(struct bio *bp) @@ -167,6 +167,7 @@ g_eli_read_done(struct bio *bp) if (pbp->bio_inbed < pbp->bio_children) return; g_destroy_bio(bp); + sc = pbp->bio_to->geom->softc; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed", __func__); pbp->bio_completed = 0; @@ -175,9 +176,9 @@ g_eli_read_done(struct bio *bp) pbp->bio_driver2 = NULL; } g_io_deliver(pbp, pbp->bio_error); + atomic_subtract_int(&sc->sc_inflight, 1); return; } - sc = pbp->bio_to->geom->softc; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); @@ -192,6 +193,7 @@ g_eli_read_done(struct bio *bp) void g_eli_write_done(struct bio *bp) { + struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); @@ -218,7 +220,9 @@ g_eli_write_done(struct bio *bp) * Write is finished, send it up. */ pbp->bio_completed = pbp->bio_length; + sc = pbp->bio_to->geom->softc; g_io_deliver(pbp, pbp->bio_error); + atomic_subtract_int(&sc->sc_inflight, 1); } /* @@ -241,12 +245,14 @@ g_eli_orphan(struct g_consumer *cp) sc = cp->geom->softc; if (sc == NULL) return; - g_eli_destroy(sc, 1); + g_eli_destroy(sc, TRUE); } /* - * BIO_READ : G_ELI_START -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver - * BIO_WRITE: G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver + * BIO_READ: + * G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver + * BIO_WRITE: + * G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ static void g_eli_start(struct bio *bp) @@ -282,24 +288,16 @@ g_eli_start(struct bio *bp) g_io_deliver(bp, ENOMEM); return; } + bp->bio_driver1 = cbp; + bp->bio_pflags = G_ELI_NEW_BIO; switch (bp->bio_cmd) { case BIO_READ: if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) { - bp->bio_driver2 = NULL; - cbp->bio_done = g_eli_read_done; - cp = LIST_FIRST(&sc->sc_geom->consumer); - cbp->bio_to = cp->provider; - G_ELI_LOGREQ(2, cbp, "Sending request."); - /* - * Read encrypted data from provider. 
- */ - g_io_request(cbp, cp); + g_eli_crypto_read(sc, bp, 0); break; } - bp->bio_pflags = 255; /* FALLTHROUGH */ case BIO_WRITE: - bp->bio_driver1 = cbp; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); @@ -316,6 +314,104 @@ g_eli_start(struct bio *bp) } } +static int +g_eli_newsession(struct g_eli_worker *wr) +{ + struct g_eli_softc *sc; + struct cryptoini crie, cria; + int error; + + sc = wr->w_softc; + + bzero(&crie, sizeof(crie)); + crie.cri_alg = sc->sc_ealgo; + crie.cri_klen = sc->sc_ekeylen; + if (sc->sc_ealgo == CRYPTO_AES_XTS) + crie.cri_klen <<= 1; + crie.cri_key = sc->sc_ekeys[0]; + if (sc->sc_flags & G_ELI_FLAG_AUTH) { + bzero(&cria, sizeof(cria)); + cria.cri_alg = sc->sc_aalgo; + cria.cri_klen = sc->sc_akeylen; + cria.cri_key = sc->sc_akey; + crie.cri_next = &cria; + } + + switch (sc->sc_crypto) { + case G_ELI_CRYPTO_SW: + error = crypto_newsession(&wr->w_sid, &crie, + CRYPTOCAP_F_SOFTWARE); + break; + case G_ELI_CRYPTO_HW: + error = crypto_newsession(&wr->w_sid, &crie, + CRYPTOCAP_F_HARDWARE); + break; + case G_ELI_CRYPTO_UNKNOWN: + error = crypto_newsession(&wr->w_sid, &crie, + CRYPTOCAP_F_HARDWARE); + if (error == 0) { + mtx_lock(&sc->sc_queue_mtx); + if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) + sc->sc_crypto = G_ELI_CRYPTO_HW; + mtx_unlock(&sc->sc_queue_mtx); + } else { + error = crypto_newsession(&wr->w_sid, &crie, + CRYPTOCAP_F_SOFTWARE); + mtx_lock(&sc->sc_queue_mtx); + if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) + sc->sc_crypto = G_ELI_CRYPTO_SW; + mtx_unlock(&sc->sc_queue_mtx); + } + break; + default: + panic("%s: invalid condition", __func__); + } + + return (error); +} + +static void +g_eli_freesession(struct g_eli_worker *wr) +{ + + crypto_freesession(wr->w_sid); +} + +static void +g_eli_cancel(struct g_eli_softc *sc) +{ + struct bio *bp; + + mtx_assert(&sc->sc_queue_mtx, MA_OWNED); + + while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) { + KASSERT(bp->bio_pflags == G_ELI_NEW_BIO, + ("Not new bio when canceling (bp=%p).", bp)); + g_io_deliver(bp, ENXIO); + } +} + +static struct bio * +g_eli_takefirst(struct g_eli_softc *sc) +{ + struct bio *bp; + + mtx_assert(&sc->sc_queue_mtx, MA_OWNED); + + if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) + return (bioq_takefirst(&sc->sc_queue)); + /* + * Device suspended, so we skip new I/O requests. + */ + TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { + if (bp->bio_pflags != G_ELI_NEW_BIO) + break; + } + if (bp != NULL) + bioq_remove(&sc->sc_queue, bp); + return (bp); +} + /* * This is the main function for kernel worker thread when we don't have * hardware acceleration and we have to do cryptography in software. @@ -328,6 +424,7 @@ g_eli_worker(void *arg) struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; + int error; wr = arg; sc = wr->w_softc; @@ -349,11 +446,13 @@ g_eli_worker(void *arg) for (;;) { mtx_lock(&sc->sc_queue_mtx); - bp = bioq_takefirst(&sc->sc_queue); +again: + bp = g_eli_takefirst(sc); if (bp == NULL) { if (sc->sc_flags & G_ELI_FLAG_DESTROY) { + g_eli_cancel(sc); LIST_REMOVE(wr, w_next); - crypto_freesession(wr->w_sid); + g_eli_freesession(wr); free(wr, M_ELI); G_ELI_DEBUG(1, "Thread %s exiting.", curthread->td_proc->p_comm); @@ -361,16 +460,63 @@ g_eli_worker(void *arg) mtx_unlock(&sc->sc_queue_mtx); kproc_exit(0); } + while (sc->sc_flags & G_ELI_FLAG_SUSPEND) { + if (sc->sc_inflight > 0) { + G_ELI_DEBUG(0, "inflight=%d", sc->sc_inflight); + /* + * We still have inflight BIOs, so + * sleep and retry. 
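 + * The crypto completion handlers decrement sc_inflight as requests
 + * are delivered, so this loop is guaranteed to drain.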
+ */ + msleep(sc, &sc->sc_queue_mtx, PRIBIO, + "geli:inf", hz / 5); + goto again; + } + /* + * Suspend requested, mark the worker as + * suspended and go to sleep. + */ + if (wr->w_active) { + g_eli_freesession(wr); + wr->w_active = FALSE; + } + wakeup(&sc->sc_workers); + msleep(sc, &sc->sc_queue_mtx, PRIBIO, + "geli:suspend", 0); + if (!wr->w_active && + !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) { + error = g_eli_newsession(wr); + KASSERT(error == 0, + ("g_eli_newsession() failed on resume (error=%d)", + error)); + wr->w_active = TRUE; + } + goto again; + } msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0); continue; } + if (bp->bio_pflags == G_ELI_NEW_BIO) + atomic_add_int(&sc->sc_inflight, 1); mtx_unlock(&sc->sc_queue_mtx); - if (bp->bio_cmd == BIO_READ && bp->bio_pflags == 255) - g_eli_auth_read(sc, bp); - else if (sc->sc_flags & G_ELI_FLAG_AUTH) - g_eli_auth_run(wr, bp); - else - g_eli_crypto_run(wr, bp); + if (bp->bio_pflags == G_ELI_NEW_BIO) { + bp->bio_pflags = 0; + if (sc->sc_flags & G_ELI_FLAG_AUTH) { + if (bp->bio_cmd == BIO_READ) + g_eli_auth_read(sc, bp); + else + g_eli_auth_run(wr, bp); + } else { + if (bp->bio_cmd == BIO_READ) + g_eli_crypto_read(sc, bp, 1); + else + g_eli_crypto_run(wr, bp); + } + } else { + if (sc->sc_flags & G_ELI_FLAG_AUTH) + g_eli_auth_run(wr, bp); + else + g_eli_crypto_run(wr, bp); + } } } @@ -500,7 +646,7 @@ g_eli_last_close(struct g_eli_softc *sc) gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); strlcpy(ppname, pp->name, sizeof(ppname)); - error = g_eli_destroy(sc, 1); + error = g_eli_destroy(sc, TRUE); KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).", ppname, error)); G_ELI_DEBUG(0, "Detached %s on last close.", ppname); @@ -557,7 +703,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; - struct cryptoini crie, cria; u_int i, threads; int error; @@ -584,7 +729,8 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, else gp->access = g_std_access; - sc->sc_crypto = G_ELI_CRYPTO_SW; + sc->sc_inflight = 0; + sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN; sc->sc_flags = md->md_flags; /* Backward compatibility. */ if (md->md_version < 4) @@ -612,14 +758,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, sc->sc_bytes_per_sector = (md->md_sectorsize - 1) / sc->sc_data_per_sector + 1; sc->sc_bytes_per_sector *= bpp->sectorsize; - /* - * Precalculate SHA256 for HMAC key generation. - * This is expensive operation and we can do it only once now or - * for every access to sector, so now will be much better. - */ - SHA256_Init(&sc->sc_akeyctx); - SHA256_Update(&sc->sc_akeyctx, sc->sc_akey, - sizeof(sc->sc_akey)); } gp->softc = sc; @@ -679,7 +817,16 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, */ g_eli_mkey_propagate(sc, mkey); sc->sc_ekeylen = md->md_keylen; - + if (sc->sc_flags & G_ELI_FLAG_AUTH) { + /* + * Precalculate SHA256 for HMAC key generation. + * This is an expensive operation and we can do it only once + * now or for every access to a sector, so doing it now is + * much better. + */ + SHA256_Init(&sc->sc_akeyctx); + SHA256_Update(&sc->sc_akeyctx, sc->sc_akey, + sizeof(sc->sc_akey)); + } /* * Precalculate SHA256 for IV generation.
* This is expensive operation and we can do it only once now or for @@ -697,20 +844,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, LIST_INIT(&sc->sc_workers); - bzero(&crie, sizeof(crie)); - crie.cri_alg = sc->sc_ealgo; - crie.cri_klen = sc->sc_ekeylen; - if (sc->sc_ealgo == CRYPTO_AES_XTS) - crie.cri_klen <<= 1; - crie.cri_key = sc->sc_ekeys[0]; - if (sc->sc_flags & G_ELI_FLAG_AUTH) { - bzero(&cria, sizeof(cria)); - cria.cri_alg = sc->sc_aalgo; - cria.cri_klen = sc->sc_akeylen; - cria.cri_key = sc->sc_akey; - crie.cri_next = &cria; - } - threads = g_eli_threads; if (threads == 0) threads = mp_ncpus; @@ -728,21 +861,9 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO); wr->w_softc = sc; wr->w_number = i; + wr->w_active = TRUE; - /* - * If this is the first pass, try to get hardware support. - * Use software cryptography, if we cannot get it. - */ - if (LIST_EMPTY(&sc->sc_workers)) { - error = crypto_newsession(&wr->w_sid, &crie, - CRYPTOCAP_F_HARDWARE); - if (error == 0) - sc->sc_crypto = G_ELI_CRYPTO_HW; - } - if (sc->sc_crypto == G_ELI_CRYPTO_SW) { - error = crypto_newsession(&wr->w_sid, &crie, - CRYPTOCAP_F_SOFTWARE); - } + error = g_eli_newsession(wr); if (error != 0) { free(wr, M_ELI); if (req != NULL) { @@ -758,7 +879,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0, "g_eli[%u] %s", i, bpp->name); if (error != 0) { - crypto_freesession(wr->w_sid); + g_eli_freesession(wr); free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot create kernel thread " @@ -875,7 +996,7 @@ g_eli_destroy_geom(struct gctl_req *req __unused, struct g_eli_softc *sc; sc = gp->softc; - return (g_eli_destroy(sc, 0)); + return (g_eli_destroy(sc, FALSE)); } static int @@ -1106,6 +1227,7 @@ g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, sbuf_printf(sb, name); \ } \ } while (0) + ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND"); ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY"); ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER"); ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME"); @@ -1167,7 +1289,7 @@ g_eli_shutdown_pre_sync(void *arg, int howto) pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name)); if (pp->acr + pp->acw + pp->ace == 0) - error = g_eli_destroy(sc, 1); + error = g_eli_destroy(sc, TRUE); else { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h index e6b311d..d67e436 100644 --- a/sys/geom/eli/g_eli.h +++ b/sys/geom/eli/g_eli.h @@ -86,6 +86,10 @@ #define G_ELI_FLAG_NATIVE_BYTE_ORDER 0x00040000 /* Provider uses single encryption key. */ #define G_ELI_FLAG_SINGLE_KEY 0x00080000 +/* Device suspended. 
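 + * While set, worker threads park and new BIOs wait on the queue;
 + * only requests already in flight are completed.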
 */ +#define G_ELI_FLAG_SUSPEND 0x00100000 + +#define G_ELI_NEW_BIO 255 #define SHA512_MDLEN 64 #define G_ELI_AUTH_SECKEYLEN SHA256_DIGEST_LENGTH @@ -109,6 +113,7 @@ extern int g_eli_debug; extern u_int g_eli_overwrites; extern u_int g_eli_batch; +#define G_ELI_CRYPTO_UNKNOWN 0 #define G_ELI_CRYPTO_HW 1 #define G_ELI_CRYPTO_SW 2 @@ -140,6 +145,7 @@ struct g_eli_worker { struct proc *w_proc; u_int w_number; uint64_t w_sid; + boolean_t w_active; LIST_ENTRY(g_eli_worker) w_next; }; @@ -160,6 +166,7 @@ struct g_eli_softc { SHA256_CTX sc_ivctx; int sc_nkey; uint32_t sc_flags; + int sc_inflight; off_t sc_mediasize; size_t sc_sectorsize; u_int sc_bytes_per_sector; @@ -499,6 +506,7 @@ uint8_t *g_eli_crypto_key(struct g_eli_softc *sc, off_t offset, void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, size_t size); +void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker); void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp); void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp); diff --git a/sys/geom/eli/g_eli_ctl.c b/sys/geom/eli/g_eli_ctl.c index 02ede13..7147b27 100644 --- a/sys/geom/eli/g_eli_ctl.c +++ b/sys/geom/eli/g_eli_ctl.c @@ -217,7 +217,7 @@ g_eli_ctl_detach(struct gctl_req *req, struct g_class *mp) sc->sc_flags |= G_ELI_FLAG_RW_DETACH; sc->sc_geom->access = g_eli_access; } else { - error = g_eli_destroy(sc, *force); + error = g_eli_destroy(sc, *force ? TRUE : FALSE); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", @@ -700,6 +700,213 @@ g_eli_ctl_delkey(struct gctl_req *req, struct g_class *mp) } static int +g_eli_suspend_one(struct g_eli_softc *sc) +{ + struct g_eli_worker *wr; + + g_topology_assert(); + + if (sc == NULL) + return (ENOENT); + if (sc->sc_flags & G_ELI_FLAG_ONETIME) + return (EOPNOTSUPP); + + mtx_lock(&sc->sc_queue_mtx); + if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { + mtx_unlock(&sc->sc_queue_mtx); + return (EALREADY); + } + sc->sc_flags |= G_ELI_FLAG_SUSPEND; + wakeup(sc); + for (;;) { + LIST_FOREACH(wr, &sc->sc_workers, w_next) { + if (wr->w_active) + break; + } + if (wr == NULL) + break; + /* Not all threads suspended. */ + msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, + "geli:suspend", 0); + } + /* + * Clear sensitive data on suspend; they will be recovered on resume.
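 + * The keys are reconstructed from the user-supplied key by
 + * g_eli_ctl_resume() via g_eli_mkey_propagate().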
+ */ + bzero(sc->sc_mkey, sizeof(sc->sc_mkey)); + bzero(sc->sc_ekeys, + sc->sc_nekeys * (sizeof(uint8_t *) + G_ELI_DATAKEYLEN)); + free(sc->sc_ekeys, M_ELI); + sc->sc_ekeys = NULL; + bzero(sc->sc_akey, sizeof(sc->sc_akey)); + bzero(&sc->sc_akeyctx, sizeof(sc->sc_akeyctx)); + bzero(sc->sc_ivkey, sizeof(sc->sc_ivkey)); + bzero(&sc->sc_ivctx, sizeof(sc->sc_ivctx)); + mtx_unlock(&sc->sc_queue_mtx); + G_ELI_DEBUG(0, "%s has been suspended.", sc->sc_name); + return (0); +} + +static void +g_eli_ctl_suspend(struct gctl_req *req, struct g_class *mp) +{ + struct g_eli_softc *sc; + int *all, *nargs; + int error; + + g_topology_assert(); + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + all = gctl_get_paraml(req, "all", sizeof(*all)); + if (all == NULL) { + gctl_error(req, "No '%s' argument.", "all"); + return; + } + if (!*all && *nargs == 0) { + gctl_error(req, "Too few arguments."); + return; + } + + if (*all) { + struct g_geom *gp, *gp2; + + LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { + sc = gp->softc; + if (sc->sc_flags & G_ELI_FLAG_ONETIME) + continue; + error = g_eli_suspend_one(sc); + if (error != 0) + gctl_error(req, "Not fully done."); + } + } else { + const char *prov; + char param[16]; + int i; + + for (i = 0; i < *nargs; i++) { + snprintf(param, sizeof(param), "arg%d", i); + prov = gctl_get_asciiparam(req, param); + if (prov == NULL) { + G_ELI_DEBUG(0, "No 'arg%d' argument.", i); + continue; + } + + sc = g_eli_find_device(mp, prov); + if (sc == NULL) { + G_ELI_DEBUG(0, "No such provider: %s.", prov); + continue; + } + error = g_eli_suspend_one(sc); + if (error != 0) + gctl_error(req, "Not fully done."); + } + } +} + +static void +g_eli_ctl_resume(struct gctl_req *req, struct g_class *mp) +{ + struct g_eli_metadata md; + struct g_eli_softc *sc; + struct g_provider *pp; + struct g_consumer *cp; + const char *name; + u_char *key, mkey[G_ELI_DATAIVKEYLEN]; + int *nargs, keysize, error; + u_int nkey; + + g_topology_assert(); + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument.", "nargs"); + return; + } + if (*nargs != 1) { + gctl_error(req, "Invalid number of arguments."); + return; + } + + name = gctl_get_asciiparam(req, "arg0"); + if (name == NULL) { + gctl_error(req, "No 'arg%u' argument.", 0); + return; + } + sc = g_eli_find_device(mp, name); + if (sc == NULL) { + gctl_error(req, "Provider %s is invalid.", name); + return; + } + if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) { + gctl_error(req, "Provider %s not suspended.", name); + return; + } + cp = LIST_FIRST(&sc->sc_geom->consumer); + pp = cp->provider; + error = g_eli_read_metadata(mp, pp, &md); + if (error != 0) { + gctl_error(req, "Cannot read metadata from %s (error=%d).", + name, error); + return; + } + if (md.md_keys == 0x00) { + bzero(&md, sizeof(md)); + gctl_error(req, "No valid keys on %s.", pp->name); + return; + } + + key = gctl_get_param(req, "key", &keysize); + if (key == NULL || keysize != G_ELI_USERKEYLEN) { + bzero(&md, sizeof(md)); + gctl_error(req, "No '%s' argument.", "key"); + return; + } + + error = g_eli_mkey_decrypt(&md, key, mkey, &nkey); + bzero(key, keysize); + if (error == -1) { + bzero(&md, sizeof(md)); + gctl_error(req, "Wrong key for %s.", pp->name); + return; + } else if (error > 0) { + bzero(&md, sizeof(md)); + gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", + pp->name, error); + return; + } + G_ELI_DEBUG(1, "Using Master Key %u 
for %s.", nkey, pp->name); + + mtx_lock(&sc->sc_queue_mtx); + /* Restore sc_mkey, sc_ekeys, sc_akey and sc_ivkey. */ + g_eli_mkey_propagate(sc, mkey); + bzero(mkey, sizeof(mkey)); + bzero(&md, sizeof(md)); + /* Restore sc_akeyctx. */ + if (sc->sc_flags & G_ELI_FLAG_AUTH) { + SHA256_Init(&sc->sc_akeyctx); + SHA256_Update(&sc->sc_akeyctx, sc->sc_akey, + sizeof(sc->sc_akey)); + } + /* Restore sc_ivctx. */ + switch (sc->sc_ealgo) { + case CRYPTO_AES_XTS: + break; + default: + SHA256_Init(&sc->sc_ivctx); + SHA256_Update(&sc->sc_ivctx, sc->sc_ivkey, + sizeof(sc->sc_ivkey)); + break; + } + sc->sc_flags &= ~G_ELI_FLAG_SUSPEND; + mtx_unlock(&sc->sc_queue_mtx); + G_ELI_DEBUG(1, "Resumed %s.", pp->name); + wakeup(sc); +} + +static int g_eli_kill_one(struct g_eli_softc *sc) { struct g_provider *pp; @@ -749,7 +956,7 @@ g_eli_kill_one(struct g_eli_softc *sc) } if (error == 0) G_ELI_DEBUG(0, "%s has been killed.", pp->name); - g_eli_destroy(sc, 1); + g_eli_destroy(sc, TRUE); return (error); } @@ -839,6 +1046,10 @@ g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb) g_eli_ctl_setkey(req, mp); else if (strcmp(verb, "delkey") == 0) g_eli_ctl_delkey(req, mp); + else if (strcmp(verb, "suspend") == 0) + g_eli_ctl_suspend(req, mp); + else if (strcmp(verb, "resume") == 0) + g_eli_ctl_resume(req, mp); else if (strcmp(verb, "kill") == 0) g_eli_ctl_kill(req, mp); else diff --git a/sys/geom/eli/g_eli_integrity.c b/sys/geom/eli/g_eli_integrity.c index bafce96..24586bd 100644 --- a/sys/geom/eli/g_eli_integrity.c +++ b/sys/geom/eli/g_eli_integrity.c @@ -129,6 +129,7 @@ g_eli_auth_keygen(struct g_eli_softc *sc, off_t offset, u_char *key) static int g_eli_auth_read_done(struct cryptop *crp) { + struct g_eli_softc *sc; struct bio *bp; if (crp->crp_etype == EAGAIN) { @@ -152,8 +153,8 @@ g_eli_auth_read_done(struct cryptop *crp) */ if (bp->bio_inbed < bp->bio_children) return (0); + sc = bp->bio_to->geom->softc; if (bp->bio_error == 0) { - struct g_eli_softc *sc; u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize; u_char *srcdata, *dstdata, *auth; off_t coroff, corsize; @@ -161,7 +162,6 @@ g_eli_auth_read_done(struct cryptop *crp) /* * Verify data integrity based on calculated and read HMACs. */ - sc = bp->bio_to->geom->softc; /* Sectorsize of decrypted provider eg. 4096. */ decr_secsize = bp->bio_to->sectorsize; /* The real sectorsize of encrypted provider, eg. 512. */ @@ -240,6 +240,7 @@ g_eli_auth_read_done(struct cryptop *crp) * Read is finished, send it up. */ g_io_deliver(bp, bp->bio_error); + atomic_subtract_int(&sc->sc_inflight, 1); return (0); } @@ -276,6 +277,7 @@ g_eli_auth_write_done(struct cryptop *crp) */ if (bp->bio_inbed < bp->bio_children) return (0); + sc = bp->bio_to->geom->softc; if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); @@ -285,9 +287,9 @@ g_eli_auth_write_done(struct cryptop *crp) bp->bio_driver1 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); + atomic_subtract_int(&sc->sc_inflight, 1); return (0); } - sc = bp->bio_to->geom->softc; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp = bp->bio_driver1; bp->bio_driver1 = NULL; @@ -392,6 +394,11 @@ g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp) /* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). 
+ * + * BIO_READ: + * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> G_ELI_AUTH_RUN -> g_eli_auth_read_done -> g_io_deliver + * BIO_WRITE: + * g_eli_start -> G_ELI_AUTH_RUN -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ void g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp) diff --git a/sys/geom/eli/g_eli_privacy.c b/sys/geom/eli/g_eli_privacy.c index a6f572b..ee133c6 100644 --- a/sys/geom/eli/g_eli_privacy.c +++ b/sys/geom/eli/g_eli_privacy.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); /* * Code paths: * BIO_READ: - * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver + * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ @@ -63,11 +63,12 @@ MALLOC_DECLARE(M_ELI); /* * The function is called after we read and decrypt data. * - * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver + * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver */ static int g_eli_crypto_read_done(struct cryptop *crp) { + struct g_eli_softc *sc; struct bio *bp; if (crp->crp_etype == EAGAIN) { @@ -101,7 +102,9 @@ g_eli_crypto_read_done(struct cryptop *crp) /* * Read is finished, send it up. */ + sc = bp->bio_to->geom->softc; g_io_deliver(bp, bp->bio_error); + atomic_subtract_int(&sc->sc_inflight, 1); return (0); } @@ -113,6 +116,7 @@ g_eli_crypto_read_done(struct cryptop *crp) static int g_eli_crypto_write_done(struct cryptop *crp) { + struct g_eli_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *bp, *cbp; @@ -141,18 +145,20 @@ g_eli_crypto_write_done(struct cryptop *crp) bp->bio_children = 1; cbp = bp->bio_driver1; bp->bio_driver1 = NULL; + gp = bp->bio_to->geom; if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; g_destroy_bio(cbp); + sc = gp->softc; g_io_deliver(bp, bp->bio_error); + atomic_subtract_int(&sc->sc_inflight, 1); return (0); } cbp->bio_data = bp->bio_driver2; cbp->bio_done = g_eli_write_done; - gp = bp->bio_to->geom; cp = LIST_FIRST(&gp->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); @@ -164,8 +170,57 @@ g_eli_crypto_write_done(struct cryptop *crp) } /* + * The function is called to read encrypted data. + * + * g_eli_start -> G_ELI_CRYPTO_READ -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver + */ +void +g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker) +{ + struct g_consumer *cp; + struct bio *cbp; + + if (!fromworker) { + /* + * We are not called from the worker thread, so check if + * device is suspended. + */ + mtx_lock(&sc->sc_queue_mtx); + if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { + /* + * If device is suspended, we place the request onto + * the queue, so it can be handled after resume. 
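 + * (g_eli_takefirst() skips G_ELI_NEW_BIO entries while the device
 + * is suspended, so the BIO stays queued until resume.)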
+ */ + G_ELI_DEBUG(0, "device suspended, move onto queue"); + bioq_insert_tail(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + wakeup(sc); + return; + } + atomic_add_int(&sc->sc_inflight, 1); + mtx_unlock(&sc->sc_queue_mtx); + } + bp->bio_pflags = 0; + bp->bio_driver2 = NULL; + cbp = bp->bio_driver1; + cbp->bio_done = g_eli_read_done; + cp = LIST_FIRST(&sc->sc_geom->consumer); + cbp->bio_to = cp->provider; + G_ELI_LOGREQ(2, cbp, "Sending request."); + /* + * Read encrypted data from provider. + */ + g_io_request(cbp, cp); +} + +/* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). + * + * BIO_READ: + * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver + * BIO_WRITE: + * g_eli_start -> G_ELI_CRYPTO_RUN -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp) diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c index 060fad5..542f4df 100644 --- a/sys/i386/xen/xen_machdep.c +++ b/sys/i386/xen/xen_machdep.c @@ -722,7 +722,9 @@ char *bootmem_start, *bootmem_current, *bootmem_end; pteinfo_t *pteinfo_list; void initvalues(start_info_t *startinfo); -struct ringbuf_head *xen_store; /* XXX move me */ +struct xenstore_domain_interface; +extern struct xenstore_domain_interface *xen_store; + char *console_page; void * @@ -1082,7 +1084,7 @@ initvalues(start_info_t *startinfo) HYPERVISOR_shared_info = (shared_info_t *)cur_space; cur_space += PAGE_SIZE; - xen_store = (struct ringbuf_head *)cur_space; + xen_store = (struct xenstore_domain_interface *)cur_space; cur_space += PAGE_SIZE; console_page = (char *)cur_space; diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 3c05530..1e4d690 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -655,16 +655,8 @@ interpret: setsugid(p); #ifdef KTRACE - if (p->p_tracevp != NULL && - priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) { - mtx_lock(&ktrace_mtx); - p->p_traceflag = 0; - tracevp = p->p_tracevp; - p->p_tracevp = NULL; - tracecred = p->p_tracecred; - p->p_tracecred = NULL; - mtx_unlock(&ktrace_mtx); - } + if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) + ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 029f1c3..31389e1 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -121,10 +121,6 @@ exit1(struct thread *td, int rv) struct proc *p, *nq, *q; struct vnode *vtmp; struct vnode *ttyvp = NULL; -#ifdef KTRACE - struct vnode *tracevp; - struct ucred *tracecred; -#endif struct plimit *plim; int locked; @@ -356,33 +352,7 @@ exit1(struct thread *td, int rv) if (ttyvp != NULL) vrele(ttyvp); #ifdef KTRACE - /* - * Disable tracing, then drain any pending records and release - * the trace file. 
- */ - if (p->p_traceflag != 0) { - PROC_LOCK(p); - mtx_lock(&ktrace_mtx); - p->p_traceflag = 0; - mtx_unlock(&ktrace_mtx); - PROC_UNLOCK(p); - ktrprocexit(td); - PROC_LOCK(p); - mtx_lock(&ktrace_mtx); - tracevp = p->p_tracevp; - p->p_tracevp = NULL; - tracecred = p->p_tracecred; - p->p_tracecred = NULL; - mtx_unlock(&ktrace_mtx); - PROC_UNLOCK(p); - if (tracevp != NULL) { - locked = VFS_LOCK_GIANT(tracevp->v_mount); - vrele(tracevp); - VFS_UNLOCK_GIANT(locked); - } - if (tracecred != NULL) - crfree(tracecred); - } + ktrprocexit(td); #endif /* * Release reference to text vnode diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index da2c415..126c668 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -645,21 +645,7 @@ again: callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); #ifdef KTRACE - /* - * Copy traceflag and tracefile if enabled. - */ - mtx_lock(&ktrace_mtx); - KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); - if (p1->p_traceflag & KTRFAC_INHERIT) { - p2->p_traceflag = p1->p_traceflag; - if ((p2->p_tracevp = p1->p_tracevp) != NULL) { - VREF(p2->p_tracevp); - KASSERT(p1->p_tracecred != NULL, - ("ktrace vnode with no cred")); - p2->p_tracecred = crhold(p1->p_tracecred); - } - } - mtx_unlock(&ktrace_mtx); + ktrprocfork(p1, p2); #endif /* diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c index bf530e1..6e2285b 100644 --- a/sys/kern/kern_ktrace.c +++ b/sys/kern/kern_ktrace.c @@ -126,7 +126,7 @@ SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize, 0, "Maximum size of genio event payload"); static int print_message = 1; -struct mtx ktrace_mtx; +static struct mtx ktrace_mtx; static struct sx ktrace_sx; static void ktrace_init(void *dummy); @@ -134,7 +134,10 @@ static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); static u_int ktrace_resize_pool(u_int newsize); static struct ktr_request *ktr_getrequest(int type); static void ktr_submitrequest(struct thread *td, struct ktr_request *req); +static void ktr_freeproc(struct proc *p, struct ucred **uc, + struct vnode **vp); static void ktr_freerequest(struct ktr_request *req); +static void ktr_freerequest_locked(struct ktr_request *req); static void ktr_writerequest(struct thread *td, struct ktr_request *req); static int ktrcanset(struct thread *,struct proc *); static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); @@ -375,11 +378,43 @@ static void ktr_freerequest(struct ktr_request *req) { + mtx_lock(&ktrace_mtx); + ktr_freerequest_locked(req); + mtx_unlock(&ktrace_mtx); +} + +static void +ktr_freerequest_locked(struct ktr_request *req) +{ + + mtx_assert(&ktrace_mtx, MA_OWNED); if (req->ktr_buffer != NULL) free(req->ktr_buffer, M_KTRACE); - mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); - mtx_unlock(&ktrace_mtx); +} + +/* + * Disable tracing for a process and release all associated resources. + * The caller is responsible for releasing a reference on the returned + * vnode and credentials. 
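 + * The caller must hold both the process lock and ktrace_mtx; vp may
 + * be NULL if the caller does not need the trace vnode.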
+ */ +static void +ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp) +{ + struct ktr_request *req; + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_assert(&ktrace_mtx, MA_OWNED); + *uc = p->p_tracecred; + p->p_tracecred = NULL; + if (vp != NULL) + *vp = p->p_tracevp; + p->p_tracevp = NULL; + p->p_traceflag = 0; + while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) { + STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list); + ktr_freerequest_locked(req); + } } void @@ -432,20 +467,79 @@ ktrsysret(code, error, retval) } /* - * When a process exits, drain per-process asynchronous trace records. + * When a setuid process execs, disable tracing. + * + * XXX: We toss any pending asynchronous records. + */ +void +ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp) +{ + + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_lock(&ktrace_mtx); + ktr_freeproc(p, uc, vp); + mtx_unlock(&ktrace_mtx); +} + +/* + * When a process exits, drain per-process asynchronous trace records + * and disable tracing. */ void ktrprocexit(struct thread *td) { + struct proc *p; + struct ucred *cred; + struct vnode *vp; + int vfslocked; + + p = td->td_proc; + if (p->p_traceflag == 0) + return; ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); + PROC_LOCK(p); + mtx_lock(&ktrace_mtx); + ktr_freeproc(p, &cred, &vp); + mtx_unlock(&ktrace_mtx); + PROC_UNLOCK(p); + if (vp != NULL) { + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + } + if (cred != NULL) + crfree(cred); ktrace_exit(td); } /* + * When a process forks, enable tracing in the new process if needed. + */ +void +ktrprocfork(struct proc *p1, struct proc *p2) +{ + + PROC_LOCK_ASSERT(p1, MA_OWNED); + PROC_LOCK_ASSERT(p2, MA_OWNED); + mtx_lock(&ktrace_mtx); + KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); + if (p1->p_traceflag & KTRFAC_INHERIT) { + p2->p_traceflag = p1->p_traceflag; + if ((p2->p_tracevp = p1->p_tracevp) != NULL) { + VREF(p2->p_tracevp); + KASSERT(p1->p_tracecred != NULL, + ("ktrace vnode with no cred")); + p2->p_tracecred = crhold(p1->p_tracecred); + } + } + mtx_unlock(&ktrace_mtx); +} + +/* * When a thread returns, drain any asynchronous records generated by the * system call. */ @@ -694,10 +788,7 @@ ktrace(td, uap) if (p->p_tracevp == vp) { if (ktrcanset(td, p)) { mtx_lock(&ktrace_mtx); - cred = p->p_tracecred; - p->p_tracecred = NULL; - p->p_tracevp = NULL; - p->p_traceflag = 0; + ktr_freeproc(p, &cred, NULL); mtx_unlock(&ktrace_mtx); vrele_count++; crfree(cred); @@ -864,14 +955,9 @@ ktrops(td, p, ops, facs, vp) p->p_traceflag |= KTRFAC_ROOT; } else { /* KTROP_CLEAR */ - if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) /* no more tracing */ - p->p_traceflag = 0; - tracevp = p->p_tracevp; - p->p_tracevp = NULL; - tracecred = p->p_tracecred; - p->p_tracecred = NULL; - } + ktr_freeproc(p, &tracecred, &tracevp); } mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); @@ -1036,10 +1122,7 @@ ktr_writerequest(struct thread *td, struct ktr_request *req) PROC_LOCK(p); if (p->p_tracevp == vp) { mtx_lock(&ktrace_mtx); - p->p_tracevp = NULL; - p->p_traceflag = 0; - cred = p->p_tracecred; - p->p_tracecred = NULL; + ktr_freeproc(p, &cred, NULL); mtx_unlock(&ktrace_mtx); vrele_count++; } @@ -1051,11 +1134,6 @@ ktr_writerequest(struct thread *td, struct ktr_request *req) } sx_sunlock(&allproc_lock); - /* - * We can't clear any pending requests in threads that have cached - * them but not yet committed them, as those are per-thread. 
The - * thread will have to clear it itself on system call return. - */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c index d1a5c0d..03f6088 100644 --- a/sys/kern/kern_syscalls.c +++ b/sys/kern/kern_syscalls.c @@ -181,13 +181,12 @@ syscall_module_handler(struct module *mod, int what, void *arg) error = syscall_deregister(data->offset, &data->old_sysent); return (error); default: - return EOPNOTSUPP; + if (data->chainevh) + return (data->chainevh(mod, what, data->chainarg)); + return (EOPNOTSUPP); } - if (data->chainevh) - return (data->chainevh(mod, what, data->chainarg)); - else - return (0); + /* NOTREACHED */ } int diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 7627027..3a9c721 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/sysproto.h> #include <sys/signalvar.h> -#include <sys/sx.h> #include <sys/ucontext.h> #include <sys/thr.h> #include <sys/rtprio.h> @@ -431,40 +430,40 @@ thr_suspend(struct thread *td, struct thr_suspend_args *uap) int kern_thr_suspend(struct thread *td, struct timespec *tsp) { + struct proc *p = td->td_proc; struct timeval tv; int error = 0; int timo = 0; - if (tsp != NULL) { - if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000) - return (EINVAL); - } - if (td->td_pflags & TDP_WAKEUP) { td->td_pflags &= ~TDP_WAKEUP; return (0); } - PROC_LOCK(td->td_proc); - if ((td->td_flags & TDF_THRWAKEUP) == 0) { + if (tsp != NULL) { + if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000) + return (EINVAL); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) error = EWOULDBLOCK; else { TIMESPEC_TO_TIMEVAL(&tv, tsp); timo = tvtohz(&tv); - error = msleep((void *)td, &td->td_proc->p_mtx, - PCATCH, "lthr", timo); } } + PROC_LOCK(p); + if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0) + error = msleep((void *)td, &p->p_mtx, + PCATCH, "lthr", timo); + if (td->td_flags & TDF_THRWAKEUP) { thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; thread_unlock(td); - PROC_UNLOCK(td->td_proc); + PROC_UNLOCK(p); return (0); } - PROC_UNLOCK(td->td_proc); + PROC_UNLOCK(p); if (error == EWOULDBLOCK) error = ETIMEDOUT; else if (error == ERESTART) { diff --git a/sys/mips/rmi/board.c b/sys/mips/rmi/board.c index 3bee863..4c49dac 100644 --- a/sys/mips/rmi/board.c +++ b/sys/mips/rmi/board.c @@ -283,14 +283,14 @@ xls_board_specific_overrides(struct xlr_board_info* board) break; case RMI_XLR_BOARD_ARIZONA_VIII: - - if (blk1->enabled) { + if (blk1->enabled) { /* There is just one Octal PHY on the board and it is * connected to the MII interface for NA Quad 0. */ - blk1->gmac_port[0].mii_addr = XLR_IO_GMAC_0_OFFSET; - blk1->gmac_port[1].mii_addr = XLR_IO_GMAC_0_OFFSET; - blk1->gmac_port[2].mii_addr = XLR_IO_GMAC_0_OFFSET; - blk1->gmac_port[3].mii_addr = XLR_IO_GMAC_0_OFFSET; + for (i = 0; i < 4; i++) { + blk1->gmac_port[i].mii_addr = + XLR_IO_GMAC_0_OFFSET; + blk1->gmac_port[i].mdint_id = 0; + } } break; diff --git a/sys/mips/rmi/dev/nlge/if_nlge.c b/sys/mips/rmi/dev/nlge/if_nlge.c index 6495e4b..37e1c54 100644 --- a/sys/mips/rmi/dev/nlge/if_nlge.c +++ b/sys/mips/rmi/dev/nlge/if_nlge.c @@ -861,7 +861,7 @@ nlge_mii_read(struct device *dev, int phyaddr, int regidx) int val; sc = device_get_softc(dev); - val = (sc->port_type != XLR_XGMII) ? (0xffff) : + val = (sc->port_type == XLR_XGMII) ? 
(0xffff) : nlge_mii_read_internal(sc->mii_base, phyaddr, regidx); return (val); diff --git a/sys/mips/rmi/xlr_machdep.c b/sys/mips/rmi/xlr_machdep.c index b34955d..8f96633 100644 --- a/sys/mips/rmi/xlr_machdep.c +++ b/sys/mips/rmi/xlr_machdep.c @@ -167,6 +167,14 @@ xlr_parse_mmu_options(void) */ xlr_ncores = 1; cpu_map = xlr_boot1_info.cpu_online_map; + +#ifndef SMP /* Uniprocessor! */ + if (cpu_map != 0x1) { + printf("WARNING: Starting uniprocessor kernel on cpumask [0x%lx]!\n" + "WARNING: Other CPUs will be unused.\n", (u_long)cpu_map); + cpu_map = 0x1; + } +#endif core0_thr_mask = cpu_map & 0xf; switch (core0_thr_mask) { case 1: @@ -188,9 +196,9 @@ xlr_parse_mmu_options(void) xlr_ncores++; } } + xlr_hw_thread_mask = cpu_map; /* setup hardware processor id to cpu id mapping */ - xlr_hw_thread_mask = xlr_boot1_info.cpu_online_map; for (i = 0; i< MAXCPU; i++) xlr_cpuid_to_hwtid[i] = xlr_hwtid_to_cpuid [i] = -1; diff --git a/sys/net/if.c b/sys/net/if.c index bd54acf..3c8486a 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -92,6 +92,11 @@ #include <security/mac/mac_framework.h> +#ifdef COMPAT_FREEBSD32 +#include <sys/mount.h> +#include <compat/freebsd32/freebsd32.h> +#endif + struct ifindex_entry { struct ifnet *ife_ifnet; }; @@ -2402,6 +2407,17 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (error); } +#ifdef COMPAT_FREEBSD32 +struct ifconf32 { + int32_t ifc_len; + union { + uint32_t ifcu_buf; + uint32_t ifcu_req; + } ifc_ifcu; +}; +#define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) +#endif + /* * Interface ioctls. */ @@ -2416,10 +2432,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) switch (cmd) { case SIOCGIFCONF: case OSIOCGIFCONF: -#ifdef __amd64__ + return (ifconf(cmd, data)); + +#ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: + { + struct ifconf32 *ifc32; + struct ifconf ifc; + + ifc32 = (struct ifconf32 *)data; + ifc.ifc_len = ifc32->ifc_len; + ifc.ifc_buf = PTRIN(ifc32->ifc_buf); + + return (ifconf(SIOCGIFCONF, (void *)&ifc)); + } #endif - return (ifconf(cmd, data)); } ifr = (struct ifreq *)data; @@ -2646,23 +2673,12 @@ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; -#ifdef __amd64__ - struct ifconf32 *ifc32 = (struct ifconf32 *)data; - struct ifconf ifc_swab; -#endif struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; -#ifdef __amd64__ - if (cmd == SIOCGIFCONF32) { - ifc_swab.ifc_len = ifc32->ifc_len; - ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf; - ifc = &ifc_swab; - } -#endif /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. 
*/ max_len = MAXPHYS - 1; @@ -2752,10 +2768,6 @@ again: } ifc->ifc_len = valid_len; -#ifdef __amd64__ - if (cmd == SIOCGIFCONF32) - ifc32->ifc_len = valid_len; -#endif sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); diff --git a/sys/net/if.h b/sys/net/if.h index ae0daf5..a99b4a7 100644 --- a/sys/net/if.h +++ b/sys/net/if.h @@ -391,16 +391,6 @@ struct ifconf { #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; -#if defined (__amd64__) -struct ifconf32 { - int ifc_len; /* size of associated buffer */ - union { - u_int ifcu_buf; - u_int ifcu_req; - } ifc_ifcu; -}; -#endif - /* * interface groups */ diff --git a/sys/sys/ktrace.h b/sys/sys/ktrace.h index a3e5469..9f8810c 100644 --- a/sys/sys/ktrace.h +++ b/sys/sys/ktrace.h @@ -191,8 +191,6 @@ struct stat; #define KTRFAC_DROP 0x20000000 /* last event was dropped */ #ifdef _KERNEL -extern struct mtx ktrace_mtx; - void ktrnamei(char *); void ktrcsw(int, int); void ktrpsig(int, sig_t, sigset_t *, int); @@ -200,7 +198,9 @@ void ktrgenio(int, enum uio_rw, struct uio *, int); void ktrsyscall(int, int narg, register_t args[]); void ktrsysctl(int *name, u_int namelen); void ktrsysret(int, int, register_t); +void ktrprocexec(struct proc *, struct ucred **, struct vnode **); void ktrprocexit(struct thread *); +void ktrprocfork(struct proc *, struct proc *); void ktruserret(struct thread *); void ktrstruct(const char *, void *, size_t); #define ktrsockaddr(s) \ diff --git a/sys/sys/sockio.h b/sys/sys/sockio.h index 2af2467..4c1c483 100644 --- a/sys/sys/sockio.h +++ b/sys/sys/sockio.h @@ -62,9 +62,6 @@ #define SIOCSIFBRDADDR _IOW('i', 19, struct ifreq) /* set broadcast addr */ #define OSIOCGIFCONF _IOWR('i', 20, struct ifconf) /* get ifnet list */ #define SIOCGIFCONF _IOWR('i', 36, struct ifconf) /* get ifnet list */ -#if defined (__amd64__) -#define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) /* get ifnet list */ -#endif #define OSIOCGIFNETMASK _IOWR('i', 21, struct ifreq) /* get net addr mask */ #define SIOCGIFNETMASK _IOWR('i', 37, struct ifreq) /* get net addr mask */ #define SIOCSIFNETMASK _IOW('i', 22, struct ifreq) /* set net addr mask */ diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index b359bd4..bea235a 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1460,8 +1460,8 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * - * For READ operations, the pages are PG_BUSY'd. For WRITE operations, - * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY + * For READ operations, the pages are VPO_BUSY'd. For WRITE operations, + * the pages are vm_page_t->busy'd. For READ operations, we VPO_BUSY * unbusy all pages except the 'main' request page. For WRITE * operations, we vm_page_t->busy'd unbusy all pages ( we can do this * because we marked them all VM_PAGER_PEND on return from putpages ). 
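The sys/vm/vm_map.c hunk below replaces an open-coded atomic_cmpset_int() retry loop with a single atomic_fetchadd_int() call when dropping a vmspace reference; atomic_fetchadd_int() returns the counter's value from before the addition, so the final-reference (1 -> 0) transition can be detected without looping. The following minimal sketch contrasts the two idioms; it is not part of the commit, and struct obj, obj_destroy(), and the obj_release_*() helpers are hypothetical stand-ins for struct vmspace and vmspace_dofree():

#include <machine/atomic.h>

struct obj {
	volatile int	refcnt;		/* volatile, as vm_refcnt becomes below */
};

static void	obj_destroy(struct obj *);	/* stand-in for vmspace_dofree() */

/* Old idiom: retry a compare-and-swap until the decrement takes effect. */
static void
obj_release_cmpset(struct obj *o)
{
	int refcnt;

	do
		refcnt = o->refcnt;
	while (!atomic_cmpset_int(&o->refcnt, refcnt, refcnt - 1));
	if (refcnt == 1)		/* this call dropped the last reference */
		obj_destroy(o);
}

/* New idiom: a single atomic add of -1 returns the pre-decrement value. */
static void
obj_release_fetchadd(struct obj *o)
{

	if (atomic_fetchadd_int(&o->refcnt, -1) == 1)
		obj_destroy(o);
}

Both forms release exactly one reference and destroy the object only on the last release; the fetchadd form merely avoids the retry loop, which is why the companion sys/vm/vm_map.h hunk only needs to mark vm_refcnt volatile.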
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 2e0f001..40c317d 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -339,15 +339,11 @@ vmspace_dofree(struct vmspace *vm) void vmspace_free(struct vmspace *vm) { - int refcnt; if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); - do - refcnt = vm->vm_refcnt; - while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); - if (refcnt == 1) + if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1) vmspace_dofree(vm); } diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 18a1edf..8715b41 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -236,7 +236,7 @@ struct vmspace { caddr_t vm_taddr; /* (c) user virtual address of text */ caddr_t vm_daddr; /* (c) user virtual address of data */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ - int vm_refcnt; /* number of references */ + volatile int vm_refcnt; /* number of references */ /* * Keep the PMAP last, so that CPU-specific variations of that * structure on a single architecture don't result in offset diff --git a/sys/xen/blkif.h b/sys/xen/blkif.h new file mode 100644 index 0000000..48b71ea --- /dev/null +++ b/sys/xen/blkif.h @@ -0,0 +1,145 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * $FreeBSD$ + */ + +#ifndef __XEN_BLKIF_H__ +#define __XEN_BLKIF_H__ + +#include <xen/interface/io/ring.h> +#include <xen/interface/io/blkif.h> +#include <xen/interface/io/protocols.h> + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK]; +}; +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? 
*/ +}; +typedef struct blkif_x86_32_request blkif_x86_32_request_t; +typedef struct blkif_x86_32_response blkif_x86_32_response_t; +#pragma pack(pop) + +/* x86_64 protocol version */ +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t __attribute__((__aligned__(8))) id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK]; +}; +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_64_request blkif_x86_64_request_t; +typedef struct blkif_x86_64_response blkif_x86_64_response_t; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); + +/* + * Maximum number of requests that can be active for a given instance + * regardless of the protocol in use, based on the ring size. This constant + * facilitates resource pre-allocation in backend drivers since the size is + * known well in advance of attaching to a front end. + */ +#define BLKIF_MAX_RING_REQUESTS(_sz) \ + MAX(__RING_SIZE((blkif_x86_64_sring_t *)NULL, _sz), \ + MAX(__RING_SIZE((blkif_x86_32_sring_t *)NULL, _sz), \ + __RING_SIZE((blkif_sring_t *)NULL, _sz))) + +/* + * The number of ring pages required to support a given number of requests + * for a given instance regardless of the protocol in use. + */ +#define BLKIF_RING_PAGES(_entries) \ + MAX(__RING_PAGES((blkif_x86_64_sring_t *)NULL, _entries), \ + MAX(__RING_PAGES((blkif_x86_32_sring_t *)NULL, _entries), \ + __RING_PAGES((blkif_sring_t *)NULL, _entries))) + +union blkif_back_rings { + blkif_back_ring_t native; + blkif_common_back_ring_t common; + blkif_x86_32_back_ring_t x86_32; + blkif_x86_64_back_ring_t x86_64; +}; +typedef union blkif_back_rings blkif_back_rings_t; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + +static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +#endif /* __XEN_BLKIF_H__ */ diff --git a/sys/xen/evtchn/evtchn.c b/sys/xen/evtchn/evtchn.c index f280d12..3832277 100644 --- a/sys/xen/evtchn/evtchn.c +++ b/sys/xen/evtchn/evtchn.c @@ -492,15 +492,15 @@ bind_listening_port_to_irqhandler(unsigned int remote_domain, int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, unsigned int remote_port, const 
char *devname, - driver_filter_t filter, driver_intr_t handler, - unsigned long irqflags, unsigned int *irqp) + driver_intr_t handler, void *arg, unsigned long irqflags, + unsigned int *irqp) { unsigned int irq; int error; irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); intr_register_source(&xp->xp_pins[irq].xp_intsrc); - error = intr_add_handler(devname, irq, filter, handler, NULL, + error = intr_add_handler(devname, irq, NULL, handler, arg, irqflags, &xp->xp_pins[irq].xp_cookie); if (error) { unbind_from_irq(irq); diff --git a/sys/xen/gnttab.c b/sys/xen/gnttab.c index ae44e8f..4ece182 100644 --- a/sys/xen/gnttab.c +++ b/sys/xen/gnttab.c @@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$"); /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 -#define GNTTAB_LIST_END 0xffffffff #define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t)) static grant_ref_t **gnttab_list; @@ -66,7 +65,7 @@ get_free_entries(int count, int *entries) { int ref, error; grant_ref_t head; - + mtx_lock(&gnttab_list_lock); if ((gnttab_free_count < count) && ((error = gnttab_expand(count - gnttab_free_count)) != 0)) { @@ -79,7 +78,7 @@ get_free_entries(int count, int *entries) head = gnttab_entry(head); gnttab_free_head = gnttab_entry(head); gnttab_entry(head) = GNTTAB_LIST_END; - mtx_unlock(&gnttab_list_lock); + mtx_unlock(&gnttab_list_lock); *entries = ref; return (0); @@ -122,7 +121,7 @@ put_free_entry(grant_ref_t ref) gnttab_free_head = ref; gnttab_free_count++; check_free_callbacks(); - mtx_unlock(&gnttab_list_lock); + mtx_unlock(&gnttab_list_lock); } /* @@ -136,7 +135,7 @@ gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly, int error, ref; error = get_free_entries(1, &ref); - + if (unlikely(error)) return (error); @@ -166,9 +165,9 @@ int gnttab_query_foreign_access(grant_ref_t ref) { uint16_t nflags; - + nflags = shared[ref].flags; - + return (nflags & (GTF_reading|GTF_writing)); } @@ -180,7 +179,7 @@ gnttab_end_foreign_access_ref(grant_ref_t ref) nflags = shared[ref].flags; do { if ( (flags = nflags) & (GTF_reading|GTF_writing) ) { - printf("WARNING: g.e. still in use!\n"); + printf("%s: WARNING: g.e. still in use!\n", __func__); return (0); } } while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) != @@ -201,7 +200,44 @@ gnttab_end_foreign_access(grant_ref_t ref, void *page) else { /* XXX This needs to be fixed so that the ref and page are placed on a list to be freed up later. */ - printf("WARNING: leaking g.e. and page still in use!\n"); + printf("%s: WARNING: leaking g.e. and page still in use!\n", + __func__); + } +} + +void +gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs) +{ + grant_ref_t *last_ref; + grant_ref_t head; + grant_ref_t tail; + + head = GNTTAB_LIST_END; + tail = *refs; + last_ref = refs + count; + while (refs != last_ref) { + + if (gnttab_end_foreign_access_ref(*refs)) { + gnttab_entry(*refs) = head; + head = *refs; + } else { + /* + * XXX This needs to be fixed so that the ref + * is placed on a list to be freed up later. + */ + printf("%s: WARNING: leaking g.e. 
still in use!\n", + __func__); + count--; + } + refs++; + } + + if (count != 0) { + mtx_lock(&gnttab_list_lock); + gnttab_free_count += count; + gnttab_entry(tail) = gnttab_free_head; + gnttab_free_head = head; + mtx_unlock(&gnttab_list_lock); } } @@ -216,7 +252,7 @@ gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, return (error); gnttab_grant_foreign_transfer_ref(ref, domid, pfn); - + *result = ref; return (0); } @@ -282,16 +318,16 @@ gnttab_free_grant_references(grant_ref_t head) { grant_ref_t ref; int count = 1; - + if (head == GNTTAB_LIST_END) return; - - mtx_lock(&gnttab_list_lock); + ref = head; while (gnttab_entry(ref) != GNTTAB_LIST_END) { ref = gnttab_entry(ref); count++; } + mtx_lock(&gnttab_list_lock); gnttab_entry(ref) = gnttab_free_head; gnttab_free_head = head; gnttab_free_count += count; @@ -403,7 +439,7 @@ grow_gnttab_list(unsigned int more_frames) check_free_callbacks(); return (0); - + grow_nomem: for ( ; i >= nr_grant_frames; i--) free(gnttab_list[i], M_DEVBUF); @@ -490,7 +526,7 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx) if (shared == NULL) { vm_offset_t area; - + area = kmem_alloc_nofault(kernel_map, PAGE_SIZE * max_nr_grant_frames()); KASSERT(area, ("can't allocate VM space for grant table")); @@ -502,7 +538,7 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx) ((vm_paddr_t)frames[i]) << PAGE_SHIFT | PG_RW | PG_V); free(frames, M_DEVBUF); - + return (0); } @@ -517,7 +553,7 @@ gnttab_resume(void) int gnttab_suspend(void) -{ +{ int i; for (i = 0; i < nr_grant_frames; i++) @@ -532,7 +568,8 @@ gnttab_suspend(void) static vm_paddr_t resume_frames; -static int gnttab_map(unsigned int start_idx, unsigned int end_idx) +static int +gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; @@ -552,7 +589,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) if (shared == NULL) { vm_offset_t area; - + area = kmem_alloc_nofault(kernel_map, PAGE_SIZE * max_nr_grant_frames()); KASSERT(area, ("can't allocate VM space for grant table")); @@ -643,10 +680,10 @@ gnttab_init() if (gnttab_list[i] == NULL) goto ini_nomem; } - + if (gnttab_resume()) return (ENODEV); - + nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) @@ -670,4 +707,3 @@ ini_nomem: } MTX_SYSINIT(gnttab, &gnttab_list_lock, "GNTTAB LOCK", MTX_DEF); -//SYSINIT(gnttab, SI_SUB_PSEUDO, SI_ORDER_FIRST, gnttab_init, NULL); diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h index 8348af5..1741ec3 100644 --- a/sys/xen/gnttab.h +++ b/sys/xen/gnttab.h @@ -43,6 +43,8 @@ #include <machine/xen/xen-os.h> #include <xen/features.h> +#define GNTTAB_LIST_END GRANT_REF_INVALID + struct gnttab_free_callback { struct gnttab_free_callback *next; void (*fn)(void *); @@ -74,6 +76,13 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref); */ void gnttab_end_foreign_access(grant_ref_t ref, void *page); +/* + * Eventually end access through the given array of grant references. 
+ * Access will be ended immediately iff the grant entry is not in use, + * otherwise it will happen some time later. + */ +void gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, grant_ref_t *result); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); diff --git a/sys/xen/interface/grant_table.h b/sys/xen/interface/grant_table.h index 26f2c35..e76ca67 100644 --- a/sys/xen/interface/grant_table.h +++ b/sys/xen/interface/grant_table.h @@ -159,6 +159,8 @@ typedef struct grant_entry grant_entry_t; */ typedef uint32_t grant_ref_t; +#define GRANT_REF_INVALID 0xffffffff + /* * Handle to track a mapping created via a grant reference. */ diff --git a/sys/xen/interface/hvm/params.h b/sys/xen/interface/hvm/params.h index 6befa78..d846731 100644 --- a/sys/xen/interface/hvm/params.h +++ b/sys/xen/interface/hvm/params.h @@ -95,4 +95,30 @@ #define HVM_NR_PARAMS 15 +#ifdef XENHVM +/** + * Retrieve an HVM setting from the hypervisor. + * + * \param index The index of the HVM parameter to retrieve. + * + * \return On error, 0. Otherwise the value of the requested parameter. + */ +static inline unsigned long +hvm_get_parameter(int index) +{ + struct xen_hvm_param xhv; + int error; + + xhv.domid = DOMID_SELF; + xhv.index = index; + error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); + if (error) { + printf("hvm_get_parameter: failed to get %d, error %d\n", + index, error); + return (0); + } + return (xhv.value); +} +#endif + #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */ diff --git a/sys/xen/interface/io/blkif.h b/sys/xen/interface/io/blkif.h index 9e2d3d0..020936b 100644 --- a/sys/xen/interface/io/blkif.h +++ b/sys/xen/interface/io/blkif.h @@ -78,11 +78,19 @@ #define BLKIF_OP_FLUSH_DISKCACHE 3 /* - * Maximum scatter/gather segments per request. - * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. - * NB. This could be 12 if the ring indexes weren't stored in the same page. + * Maximum scatter/gather segments associated with a request header block. */ -#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 +#define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK 11 + +/* + * Maximum scatter/gather segments associated with a segment block. + */ +#define BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK 14 + +/* + * Maximum scatter/gather segments per request (header + segment blocks). + */ +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 255 struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ @@ -90,6 +98,7 @@ struct blkif_request_segment { /* @last_sect: last sector in frame to transfer (inclusive). */ uint8_t first_sect, last_sect; }; +typedef struct blkif_request_segment blkif_request_segment_t; struct blkif_request { uint8_t operation; /* BLKIF_OP_???
*/ @@ -97,7 +106,7 @@ struct blkif_request { blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK]; }; typedef struct blkif_request blkif_request_t; @@ -124,10 +133,22 @@ typedef struct blkif_response blkif_response_t; DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); +#define BLKRING_GET_SG_REQUEST(_r, _idx) \ + ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx)) + #define VDISK_CDROM 0x1 #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 +/* + * The number of ring request blocks required to handle an I/O + * request containing _segs segments. + */ +#define BLKIF_SEGS_TO_BLOCKS(_segs) \ + ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \ + + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \ + / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1) + #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ /* diff --git a/sys/xen/interface/io/protocols.h b/sys/xen/interface/io/protocols.h index 77bd1bd..fd52934 100644 --- a/sys/xen/interface/io/protocols.h +++ b/sys/xen/interface/io/protocols.h @@ -26,6 +26,7 @@ #define XEN_IO_PROTO_ABI_X86_32 "x86_32-abi" #define XEN_IO_PROTO_ABI_X86_64 "x86_64-abi" #define XEN_IO_PROTO_ABI_IA64 "ia64-abi" +#define XEN_IO_PROTO_ABI_POWERPC64 "powerpc64-abi" #if defined(__i386__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32 @@ -33,6 +34,8 @@ # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64 #elif defined(__ia64__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64 +#elif defined(__powerpc64__) +# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_POWERPC64 #else # error arch fixup needed here #endif diff --git a/sys/xen/interface/io/ring.h b/sys/xen/interface/io/ring.h index 6ce1d0d..6b7fd74 100644 --- a/sys/xen/interface/io/ring.h +++ b/sys/xen/interface/io/ring.h @@ -45,13 +45,29 @@ typedef unsigned int RING_IDX; #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x)) /* + * The amount of space reserved in the shared ring for accounting information. + */ +#define __RING_HEADER_SIZE(_s) \ + ((intptr_t)(_s)->ring - (intptr_t)(_s)) + +/* * Calculate size of a shared ring, given the total available space for the * ring and indexes (_sz), and the name tag of the request/response structure. * A ring contains as many entries as will fit, rounded down to the nearest * power of two (so we can mask with (size-1) to loop around). */ #define __RING_SIZE(_s, _sz) \ - (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) + (__RD32(((_sz) - __RING_HEADER_SIZE(_s)) / sizeof((_s)->ring[0]))) + +/* + * The number of pages needed to support a given number of request/response + * entries. The entry count is rounded down to the nearest power of two + * as required by the ring macros. + */ +#define __RING_PAGES(_s, _entries) \ + ((__RING_HEADER_SIZE(_s) \ + + (__RD32(_entries) * sizeof((_s)->ring[0])) \ + + PAGE_SIZE - 1) / PAGE_SIZE) /* * Macros to make the correct C datatypes for a new kind of ring. diff --git a/sys/xen/interface/io/xenbus.h b/sys/xen/interface/io/xenbus.h index 4a053df..5e24f31 100644 --- a/sys/xen/interface/io/xenbus.h +++ b/sys/xen/interface/io/xenbus.h @@ -36,6 +36,9 @@ enum xenbus_state { XenbusStateUnknown = 0, + /* + * Initializing: Back-end is initializing.
+ */ XenbusStateInitialising = 1, /* @@ -49,6 +52,9 @@ enum xenbus_state { */ XenbusStateInitialised = 3, + /* + * Connected: The normal state for a front to backend connection. + */ XenbusStateConnected = 4, /* @@ -56,6 +62,9 @@ enum xenbus_state { */ XenbusStateClosing = 5, + /* + * Closed: No connection exists between front and back end. + */ XenbusStateClosed = 6, /* diff --git a/sys/xen/reboot.c b/sys/xen/reboot.c deleted file mode 100644 index 04ba132..0000000 --- a/sys/xen/reboot.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * - * Copyright (c) 2004 Christian Limpach. - * Copyright (c) 2004-2006,2008 Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christian Limpach. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/bus.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/proc.h> -#include <sys/reboot.h> -#include <sys/sched.h> -#include <sys/smp.h> -#include <sys/systm.h> - -#include <machine/xen/xen-os.h> -#include <xen/hypervisor.h> -#include <xen/gnttab.h> -#include <xen/xen_intr.h> -#include <xen/xenbus/xenbusvar.h> - -#include <vm/vm.h> -#include <vm/pmap.h> - -#ifdef XENHVM - -#include <dev/xen/xenpci/xenpcivar.h> - -#else - -static void xen_suspend(void); - -#endif - -static void -shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - char *str; - struct xenbus_transaction xbt; - int error, howto; - - howto = 0; - - again: - error = xenbus_transaction_start(&xbt); - if (error) - return; - - error = xenbus_read(xbt, "control", "shutdown", NULL, (void **) &str); - - /* Ignore read errors and empty reads. 
*/ - if (error || strlen(str) == 0) { - xenbus_transaction_end(xbt, 1); - return; - } - - xenbus_write(xbt, "control", "shutdown", ""); - - error = xenbus_transaction_end(xbt, 0); - if (error == EAGAIN) { - free(str, M_DEVBUF); - goto again; - } - - if (strcmp(str, "reboot") == 0) - howto = 0; - else if (strcmp(str, "poweroff") == 0) - howto |= (RB_POWEROFF | RB_HALT); - else if (strcmp(str, "halt") == 0) -#ifdef XENHVM - /* - * We rely on acpi powerdown to halt the VM. - */ - howto |= (RB_POWEROFF | RB_HALT); -#else - howto |= RB_HALT; -#endif - else if (strcmp(str, "suspend") == 0) - howto = -1; - else { - printf("Ignoring shutdown request: %s\n", str); - goto done; - } - - if (howto == -1) { - xen_suspend(); - goto done; - } - - shutdown_nice(howto); - done: - free(str, M_DEVBUF); -} - -#ifndef XENHVM - -/* - * In HV mode, we let acpi take care of halts and reboots. - */ - -static void -xen_shutdown_final(void *arg, int howto) -{ - - if (howto & (RB_HALT | RB_POWEROFF)) - HYPERVISOR_shutdown(SHUTDOWN_poweroff); - else - HYPERVISOR_shutdown(SHUTDOWN_reboot); -} - -#endif - -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - .callback = shutdown_handler -}; - -static void -setup_shutdown_watcher(void *unused) -{ - - if (register_xenbus_watch(&shutdown_watch)) - printf("Failed to set shutdown watcher\n"); -#ifndef XENHVM - EVENTHANDLER_REGISTER(shutdown_final, xen_shutdown_final, NULL, - SHUTDOWN_PRI_LAST); -#endif -} - -SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL); - -#ifndef XENHVM - -extern void xencons_suspend(void); -extern void xencons_resume(void); - -static void -xen_suspend() -{ - int i, j, k, fpp; - unsigned long max_pfn, start_info_mfn; - -#ifdef SMP - cpumask_t map; - /* - * Bind us to CPU 0 and stop any other VCPUs. - */ - thread_lock(curthread); - sched_bind(curthread, 0); - thread_unlock(curthread); - KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0")); - - map = PCPU_GET(other_cpus) & ~stopped_cpus; - if (map) - stop_cpus(map); -#endif - - if (DEVICE_SUSPEND(root_bus) != 0) { - printf("xen_suspend: device_suspend failed\n"); -#ifdef SMP - if (map) - restart_cpus(map); -#endif - return; - } - - local_irq_disable(); - - xencons_suspend(); - gnttab_suspend(); - - max_pfn = HYPERVISOR_shared_info->arch.max_pfn; - - void *shared_info = HYPERVISOR_shared_info; - HYPERVISOR_shared_info = NULL; - pmap_kremove((vm_offset_t) shared_info); - PT_UPDATES_FLUSH(); - - xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn); - - /* - * We'll stop somewhere inside this hypercall. When it returns, - * we'll start resuming after the restore. 
- */ - start_info_mfn = VTOMFN(xen_start_info); - pmap_suspend(); - HYPERVISOR_suspend(start_info_mfn); - pmap_resume(); - - pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info); - HYPERVISOR_shared_info = shared_info; - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - VTOMFN(xen_pfn_to_mfn_frame_list_list); - - fpp = PAGE_SIZE/sizeof(unsigned long); - for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { - if ((j % fpp) == 0) { - k++; - xen_pfn_to_mfn_frame_list_list[k] = - VTOMFN(xen_pfn_to_mfn_frame_list[k]); - j = 0; - } - xen_pfn_to_mfn_frame_list[k][j] = - VTOMFN(&xen_phys_machine[i]); - } - HYPERVISOR_shared_info->arch.max_pfn = max_pfn; - - gnttab_resume(); - irq_resume(); - local_irq_enable(); - xencons_resume(); - -#ifdef CONFIG_SMP - for_each_cpu(i) - vcpu_prepare(i); - -#endif - /* - * Only resume xenbus /after/ we've prepared our VCPUs; otherwise - * the VCPU hotplug callback can race with our vcpu_prepare - */ - DEVICE_RESUME(root_bus); - -#ifdef SMP - thread_lock(curthread); - sched_unbind(curthread); - thread_unlock(curthread); - if (map) - restart_cpus(map); -#endif -} - -#endif diff --git a/sys/xen/xen_intr.h b/sys/xen/xen_intr.h index 68f5943..2e753e6 100644 --- a/sys/xen/xen_intr.h +++ b/sys/xen/xen_intr.h @@ -76,7 +76,7 @@ extern int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu, */ extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, unsigned int remote_port, const char *devname, - driver_filter_t filter, driver_intr_t handler, + driver_intr_t handler, void *arg, unsigned long irqflags, unsigned int *irqp); /* diff --git a/sys/xen/xenbus/init.txt b/sys/xen/xenbus/init.txt deleted file mode 100644 index 4249549..0000000 --- a/sys/xen/xenbus/init.txt +++ /dev/null @@ -1,14 +0,0 @@ - - -- frontend driver initializes static xenbus_driver with _ids, _probe, _remove, -_resume, _otherend_changed - - - initialization calls xenbus_register_frontend(xenbus_driver) - - - xenbus_register_frontend sets read_otherend details to read_backend_details - then calls xenbus_register_driver_common(xenbus_driver, xenbus_frontend) - - - xenbus_register_driver_common sets underlying driver name to xenbus_driver name - underlying driver bus to xenbus_frontend's bus, driver's probe to xenbus_dev_probe - driver's remove to xenbus_dev_remove then calls driver_register - diff --git a/sys/xen/xenbus/xenbus_client.c b/sys/xen/xenbus/xenbus.c index 740d664..c3e5fee 100644 --- a/sys/xen/xenbus/xenbus_client.c +++ b/sys/xen/xenbus/xenbus.c @@ -1,8 +1,4 @@ /****************************************************************************** - * Client-facing interface for the Xenbus driver. In other words, the - * interface between the Xenbus and the device-specific code, be it the - * frontend or the backend of that driver. - * * Copyright (C) 2005 XenSource Ltd * * This file may be distributed separately from the Linux kernel, or @@ -27,6 +23,14 @@ * IN THE SOFTWARE. */ +/** + * \file xenbus.c + * + * \brief Client-facing interface for the Xenbus driver. + * + * In other words, the interface between the Xenbus and the device-specific + * code, be it the frontend or the backend of that driver. + */ #if 0 #define DPRINTK(fmt, args...) 
\ @@ -39,9 +43,12 @@ __FBSDID("$FreeBSD$"); #include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/kernel.h> #include <sys/types.h> #include <sys/malloc.h> #include <sys/libkern.h> +#include <sys/sbuf.h> #include <machine/xen/xen-os.h> #include <xen/hypervisor.h> @@ -50,6 +57,34 @@ __FBSDID("$FreeBSD$"); #include <xen/xenbus/xenbusvar.h> #include <machine/stdarg.h> +MALLOC_DEFINE(M_XENBUS, "xenbus", "XenBus Support"); + +/*------------------------- Private Functions --------------------------------*/ +/** + * \brief Construct the error path corresponding to the given XenBus + * device. + * + * \param dev The XenBus device for which we are constructing an error path. + * + * \return On success, the constructed error path. Otherwise NULL. + * + * It is the caller's responsibility to free any returned error path + * node using the M_XENBUS malloc type. + */ +static char * +error_path(device_t dev) +{ + char *path_buffer = malloc(strlen("error/") + + strlen(xenbus_get_node(dev)) + 1,M_XENBUS, M_WAITOK); + + strcpy(path_buffer, "error/"); + strcpy(path_buffer + strlen("error/"), xenbus_get_node(dev)); + + return (path_buffer); +} + +/*--------------------------- Public Functions -------------------------------*/ +/*-------- API comments for these methods can be found in xenbusvar.h --------*/ const char * xenbus_strstate(XenbusState state) { @@ -67,15 +102,15 @@ xenbus_strstate(XenbusState state) } int -xenbus_watch_path(device_t dev, char *path, struct xenbus_watch *watch, - void (*callback)(struct xenbus_watch *, const char **, unsigned int)) +xenbus_watch_path(device_t dev, char *path, struct xs_watch *watch, + xs_watch_cb_t *callback) { int error; watch->node = path; watch->callback = callback; - error = register_xenbus_watch(watch); + error = xs_register_watch(watch); if (error) { watch->node = NULL; @@ -88,12 +123,12 @@ xenbus_watch_path(device_t dev, char *path, struct xenbus_watch *watch, int xenbus_watch_path2(device_t dev, const char *path, - const char *path2, struct xenbus_watch *watch, - void (*callback)(struct xenbus_watch *, const char **, unsigned int)) + const char *path2, struct xs_watch *watch, + xs_watch_cb_t *callback) { int error; char *state = malloc(strlen(path) + 1 + strlen(path2) + 1, - M_DEVBUF, M_WAITOK); + M_XENBUS, M_WAITOK); strcpy(state, path); strcat(state, "/"); @@ -101,46 +136,27 @@ xenbus_watch_path2(device_t dev, const char *path, error = xenbus_watch_path(dev, state, watch, callback); if (error) { - free(state, M_DEVBUF); + free(state,M_XENBUS); } return (error); } -/** - * Return the path to the error node for the given device, or NULL on failure. - * If the value returned is non-NULL, then it is the caller's to kfree.
- */ -static char * -error_path(device_t dev) -{ - char *path_buffer = malloc(strlen("error/") - + strlen(xenbus_get_node(dev)) + 1, M_DEVBUF, M_WAITOK); - - strcpy(path_buffer, "error/"); - strcpy(path_buffer + strlen("error/"), xenbus_get_node(dev)); - - return (path_buffer); -} - - -static void -_dev_error(device_t dev, int err, const char *fmt, va_list ap) +void +xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap) { int ret; unsigned int len; char *printf_buffer = NULL, *path_buffer = NULL; #define PRINTF_BUFFER_SIZE 4096 - printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_DEVBUF, M_WAITOK); + printf_buffer = malloc(PRINTF_BUFFER_SIZE,M_XENBUS, M_WAITOK); len = sprintf(printf_buffer, "%i ", err); ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); KASSERT(len + ret <= PRINTF_BUFFER_SIZE-1, ("xenbus error message too big")); -#if 0 - dev_err(&dev->dev, "%s\n", printf_buffer); -#endif + device_printf(dev, "Error %s\n", printf_buffer); path_buffer = error_path(dev); if (path_buffer == NULL) { @@ -149,7 +165,7 @@ _dev_error(device_t dev, int err, const char *fmt, va_list ap) goto fail; } - if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) { + if (xs_write(XST_NIL, path_buffer, "error", printf_buffer) != 0) { printf("xenbus: failed to write error node for %s (%s)\n", xenbus_get_node(dev), printf_buffer); goto fail; @@ -157,9 +173,9 @@ _dev_error(device_t dev, int err, const char *fmt, va_list ap) fail: if (printf_buffer) - free(printf_buffer, M_DEVBUF); + free(printf_buffer,M_XENBUS); if (path_buffer) - free(path_buffer, M_DEVBUF); + free(path_buffer,M_XENBUS); } void @@ -168,41 +184,45 @@ xenbus_dev_error(device_t dev, int err, const char *fmt, ...) va_list ap; va_start(ap, fmt); - _dev_error(dev, err, fmt, ap); + xenbus_dev_verror(dev, err, fmt, ap); va_end(ap); } void +xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list ap) +{ + xenbus_dev_verror(dev, err, fmt, ap); + device_printf(dev, "Fatal error. Transitioning to Closing State\n"); + xenbus_set_state(dev, XenbusStateClosing); +} + +void xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); - _dev_error(dev, err, fmt, ap); + xenbus_dev_vfatal(dev, err, fmt, ap); va_end(ap); - - xenbus_set_state(dev, XenbusStateClosing); } int -xenbus_grant_ring(device_t dev, unsigned long ring_mfn, int *refp) +xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp) { int error; - grant_ref_t ref; error = gnttab_grant_foreign_access( - xenbus_get_otherend_id(dev), ring_mfn, 0, &ref); + xenbus_get_otherend_id(dev), ring_mfn, 0, refp); if (error) { xenbus_dev_fatal(dev, error, "granting access to ring page"); return (error); } - *refp = ref; return (0); } int -xenbus_alloc_evtchn(device_t dev, int *port) +xenbus_alloc_evtchn(device_t dev, evtchn_port_t *port) { struct evtchn_alloc_unbound alloc_unbound; int err; @@ -222,7 +242,7 @@ xenbus_alloc_evtchn(device_t dev, int *port) } int -xenbus_free_evtchn(device_t dev, int port) +xenbus_free_evtchn(device_t dev, evtchn_port_t port) { struct evtchn_close close; int err; @@ -240,12 +260,29 @@ xenbus_free_evtchn(device_t dev, int port) XenbusState xenbus_read_driver_state(const char *path) { - XenbusState result; + XenbusState result; + int error; + + error = xs_gather(XST_NIL, path, "state", "%d", &result, NULL); + if (error) + result = XenbusStateClosed; + + return (result); +} + +int +xenbus_dev_is_online(device_t dev) +{ + const char *path; int error; + int value; - error = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL); - if (error) - result = XenbusStateClosed; + path = xenbus_get_node(dev); + error = xs_gather(XST_NIL, path, "online", "%d", &value, NULL); + if (error != 0) { + /* Default to not online. */ + value = 0; + } - return (result); + return (value); } diff --git a/sys/xen/xenbus/xenbus_comms.c b/sys/xen/xenbus/xenbus_comms.c deleted file mode 100644 index 2f03955..0000000 --- a/sys/xen/xenbus/xenbus_comms.c +++ /dev/null @@ -1,226 +0,0 @@ -/****************************************************************************** - * xenbus_comms.c - * - * Low level code to talks to Xen Store: ringbuffer and event channel. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/bus.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/sx.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/syslog.h> - -#include <machine/xen/xen-os.h> -#include <xen/hypervisor.h> - -#include <xen/xen_intr.h> -#include <xen/evtchn.h> -#include <xen/interface/io/xs_wire.h> -#include <xen/xenbus/xenbus_comms.h> - -static unsigned int xenstore_irq; - -static inline struct xenstore_domain_interface * -xenstore_domain_interface(void) -{ - - return (struct xenstore_domain_interface *)xen_store; -} - -static void -xb_intr(void * arg __attribute__((unused))) -{ - - wakeup(xen_store); -} - -static int -xb_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) -{ - - return ((prod - cons) <= XENSTORE_RING_SIZE); -} - -static void * -xb_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, - char *buf, uint32_t *len) -{ - - *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); - if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) - *len = XENSTORE_RING_SIZE - (prod - cons); - return (buf + MASK_XENSTORE_IDX(prod)); -} - -static const void * -xb_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, - const char *buf, uint32_t *len) -{ - - *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); - if ((prod - cons) < *len) - *len = prod - cons; - return (buf + MASK_XENSTORE_IDX(cons)); -} - -int -xb_write(const void *tdata, unsigned len, struct lock_object *lock) -{ - struct xenstore_domain_interface *intf = xenstore_domain_interface(); - XENSTORE_RING_IDX cons, prod; - const char *data = (const char *)tdata; - int error; - - while (len != 0) { - void *dst; - unsigned int avail; - - while ((intf->req_prod - intf->req_cons) - == XENSTORE_RING_SIZE) { - error = _sleep(intf, - lock, - PCATCH, "xbwrite", hz/10); - if (error && error != EWOULDBLOCK) - return (error); - } - - /* Read indexes, then verify. */ - cons = intf->req_cons; - prod = intf->req_prod; - mb(); - if (!xb_check_indexes(cons, prod)) { - intf->req_cons = intf->req_prod = 0; - return (EIO); - } - - dst = xb_get_output_chunk(cons, prod, intf->req, &avail); - if (avail == 0) - continue; - if (avail > len) - avail = len; - mb(); - - memcpy(dst, data, avail); - data += avail; - len -= avail; - - /* Other side must not see new header until data is there. */ - wmb(); - intf->req_prod += avail; - - /* This implies mb() before other side sees interrupt. */ - notify_remote_via_evtchn(xen_store_evtchn); - } - - return (0); -} - -int -xb_read(void *tdata, unsigned len, struct lock_object *lock) -{ - struct xenstore_domain_interface *intf = xenstore_domain_interface(); - XENSTORE_RING_IDX cons, prod; - char *data = (char *)tdata; - int error; - - while (len != 0) { - unsigned int avail; - const char *src; - - while (intf->rsp_cons == intf->rsp_prod) { - error = _sleep(intf, lock, - PCATCH, "xbread", hz/10); - if (error && error != EWOULDBLOCK) - return (error); - } - - /* Read indexes, then verify. */ - cons = intf->rsp_cons; - prod = intf->rsp_prod; - if (!xb_check_indexes(cons, prod)) { - intf->rsp_cons = intf->rsp_prod = 0; - return (EIO); - } - - src = xb_get_input_chunk(cons, prod, intf->rsp, &avail); - if (avail == 0) - continue; - if (avail > len) - avail = len; - - /* We must read header before we read data. 
*/ - rmb(); - - memcpy(data, src, avail); - data += avail; - len -= avail; - - /* Other side must not see free space until we've copied out */ - mb(); - intf->rsp_cons += avail; - - /* Implies mb(): they will see new header. */ - notify_remote_via_evtchn(xen_store_evtchn); - } - - return (0); -} - -/* Set up interrupt handler off store event channel. */ -int -xb_init_comms(void) -{ - struct xenstore_domain_interface *intf = xenstore_domain_interface(); - int error; - - if (intf->rsp_prod != intf->rsp_cons) { - log(LOG_WARNING, "XENBUS response ring is not quiescent " - "(%08x:%08x): fixing up\n", - intf->rsp_cons, intf->rsp_prod); - intf->rsp_cons = intf->rsp_prod; - } - - if (xenstore_irq) - unbind_from_irqhandler(xenstore_irq); - - error = bind_caller_port_to_irqhandler( - xen_store_evtchn, "xenbus", - xb_intr, NULL, INTR_TYPE_NET, &xenstore_irq); - if (error) { - log(LOG_WARNING, "XENBUS request irq failed %i\n", error); - return (error); - } - - return (0); -} diff --git a/sys/xen/xenbus/xenbus_comms.h b/sys/xen/xenbus/xenbus_comms.h deleted file mode 100644 index fa47331..0000000 --- a/sys/xen/xenbus/xenbus_comms.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Private include for xenbus communications. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * $FreeBSD$ - */ - -#ifndef _XENBUS_COMMS_H -#define _XENBUS_COMMS_H - -struct sx; -extern int xen_store_evtchn; -extern char *xen_store; - -int xs_init(void); -int xb_init_comms(void); - -/* Low level routines. */ -int xb_write(const void *data, unsigned len, struct lock_object *); -int xb_read(void *data, unsigned len, struct lock_object *); -extern int xenbus_running; - -char *kasprintf(const char *fmt, ...); - - -#endif /* _XENBUS_COMMS_H */ diff --git a/sys/xen/xenbus/xenbus_if.m b/sys/xen/xenbus/xenbus_if.m index 018a2bb..d671418 100644 --- a/sys/xen/xenbus/xenbus_if.m +++ b/sys/xen/xenbus/xenbus_if.m @@ -31,7 +31,15 @@ INTERFACE xenbus; -METHOD int backend_changed { - device_t dev; - enum xenbus_state newstate; +/** + * \brief Callback triggered when the state of the otherend + * of a split device changes. + * + * \param _dev NewBus device_t for this XenBus device whose otherend's + * state has changed. + * \param _newstate The new state of the otherend device.
+ */ +METHOD int otherend_changed { + device_t _dev; + enum xenbus_state _newstate; }; diff --git a/sys/xen/xenbus/xenbus_probe.c b/sys/xen/xenbus/xenbus_probe.c deleted file mode 100644 index b1e9a21..0000000 --- a/sys/xen/xenbus/xenbus_probe.c +++ /dev/null @@ -1,602 +0,0 @@ -/****************************************************************************** - * Talks to Xen Store to figure out what devices we have. - * - * Copyright (C) 2008 Doug Rabson - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * Copyright (C) 2005 Mike Wray, Hewlett-Packard - * Copyright (C) 2005 XenSource Ltd - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#if 0 -#define DPRINTK(fmt, args...) \ - printf("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) -#else -#define DPRINTK(fmt, args...) ((void)0) -#endif - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/bus.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/module.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <sys/systm.h> -#include <sys/sx.h> -#include <sys/taskqueue.h> - -#include <machine/xen/xen-os.h> -#include <machine/stdarg.h> - -#include <xen/gnttab.h> -#include <xen/xenbus/xenbusvar.h> -#include <xen/xenbus/xenbus_comms.h> - -struct xenbus_softc { - struct xenbus_watch xs_devicewatch; - struct task xs_probechildren; - struct intr_config_hook xs_attachcb; - device_t xs_dev; -}; - -struct xenbus_device_ivars { - struct xenbus_watch xd_otherend_watch; /* must be first */ - struct sx xd_lock; - device_t xd_dev; - char *xd_node; /* node name in xenstore */ - char *xd_type; /* xen device type */ - enum xenbus_state xd_state; - int xd_otherend_id; - char *xd_otherend_path; -}; - -/* Simplified asprintf. */ -char * -kasprintf(const char *fmt, ...) -{ - va_list ap; - unsigned int len; - char *p, dummy[1]; - - va_start(ap, fmt); - /* FIXME: vsnprintf has a bug, NULL should work */ - len = vsnprintf(dummy, 0, fmt, ap); - va_end(ap); - - p = malloc(len + 1, M_DEVBUF, M_WAITOK); - va_start(ap, fmt); - vsprintf(p, fmt, ap); - va_end(ap); - return p; -} - -static void -xenbus_identify(driver_t *driver, device_t parent) -{ - - BUS_ADD_CHILD(parent, 0, "xenbus", 0); -} - -static int -xenbus_probe(device_t dev) -{ - int err = 0; - - DPRINTK(""); - - /* Initialize the interface to xenstore. 
*/ - err = xs_init(); - if (err) { - log(LOG_WARNING, - "XENBUS: Error initializing xenstore comms: %i\n", err); - return (ENXIO); - } - err = gnttab_init(); - if (err) { - log(LOG_WARNING, - "XENBUS: Error initializing grant table: %i\n", err); - return (ENXIO); - } - device_set_desc(dev, "Xen Devices"); - - return (0); -} - -static enum xenbus_state -xenbus_otherend_state(struct xenbus_device_ivars *ivars) -{ - - return (xenbus_read_driver_state(ivars->xd_otherend_path)); -} - -static void -xenbus_backend_changed(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - struct xenbus_device_ivars *ivars; - device_t dev; - enum xenbus_state newstate; - - ivars = (struct xenbus_device_ivars *) watch; - dev = ivars->xd_dev; - - if (!ivars->xd_otherend_path - || strncmp(ivars->xd_otherend_path, vec[XS_WATCH_PATH], - strlen(ivars->xd_otherend_path))) - return; - - newstate = xenbus_otherend_state(ivars); - XENBUS_BACKEND_CHANGED(dev, newstate); -} - -static int -xenbus_device_exists(device_t dev, const char *node) -{ - device_t *kids; - struct xenbus_device_ivars *ivars; - int i, count, result; - - if (device_get_children(dev, &kids, &count)) - return (FALSE); - - result = FALSE; - for (i = 0; i < count; i++) { - ivars = device_get_ivars(kids[i]); - if (!strcmp(ivars->xd_node, node)) { - result = TRUE; - break; - } - } - free(kids, M_TEMP); - - return (result); -} - -static int -xenbus_add_device(device_t dev, const char *bus, - const char *type, const char *id) -{ - device_t child; - struct xenbus_device_ivars *ivars; - enum xenbus_state state; - char *statepath; - int error; - - ivars = malloc(sizeof(struct xenbus_device_ivars), - M_DEVBUF, M_ZERO|M_WAITOK); - ivars->xd_node = kasprintf("%s/%s/%s", bus, type, id); - - if (xenbus_device_exists(dev, ivars->xd_node)) { - /* - * We are already tracking this node - */ - free(ivars->xd_node, M_DEVBUF); - free(ivars, M_DEVBUF); - return (0); - } - - state = xenbus_read_driver_state(ivars->xd_node); - - if (state != XenbusStateInitialising) { - /* - * Device is not new, so ignore it. This can - * happen if a device is going away after - * switching to Closed. 
- */ - free(ivars->xd_node, M_DEVBUF); - free(ivars, M_DEVBUF); - return (0); - } - - /* - * Find the backend details - */ - error = xenbus_gather(XBT_NIL, ivars->xd_node, - "backend-id", "%i", &ivars->xd_otherend_id, - "backend", NULL, &ivars->xd_otherend_path, - NULL); - if (error) - return (error); - - sx_init(&ivars->xd_lock, "xdlock"); - ivars->xd_type = strdup(type, M_DEVBUF); - ivars->xd_state = XenbusStateInitialising; - - statepath = malloc(strlen(ivars->xd_otherend_path) - + strlen("/state") + 1, M_DEVBUF, M_WAITOK); - sprintf(statepath, "%s/state", ivars->xd_otherend_path); - - ivars->xd_otherend_watch.node = statepath; - ivars->xd_otherend_watch.callback = xenbus_backend_changed; - - child = device_add_child(dev, NULL, -1); - ivars->xd_dev = child; - device_set_ivars(child, ivars); - - return (0); -} - -static int -xenbus_enumerate_type(device_t dev, const char *bus, const char *type) -{ - char **dir; - unsigned int i, count; - int error; - - error = xenbus_directory(XBT_NIL, bus, type, &count, &dir); - if (error) - return (error); - for (i = 0; i < count; i++) - xenbus_add_device(dev, bus, type, dir[i]); - - free(dir, M_DEVBUF); - - return (0); -} - -static int -xenbus_enumerate_bus(device_t dev, const char *bus) -{ - char **dir; - unsigned int i, count; - int error; - - error = xenbus_directory(XBT_NIL, bus, "", &count, &dir); - if (error) - return (error); - for (i = 0; i < count; i++) { - xenbus_enumerate_type(dev, bus, dir[i]); - } - free(dir, M_DEVBUF); - - return (0); -} - -static int -xenbus_probe_children(device_t dev) -{ - device_t *kids; - struct xenbus_device_ivars *ivars; - int i, count; - - /* - * Probe any new devices and register watches for any that - * attach successfully. Since part of the protocol which - * establishes a connection with the other end is interrupt - * driven, we sleep until the device reaches a stable state - * (closed or connected). 
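 *
 * The wakeup side of this handshake is xenbus_write_ivar() below: when
 * the child driver sets XENBUS_IVAR_STATE to Connected or Closed, it
 * calls wakeup(&ivars->xd_state), pairing with the
 *
 *	sx_sleep(&ivars->xd_state, &ivars->xd_lock, 0, "xdattach", 0);
 *
 * in the loop that follows.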
- */ - if (device_get_children(dev, &kids, &count) == 0) { - for (i = 0; i < count; i++) { - if (device_get_state(kids[i]) != DS_NOTPRESENT) - continue; - - if (device_probe_and_attach(kids[i])) - continue; - ivars = device_get_ivars(kids[i]); - register_xenbus_watch( - &ivars->xd_otherend_watch); - sx_xlock(&ivars->xd_lock); - while (ivars->xd_state != XenbusStateClosed - && ivars->xd_state != XenbusStateConnected) - sx_sleep(&ivars->xd_state, &ivars->xd_lock, - 0, "xdattach", 0); - sx_xunlock(&ivars->xd_lock); - } - free(kids, M_TEMP); - } - - return (0); -} - -static void -xenbus_probe_children_cb(void *arg, int pending) -{ - device_t dev = (device_t) arg; - - xenbus_probe_children(dev); -} - -static void -xenbus_devices_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - struct xenbus_softc *sc = (struct xenbus_softc *) watch; - device_t dev = sc->xs_dev; - char *node, *bus, *type, *id, *p; - - node = strdup(vec[XS_WATCH_PATH], M_DEVBUF); - p = strchr(node, '/'); - if (!p) - goto out; - bus = node; - *p = 0; - type = p + 1; - - p = strchr(type, '/'); - if (!p) - goto out; - *p = 0; - id = p + 1; - - p = strchr(id, '/'); - if (p) - *p = 0; - - xenbus_add_device(dev, bus, type, id); - taskqueue_enqueue(taskqueue_thread, &sc->xs_probechildren); -out: - free(node, M_DEVBUF); -} - -static void -xenbus_attach_deferred(void *arg) -{ - device_t dev = (device_t) arg; - struct xenbus_softc *sc = device_get_softc(dev); - int error; - - error = xenbus_enumerate_bus(dev, "device"); - if (error) - return; - xenbus_probe_children(dev); - - sc->xs_dev = dev; - sc->xs_devicewatch.node = "device"; - sc->xs_devicewatch.callback = xenbus_devices_changed; - - TASK_INIT(&sc->xs_probechildren, 0, xenbus_probe_children_cb, dev); - - register_xenbus_watch(&sc->xs_devicewatch); - - config_intrhook_disestablish(&sc->xs_attachcb); -} - -static int -xenbus_attach(device_t dev) -{ - struct xenbus_softc *sc = device_get_softc(dev); - - sc->xs_attachcb.ich_func = xenbus_attach_deferred; - sc->xs_attachcb.ich_arg = dev; - config_intrhook_establish(&sc->xs_attachcb); - - return (0); -} - -static int -xenbus_suspend(device_t dev) -{ - int error; - - DPRINTK(""); - - error = bus_generic_suspend(dev); - if (error) - return (error); - - xs_suspend(); - - return (0); -} - -static int -xenbus_resume(device_t dev) -{ - device_t *kids; - struct xenbus_device_ivars *ivars; - int i, count, error; - char *statepath; - - xb_init_comms(); - xs_resume(); - - /* - * We must re-examine each device and find the new path for - * its backend. - */ - if (device_get_children(dev, &kids, &count) == 0) { - for (i = 0; i < count; i++) { - if (device_get_state(kids[i]) == DS_NOTPRESENT) - continue; - - ivars = device_get_ivars(kids[i]); - - unregister_xenbus_watch( - &ivars->xd_otherend_watch); - ivars->xd_state = XenbusStateInitialising; - - /* - * Find the new backend details and - * re-register our watch. 
- */ - free(ivars->xd_otherend_path, M_DEVBUF); - error = xenbus_gather(XBT_NIL, ivars->xd_node, - "backend-id", "%i", &ivars->xd_otherend_id, - "backend", NULL, &ivars->xd_otherend_path, - NULL); - if (error) - return (error); - - DEVICE_RESUME(kids[i]); - - statepath = malloc(strlen(ivars->xd_otherend_path) - + strlen("/state") + 1, M_DEVBUF, M_WAITOK); - sprintf(statepath, "%s/state", ivars->xd_otherend_path); - - free(ivars->xd_otherend_watch.node, M_DEVBUF); - ivars->xd_otherend_watch.node = statepath; - register_xenbus_watch( - &ivars->xd_otherend_watch); - -#if 0 - /* - * Can't do this yet since we are running in - * the xenwatch thread and if we sleep here, - * we will stop delivering watch notifications - * and the device will never come back online. - */ - sx_xlock(&ivars->xd_lock); - while (ivars->xd_state != XenbusStateClosed - && ivars->xd_state != XenbusStateConnected) - sx_sleep(&ivars->xd_state, &ivars->xd_lock, - 0, "xdresume", 0); - sx_xunlock(&ivars->xd_lock); -#endif - } - free(kids, M_TEMP); - } - - return (0); -} - -static int -xenbus_print_child(device_t dev, device_t child) -{ - struct xenbus_device_ivars *ivars = device_get_ivars(child); - int retval = 0; - - retval += bus_print_child_header(dev, child); - retval += printf(" at %s", ivars->xd_node); - retval += bus_print_child_footer(dev, child); - - return (retval); -} - -static int -xenbus_read_ivar(device_t dev, device_t child, int index, - uintptr_t * result) -{ - struct xenbus_device_ivars *ivars = device_get_ivars(child); - - switch (index) { - case XENBUS_IVAR_NODE: - *result = (uintptr_t) ivars->xd_node; - return (0); - - case XENBUS_IVAR_TYPE: - *result = (uintptr_t) ivars->xd_type; - return (0); - - case XENBUS_IVAR_STATE: - *result = (uintptr_t) ivars->xd_state; - return (0); - - case XENBUS_IVAR_OTHEREND_ID: - *result = (uintptr_t) ivars->xd_otherend_id; - return (0); - - case XENBUS_IVAR_OTHEREND_PATH: - *result = (uintptr_t) ivars->xd_otherend_path; - return (0); - } - - return (ENOENT); -} - -static int -xenbus_write_ivar(device_t dev, device_t child, int index, uintptr_t value) -{ - struct xenbus_device_ivars *ivars = device_get_ivars(child); - enum xenbus_state newstate; - int currstate; - int error; - - switch (index) { - case XENBUS_IVAR_STATE: - newstate = (enum xenbus_state) value; - sx_xlock(&ivars->xd_lock); - if (ivars->xd_state == newstate) - goto out; - - error = xenbus_scanf(XBT_NIL, ivars->xd_node, "state", - NULL, "%d", &currstate); - if (error) - goto out; - - error = xenbus_printf(XBT_NIL, ivars->xd_node, "state", - "%d", newstate); - if (error) { - if (newstate != XenbusStateClosing) /* Avoid looping */ - xenbus_dev_fatal(dev, error, "writing new state"); - goto out; - } - ivars->xd_state = newstate; - wakeup(&ivars->xd_state); - out: - sx_xunlock(&ivars->xd_lock); - return (0); - - case XENBUS_IVAR_NODE: - case XENBUS_IVAR_TYPE: - case XENBUS_IVAR_OTHEREND_ID: - case XENBUS_IVAR_OTHEREND_PATH: - /* - * These variables are read-only. 
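 *
 * (Child drivers normally reach these ivars through the generated
 * accessor wrappers rather than calling BUS_READ_IVAR() directly, e.g.
 *
 *	const char *node = xenbus_get_node(dev);
 *	int domid = xenbus_get_otherend_id(dev);
 *
 * both of which resolve to reads of the fields above.)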
- */ - return (EINVAL); - } - - return (ENOENT); -} - -SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen"); -SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xen_store_evtchn, 0, ""); -SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, ""); - -static device_method_t xenbus_methods[] = { - /* Device interface */ - DEVMETHOD(device_identify, xenbus_identify), - DEVMETHOD(device_probe, xenbus_probe), - DEVMETHOD(device_attach, xenbus_attach), - DEVMETHOD(device_detach, bus_generic_detach), - DEVMETHOD(device_shutdown, bus_generic_shutdown), - DEVMETHOD(device_suspend, xenbus_suspend), - DEVMETHOD(device_resume, xenbus_resume), - - /* Bus interface */ - DEVMETHOD(bus_print_child, xenbus_print_child), - DEVMETHOD(bus_read_ivar, xenbus_read_ivar), - DEVMETHOD(bus_write_ivar, xenbus_write_ivar), - - { 0, 0 } -}; - -static char driver_name[] = "xenbus"; -static driver_t xenbus_driver = { - driver_name, - xenbus_methods, - sizeof(struct xenbus_softc), -}; -devclass_t xenbus_devclass; - -#ifdef XENHVM -DRIVER_MODULE(xenbus, xenpci, xenbus_driver, xenbus_devclass, 0, 0); -#else -DRIVER_MODULE(xenbus, nexus, xenbus_driver, xenbus_devclass, 0, 0); -#endif diff --git a/sys/xen/xenbus/xenbus_probe_backend.c b/sys/xen/xenbus/xenbus_probe_backend.c deleted file mode 100644 index 20cc49f..0000000 --- a/sys/xen/xenbus/xenbus_probe_backend.c +++ /dev/null @@ -1,308 +0,0 @@ -/****************************************************************************** - * Talks to Xen Store to figure out what devices we have (backend half). - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * Copyright (C) 2005 Mike Wray, Hewlett-Packard - * Copyright (C) 2005, 2006 XenSource Ltd - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#if 0 -#define DPRINTK(fmt, args...) \ - printf("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) -#else -#define DPRINTK(fmt, args...) 
((void)0) -#endif - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/types.h> -#include <sys/cdefs.h> -#include <sys/time.h> -#include <sys/sema.h> -#include <sys/eventhandler.h> -#include <sys/errno.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/module.h> -#include <sys/conf.h> -#include <sys/systm.h> -#include <sys/syslog.h> -#include <sys/proc.h> -#include <sys/bus.h> -#include <sys/sx.h> - -#include <machine/xen/xen-os.h> -#include <xen/hypervisor.h> -#include <machine/xen/xenbus.h> -#include <machine/stdarg.h> - -#include <xen/evtchn.h> -#include <xen/xenbus/xenbus_comms.h> - -#define BUG_ON PANIC_IF -#define semaphore sema -#define rw_semaphore sema -#define DEFINE_SPINLOCK(lock) struct mtx lock -#define DECLARE_MUTEX(lock) struct sema lock -#define u32 uint32_t -#define list_del(head, ent) TAILQ_REMOVE(head, ent, list) -#define simple_strtoul strtoul -#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) -#define list_empty TAILQ_EMPTY - -extern struct xendev_list_head xenbus_device_backend_list; -#if 0 -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); -#endif -static int xenbus_probe_backend(const char *type, const char *domid); - -static int read_frontend_details(struct xenbus_device *xendev) -{ - return read_otherend_details(xendev, "frontend-id", "frontend"); -} - -/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ -static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) -{ - int domid, err; - const char *devid, *type, *frontend; - unsigned int typelen; - - type = strchr(nodename, '/'); - if (!type) - return -EINVAL; - type++; - typelen = strcspn(type, "/"); - if (!typelen || type[typelen] != '/') - return -EINVAL; - - devid = strrchr(nodename, '/') + 1; - - err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, - "frontend", NULL, &frontend, - NULL); - if (err) - return err; - if (strlen(frontend) == 0) - err = -ERANGE; - if (!err && !xenbus_exists(XBT_NIL, frontend, "")) - err = -ENOENT; - kfree(frontend); - - if (err) - return err; - - if (snprintf(bus_id, BUS_ID_SIZE, - "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) - return -ENOSPC; - return 0; -} - -static struct xen_bus_type xenbus_backend = { - .root = "backend", - .levels = 3, /* backend/type/<frontend>/<id> */ - .get_bus_id = backend_bus_id, - .probe = xenbus_probe_backend, - .bus = &xenbus_device_backend_list, - -#if 0 - .error = -ENODEV, - .bus = { - .name = "xen-backend", - .match = xenbus_match, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, -// .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_backend, - }, - .dev = { - .bus_id = "xen-backend", - }, -#endif -}; - -#if 0 -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) -{ - struct xenbus_device *xdev; - struct xenbus_driver *drv; - int i = 0; - int length = 0; - - DPRINTK(""); - - if (dev == NULL) - return -ENODEV; - - xdev = to_xenbus_device(dev); - if (xdev == NULL) - return -ENODEV; -2 - /* stuff we want to pass to /sbin/hotplug */ - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_TYPE=%s", xdev->devicetype); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_PATH=%s", xdev->nodename); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_BASE_PATH=%s", xenbus_backend.root); - - /* terminate, set to next free slot, shrink 
available space */ - envp[i] = NULL; - envp = &envp[i]; - num_envp -= i; - buffer = &buffer[length]; - buffer_size -= length; - - if (dev->driver) { - drv = to_xenbus_driver(dev->driver); - if (drv && drv->uevent) - return drv->uevent(xdev, envp, num_envp, buffer, - buffer_size); - } - - return 0; -} -#endif - -int xenbus_register_backend(struct xenbus_driver *drv) -{ - drv->read_otherend_details = read_frontend_details; - - return xenbus_register_driver_common(drv, &xenbus_backend); -} - -/* backend/<typename>/<frontend-uuid>/<name> */ -static int xenbus_probe_backend_unit(const char *dir, - const char *type, - const char *name) -{ - char *nodename; - int err; - - nodename = kasprintf("%s/%s", dir, name); - if (!nodename) - return -ENOMEM; - - DPRINTK("%s\n", nodename); - - err = xenbus_probe_node(&xenbus_backend, type, nodename); - kfree(nodename); - return err; -} - -/* backend/<typename>/<frontend-domid> */ -static int xenbus_probe_backend(const char *type, const char *domid) -{ - char *nodename; - int err = 0; - char **dir; - unsigned int i, dir_n = 0; - - DPRINTK(""); - - nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid); - if (!nodename) - return -ENOMEM; - - dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); - if (IS_ERR(dir)) { - kfree(nodename); - return PTR_ERR(dir); - } - - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_backend_unit(nodename, type, dir[i]); - if (err) - break; - } - kfree(dir); - kfree(nodename); - return err; -} - -static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - DPRINTK(""); - - dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); -} - -static struct xenbus_watch be_watch = { - .node = "backend", - .callback = backend_changed, -}; -#if 0 -void xenbus_backend_suspend(int (*fn)(struct device *, void *)) -{ - DPRINTK(""); - if (!xenbus_backend.error) - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); -} - -void xenbus_backend_resume(int (*fn)(struct device *, void *)) -{ - DPRINTK(""); - if (!xenbus_backend.error) - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); -} -#endif -void xenbus_backend_probe_and_watch(void) -{ - xenbus_probe_devices(&xenbus_backend); - register_xenbus_watch(&be_watch); -} - -#if 0 -void xenbus_backend_bus_register(void) -{ - xenbus_backend.error = bus_register(&xenbus_backend.bus); - if (xenbus_backend.error) - log(LOG_WARNING, - "XENBUS: Error registering backend bus: %i\n", - xenbus_backend.error); -} - -void xenbus_backend_device_register(void) -{ - if (xenbus_backend.error) - return; - - xenbus_backend.error = device_register(&xenbus_backend.dev); - if (xenbus_backend.error) { - bus_unregister(&xenbus_backend.bus); - log(LOG_WARNING, - "XENBUS: Error registering backend device: %i\n", - xenbus_backend.error); - } -} -#endif diff --git a/sys/xen/xenbus/xenbus_xs.c b/sys/xen/xenbus/xenbus_xs.c deleted file mode 100644 index 9312255..0000000 --- a/sys/xen/xenbus/xenbus_xs.c +++ /dev/null @@ -1,935 +0,0 @@ -/****************************************************************************** - * xenbus_xs.c - * - * This is the kernel equivalent of the "xs" library. We don't need everything - * and we use xenbus_comms for communication. 
- * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/uio.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/sx.h> -#include <sys/syslog.h> -#include <sys/malloc.h> -#include <sys/systm.h> -#include <sys/proc.h> -#include <sys/kthread.h> -#include <sys/unistd.h> - -#include <machine/xen/xen-os.h> -#include <xen/hypervisor.h> -#include <machine/stdarg.h> - -#include <xen/xenbus/xenbusvar.h> -#include <xen/xenbus/xenbus_comms.h> -#include <xen/interface/hvm/params.h> - -#include <vm/vm.h> -#include <vm/pmap.h> - -static int xs_process_msg(enum xsd_sockmsg_type *type); - -int xenwatch_running = 0; -int xenbus_running = 0; -int xen_store_evtchn; - -struct xs_stored_msg { - TAILQ_ENTRY(xs_stored_msg) list; - - struct xsd_sockmsg hdr; - - union { - /* Queued replies. */ - struct { - char *body; - } reply; - - /* Queued watch events. */ - struct { - struct xenbus_watch *handle; - char **vec; - unsigned int vec_size; - } watch; - } u; -}; - -struct xs_handle { - /* A list of replies. Currently only one will ever be outstanding. */ - TAILQ_HEAD(xs_handle_list, xs_stored_msg) reply_list; - struct mtx reply_lock; - int reply_waitq; - - /* One request at a time. */ - struct sx request_mutex; - - /* Protect transactions against save/restore. */ - struct sx suspend_mutex; -}; - -static struct xs_handle xs_state; - -/* List of registered watches, and a lock to protect it. */ -static LIST_HEAD(watch_list_head, xenbus_watch) watches; -static struct mtx watches_lock; -/* List of pending watch callback events, and a lock to protect it. */ -static TAILQ_HEAD(event_list_head, xs_stored_msg) watch_events; -static struct mtx watch_events_lock; - -/* - * Details of the xenwatch callback kernel thread. The thread waits on the - * watch_events_waitq for work to do (queued on watch_events list). When it - * wakes up it acquires the xenwatch_mutex before reading the list and - * carrying out work. 
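 *
 * A rough sketch of the message flow implemented below:
 *
 *	xenbus_thread:   xs_process_msg()
 *	    XS_WATCH_EVENT -> queue on watch_events, wakeup()
 *	    anything else  -> queue on xs_state.reply_list
 *	xenwatch_thread: dequeue watch_events, invoke watch->callback()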
- */ -static pid_t xenwatch_pid; -struct sx xenwatch_mutex; -static int watch_events_waitq; - -#define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0])) - -static int -xs_get_error(const char *errorstring) -{ - unsigned int i; - - for (i = 0; i < xsd_error_count; i++) { - if (!strcmp(errorstring, xsd_errors[i].errstring)) - return (xsd_errors[i].errnum); - } - log(LOG_WARNING, "XENBUS xen store gave: unknown error %s", - errorstring); - return (EINVAL); -} - -extern void kdb_backtrace(void); - -static int -xs_read_reply(enum xsd_sockmsg_type *type, unsigned int *len, void **result) -{ - struct xs_stored_msg *msg; - char *body; - int error; - - mtx_lock(&xs_state.reply_lock); - - while (TAILQ_EMPTY(&xs_state.reply_list)) { - while (TAILQ_EMPTY(&xs_state.reply_list)) { - error = mtx_sleep(&xs_state.reply_waitq, - &xs_state.reply_lock, - PCATCH, "xswait", hz/10); - if (error && error != EWOULDBLOCK) { - mtx_unlock(&xs_state.reply_lock); - return (error); - } - } - } - - msg = TAILQ_FIRST(&xs_state.reply_list); - TAILQ_REMOVE(&xs_state.reply_list, msg, list); - - mtx_unlock(&xs_state.reply_lock); - - *type = msg->hdr.type; - if (len) - *len = msg->hdr.len; - body = msg->u.reply.body; - - free(msg, M_DEVBUF); - *result = body; - return (0); -} - -#if 0 -/* Emergency write. UNUSED*/ -void xenbus_debug_write(const char *str, unsigned int count) -{ - struct xsd_sockmsg msg = { 0 }; - - msg.type = XS_DEBUG; - msg.len = sizeof("print") + count + 1; - - sx_xlock(&xs_state.request_mutex); - xb_write(&msg, sizeof(msg)); - xb_write("print", sizeof("print")); - xb_write(str, count); - xb_write("", 1); - sx_xunlock(&xs_state.request_mutex); -} - -#endif - -int -xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result) -{ - struct xsd_sockmsg req_msg = *msg; - int error; - - if (req_msg.type == XS_TRANSACTION_START) - sx_slock(&xs_state.suspend_mutex); - - sx_xlock(&xs_state.request_mutex); - - error = xb_write(msg, sizeof(*msg) + msg->len, - &xs_state.request_mutex.lock_object); - if (error) { - msg->type = XS_ERROR; - } else { - error = xs_read_reply(&msg->type, &msg->len, result); - } - - sx_xunlock(&xs_state.request_mutex); - - if ((msg->type == XS_TRANSACTION_END) || - ((req_msg.type == XS_TRANSACTION_START) && - (msg->type == XS_ERROR))) - sx_sunlock(&xs_state.suspend_mutex); - - return (error); -} - -/* - * Send message to xs. The reply is returned in *result and should be - * fred with free(*result, M_DEVBUF). Return zero on success or an - * error code on failure. 
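 *
 * On the wire, every request is a struct xsd_sockmsg header followed by
 * msg.len bytes of payload.  For example, a sketch of what the code
 * below emits for a single-iovec XS_READ of a path (path illustrative):
 *
 *	msg.tx_id = t.id; msg.req_id = 0; msg.type = XS_READ;
 *	msg.len = strlen(path) + 1;
 *	xb_write(&msg, sizeof(msg), ...);
 *	xb_write(path, msg.len, ...);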
- */ -static int -xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type, - const struct iovec *iovec, unsigned int num_vecs, - unsigned int *len, void **result) -{ - struct xsd_sockmsg msg; - void *ret = NULL; - unsigned int i; - int error; - - msg.tx_id = t.id; - msg.req_id = 0; - msg.type = type; - msg.len = 0; - for (i = 0; i < num_vecs; i++) - msg.len += iovec[i].iov_len; - - sx_xlock(&xs_state.request_mutex); - - error = xb_write(&msg, sizeof(msg), - &xs_state.request_mutex.lock_object); - if (error) { - sx_xunlock(&xs_state.request_mutex); - printf("xs_talkv failed %d\n", error); - return (error); - } - - for (i = 0; i < num_vecs; i++) { - error = xb_write(iovec[i].iov_base, iovec[i].iov_len, - &xs_state.request_mutex.lock_object); - if (error) { - sx_xunlock(&xs_state.request_mutex); - printf("xs_talkv failed %d\n", error); - return (error); - } - } - - error = xs_read_reply(&msg.type, len, &ret); - - sx_xunlock(&xs_state.request_mutex); - - if (error) - return (error); - - if (msg.type == XS_ERROR) { - error = xs_get_error(ret); - free(ret, M_DEVBUF); - return (error); - } - -#if 0 - if ((xenwatch_running == 0) && (xenwatch_inline == 0)) { - xenwatch_inline = 1; - while (!TAILQ_EMPTY(&watch_events) - && xenwatch_running == 0) { - - struct xs_stored_msg *wmsg = TAILQ_FIRST(&watch_events); - TAILQ_REMOVE(&watch_events, wmsg, list); - - wmsg->u.watch.handle->callback( - wmsg->u.watch.handle, - (const char **)wmsg->u.watch.vec, - wmsg->u.watch.vec_size); - free(wmsg->u.watch.vec, M_DEVBUF); - free(wmsg, M_DEVBUF); - } - xenwatch_inline = 0; - } -#endif - KASSERT(msg.type == type, ("bad xenstore message type")); - - if (result) - *result = ret; - else - free(ret, M_DEVBUF); - - return (0); -} - -/* Simplified version of xs_talkv: single message. */ -static int -xs_single(struct xenbus_transaction t, enum xsd_sockmsg_type type, - const char *string, unsigned int *len, void **result) -{ - struct iovec iovec; - - iovec.iov_base = (void *)(uintptr_t) string; - iovec.iov_len = strlen(string) + 1; - - return (xs_talkv(t, type, &iovec, 1, len, result)); -} - -static unsigned int -count_strings(const char *strings, unsigned int len) -{ - unsigned int num; - const char *p; - - for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) - num++; - - return num; -} - -/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */ -static char * -join(const char *dir, const char *name) -{ - char *buffer; - - buffer = malloc(strlen(dir) + strlen("/") + strlen(name) + 1, - M_DEVBUF, M_WAITOK); - - strcpy(buffer, dir); - if (strcmp(name, "")) { - strcat(buffer, "/"); - strcat(buffer, name); - } - - return (buffer); -} - -static char ** -split(char *strings, unsigned int len, unsigned int *num) -{ - char *p, **ret; - - /* Count the strings. */ - *num = count_strings(strings, len) + 1; - - /* Transfer to one big alloc for easy freeing. */ - ret = malloc(*num * sizeof(char *) + len, M_DEVBUF, M_WAITOK); - memcpy(&ret[*num], strings, len); - free(strings, M_DEVBUF); - - strings = (char *)&ret[*num]; - for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1) - ret[(*num)++] = p; - - ret[*num] = strings + len; - - return ret; -} - -/* - * Return the contents of a directory in *result which should be freed - * with free(*result, M_DEVBUF). 
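 *
 * Example usage (illustrative): enumerating the vif frontends of a guest
 *
 *	char **dir;
 *	unsigned int n, i;
 *	if (xenbus_directory(XBT_NIL, "device", "vif", &n, &dir) == 0) {
 *		for (i = 0; i < n; i++)
 *			printf("vif instance: %s\n", dir[i]);
 *		free(dir, M_DEVBUF);
 *	}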
- */ -int -xenbus_directory(struct xenbus_transaction t, const char *dir, - const char *node, unsigned int *num, char ***result) -{ - char *strings, *path; - unsigned int len = 0; - int error; - - path = join(dir, node); - error = xs_single(t, XS_DIRECTORY, path, &len, (void **) &strings); - free(path, M_DEVBUF); - if (error) - return (error); - - *result = split(strings, len, num); - return (0); -} - -/* - * Check if a path exists. Return 1 if it does. - */ -int -xenbus_exists(struct xenbus_transaction t, const char *dir, const char *node) -{ - char **d; - int error, dir_n; - - error = xenbus_directory(t, dir, node, &dir_n, &d); - if (error) - return (0); - free(d, M_DEVBUF); - return (1); -} - -/* - * Get the value of a single file. Returns the contents in *result - * which should be freed with free(*result, M_DEVBUF) after use. - * The length of the value in bytes is returned in *len. - */ -int -xenbus_read(struct xenbus_transaction t, const char *dir, const char *node, - unsigned int *len, void **result) -{ - char *path; - void *ret; - int error; - - path = join(dir, node); - error = xs_single(t, XS_READ, path, len, &ret); - free(path, M_DEVBUF); - if (error) - return (error); - *result = ret; - return (0); -} - -/* - * Write the value of a single file. Returns error on failure. - */ -int -xenbus_write(struct xenbus_transaction t, const char *dir, const char *node, - const char *string) -{ - char *path; - struct iovec iovec[2]; - int error; - - path = join(dir, node); - - iovec[0].iov_base = (void *)(uintptr_t) path; - iovec[0].iov_len = strlen(path) + 1; - iovec[1].iov_base = (void *)(uintptr_t) string; - iovec[1].iov_len = strlen(string); - - error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL); - free(path, M_DEVBUF); - - return (error); -} - -/* - * Create a new directory. - */ -int -xenbus_mkdir(struct xenbus_transaction t, const char *dir, const char *node) -{ - char *path; - int ret; - - path = join(dir, node); - ret = xs_single(t, XS_MKDIR, path, NULL, NULL); - free(path, M_DEVBUF); - - return (ret); -} - -/* - * Destroy a file or directory (directories must be empty). - */ -int -xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node) -{ - char *path; - int ret; - - path = join(dir, node); - ret = xs_single(t, XS_RM, path, NULL, NULL); - free(path, M_DEVBUF); - - return (ret); -} - -/* - * Start a transaction: changes by others will not be seen during this - * transaction, and changes will not be visible to others until end. - */ -int -xenbus_transaction_start(struct xenbus_transaction *t) -{ - char *id_str; - int error; - - sx_slock(&xs_state.suspend_mutex); - error = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL, - (void **) &id_str); - if (error) { - sx_sunlock(&xs_state.suspend_mutex); - return (error); - } - - t->id = strtoul(id_str, NULL, 0); - free(id_str, M_DEVBUF); - - return (0); -} - -/* - * End a transaction. If abandon is true, transaction is discarded - * instead of committed. - */ -int xenbus_transaction_end(struct xenbus_transaction t, int abort) -{ - char abortstr[2]; - int error; - - if (abort) - strcpy(abortstr, "F"); - else - strcpy(abortstr, "T"); - - error = xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL); - - sx_sunlock(&xs_state.suspend_mutex); - - return (error); -} - -/* Single read and scanf: returns zero or errno. */ -int -xenbus_scanf(struct xenbus_transaction t, - const char *dir, const char *node, int *scancountp, const char *fmt, ...) 
-{ - va_list ap; - int error, ns; - char *val; - - error = xenbus_read(t, dir, node, NULL, (void **) &val); - if (error) - return (error); - - va_start(ap, fmt); - ns = vsscanf(val, fmt, ap); - va_end(ap); - free(val, M_DEVBUF); - /* Distinctive errno. */ - if (ns == 0) - return (ERANGE); - if (scancountp) - *scancountp = ns; - return (0); -} - -/* Single printf and write: returns zero or errno. */ -int -xenbus_printf(struct xenbus_transaction t, - const char *dir, const char *node, const char *fmt, ...) -{ - va_list ap; - int error, ret; -#define PRINTF_BUFFER_SIZE 4096 - char *printf_buffer; - - printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_DEVBUF, M_WAITOK); - - va_start(ap, fmt); - ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap); - va_end(ap); - - KASSERT(ret <= PRINTF_BUFFER_SIZE-1, ("xenbus_printf: message too large")); - error = xenbus_write(t, dir, node, printf_buffer); - - free(printf_buffer, M_DEVBUF); - - return (error); -} - -/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */ -int -xenbus_gather(struct xenbus_transaction t, const char *dir, ...) -{ - va_list ap; - const char *name; - int error, i; - - for (i = 0; i < 10000; i++) - HYPERVISOR_yield(); - - va_start(ap, dir); - error = 0; - while (error == 0 && (name = va_arg(ap, char *)) != NULL) { - const char *fmt = va_arg(ap, char *); - void *result = va_arg(ap, void *); - char *p; - - error = xenbus_read(t, dir, name, NULL, (void **) &p); - if (error) - break; - - if (fmt) { - if (sscanf(p, fmt, result) == 0) - error = EINVAL; - free(p, M_DEVBUF); - } else - *(char **)result = p; - } - va_end(ap); - - return (error); -} - -static int -xs_watch(const char *path, const char *token) -{ - struct iovec iov[2]; - - iov[0].iov_base = (void *)(uintptr_t) path; - iov[0].iov_len = strlen(path) + 1; - iov[1].iov_base = (void *)(uintptr_t) token; - iov[1].iov_len = strlen(token) + 1; - - return (xs_talkv(XBT_NIL, XS_WATCH, iov, 2, NULL, NULL)); -} - -static int -xs_unwatch(const char *path, const char *token) -{ - struct iovec iov[2]; - - iov[0].iov_base = (void *)(uintptr_t) path; - iov[0].iov_len = strlen(path) + 1; - iov[1].iov_base = (void *)(uintptr_t) token; - iov[1].iov_len = strlen(token) + 1; - - return (xs_talkv(XBT_NIL, XS_UNWATCH, iov, 2, NULL, NULL)); -} - -static struct xenbus_watch * -find_watch(const char *token) -{ - struct xenbus_watch *i, *cmp; - - cmp = (void *)strtoul(token, NULL, 16); - - LIST_FOREACH(i, &watches, list) - if (i == cmp) - return (i); - - return (NULL); -} - -/* Register callback to watch this node. */ -int -register_xenbus_watch(struct xenbus_watch *watch) -{ - /* Pointer in ascii is the token. */ - char token[sizeof(watch) * 2 + 1]; - int error; - - sprintf(token, "%lX", (long)watch); - - sx_slock(&xs_state.suspend_mutex); - - mtx_lock(&watches_lock); - KASSERT(find_watch(token) == NULL, ("watch already registered")); - LIST_INSERT_HEAD(&watches, watch, list); - mtx_unlock(&watches_lock); - - error = xs_watch(watch->node, token); - - /* Ignore errors due to multiple registration. 
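 *
 * (The token sent to xenstored is simply the watch pointer rendered in
 * hex by sprintf(token, "%lX", (long)watch), so EEXIST can only mean
 * that this exact watch object was registered twice; find_watch()
 * inverts the encoding with strtoul(token, NULL, 16) when events
 * come back.)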
*/ - if (error == EEXIST) { - mtx_lock(&watches_lock); - LIST_REMOVE(watch, list); - mtx_unlock(&watches_lock); - } - - sx_sunlock(&xs_state.suspend_mutex); - - return (error); -} - -void -unregister_xenbus_watch(struct xenbus_watch *watch) -{ - struct xs_stored_msg *msg, *tmp; - char token[sizeof(watch) * 2 + 1]; - int error; - - sprintf(token, "%lX", (long)watch); - - sx_slock(&xs_state.suspend_mutex); - - mtx_lock(&watches_lock); - KASSERT(find_watch(token), ("watch not registered")); - LIST_REMOVE(watch, list); - mtx_unlock(&watches_lock); - - error = xs_unwatch(watch->node, token); - if (error) - log(LOG_WARNING, "XENBUS Failed to release watch %s: %i\n", - watch->node, error); - - sx_sunlock(&xs_state.suspend_mutex); - - /* Cancel pending watch events. */ - mtx_lock(&watch_events_lock); - TAILQ_FOREACH_SAFE(msg, &watch_events, list, tmp) { - if (msg->u.watch.handle != watch) - continue; - TAILQ_REMOVE(&watch_events, msg, list); - free(msg->u.watch.vec, M_DEVBUF); - free(msg, M_DEVBUF); - } - mtx_unlock(&watch_events_lock); - - /* Flush any currently-executing callback, unless we are it. :-) */ - if (curproc->p_pid != xenwatch_pid) { - sx_xlock(&xenwatch_mutex); - sx_xunlock(&xenwatch_mutex); - } -} - -void -xs_suspend(void) -{ - - sx_xlock(&xs_state.suspend_mutex); - sx_xlock(&xs_state.request_mutex); -} - -void -xs_resume(void) -{ - struct xenbus_watch *watch; - char token[sizeof(watch) * 2 + 1]; - - sx_xunlock(&xs_state.request_mutex); - - /* No need for watches_lock: the suspend_mutex is sufficient. */ - LIST_FOREACH(watch, &watches, list) { - sprintf(token, "%lX", (long)watch); - xs_watch(watch->node, token); - } - - sx_xunlock(&xs_state.suspend_mutex); -} - -static void -xenwatch_thread(void *unused) -{ - struct xs_stored_msg *msg; - - for (;;) { - - mtx_lock(&watch_events_lock); - while (TAILQ_EMPTY(&watch_events)) - mtx_sleep(&watch_events_waitq, - &watch_events_lock, - PWAIT | PCATCH, "waitev", hz/10); - - mtx_unlock(&watch_events_lock); - sx_xlock(&xenwatch_mutex); - - mtx_lock(&watch_events_lock); - msg = TAILQ_FIRST(&watch_events); - if (msg) - TAILQ_REMOVE(&watch_events, msg, list); - mtx_unlock(&watch_events_lock); - - if (msg != NULL) { - /* - * XXX There are messages coming in with a NULL callback. - * XXX This deserves further investigation; the workaround - * XXX here simply prevents the kernel from panic'ing - * XXX on startup. 
- */ - if (msg->u.watch.handle->callback != NULL) - msg->u.watch.handle->callback( - msg->u.watch.handle, - (const char **)msg->u.watch.vec, - msg->u.watch.vec_size); - free(msg->u.watch.vec, M_DEVBUF); - free(msg, M_DEVBUF); - } - - sx_xunlock(&xenwatch_mutex); - } -} - -static int -xs_process_msg(enum xsd_sockmsg_type *type) -{ - struct xs_stored_msg *msg; - char *body; - int error; - - msg = malloc(sizeof(*msg), M_DEVBUF, M_WAITOK); - mtx_lock(&xs_state.reply_lock); - error = xb_read(&msg->hdr, sizeof(msg->hdr), - &xs_state.reply_lock.lock_object); - mtx_unlock(&xs_state.reply_lock); - if (error) { - free(msg, M_DEVBUF); - return (error); - } - - body = malloc(msg->hdr.len + 1, M_DEVBUF, M_WAITOK); - mtx_lock(&xs_state.reply_lock); - error = xb_read(body, msg->hdr.len, - &xs_state.reply_lock.lock_object); - mtx_unlock(&xs_state.reply_lock); - if (error) { - free(body, M_DEVBUF); - free(msg, M_DEVBUF); - return (error); - } - body[msg->hdr.len] = '\0'; - - *type = msg->hdr.type; - if (msg->hdr.type == XS_WATCH_EVENT) { - msg->u.watch.vec = split(body, msg->hdr.len, - &msg->u.watch.vec_size); - - mtx_lock(&watches_lock); - msg->u.watch.handle = find_watch( - msg->u.watch.vec[XS_WATCH_TOKEN]); - if (msg->u.watch.handle != NULL) { - mtx_lock(&watch_events_lock); - TAILQ_INSERT_TAIL(&watch_events, msg, list); - wakeup(&watch_events_waitq); - mtx_unlock(&watch_events_lock); - } else { - free(msg->u.watch.vec, M_DEVBUF); - free(msg, M_DEVBUF); - } - mtx_unlock(&watches_lock); - } else { - msg->u.reply.body = body; - mtx_lock(&xs_state.reply_lock); - TAILQ_INSERT_TAIL(&xs_state.reply_list, msg, list); - wakeup(&xs_state.reply_waitq); - mtx_unlock(&xs_state.reply_lock); - } - - return 0; -} - -static void -xenbus_thread(void *unused) -{ - int error; - enum xsd_sockmsg_type type; - xenbus_running = 1; - - for (;;) { - error = xs_process_msg(&type); - if (error) - printf("XENBUS error %d while reading message\n", - error); - } -} - -#ifdef XENHVM -static unsigned long xen_store_mfn; -char *xen_store; - -static inline unsigned long -hvm_get_parameter(int index) -{ - struct xen_hvm_param xhv; - int error; - - xhv.domid = DOMID_SELF; - xhv.index = index; - error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); - if (error) { - printf("hvm_get_parameter: failed to get %d, error %d\n", - index, error); - return (0); - } - return (xhv.value); -} - -#endif - -int -xs_init(void) -{ - int error; - struct proc *p; - -#ifdef XENHVM - xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN); - xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN); - xen_store = pmap_mapdev(xen_store_mfn * PAGE_SIZE, PAGE_SIZE); -#else - xen_store_evtchn = xen_start_info->store_evtchn; -#endif - - TAILQ_INIT(&xs_state.reply_list); - TAILQ_INIT(&watch_events); - sx_init(&xenwatch_mutex, "xenwatch"); - - - mtx_init(&xs_state.reply_lock, "state reply", NULL, MTX_DEF); - sx_init(&xs_state.request_mutex, "xenstore request"); - sx_init(&xs_state.suspend_mutex, "xenstore suspend"); - - -#if 0 - mtx_init(&xs_state.suspend_mutex, "xenstore suspend", NULL, MTX_DEF); - sema_init(&xs_state.request_mutex, 1, "xenstore request"); - sema_init(&xenwatch_mutex, 1, "xenwatch"); -#endif - mtx_init(&watches_lock, "watches", NULL, MTX_DEF); - mtx_init(&watch_events_lock, "watch events", NULL, MTX_DEF); - - /* Initialize the shared memory rings to talk to xenstored */ - error = xb_init_comms(); - if (error) - return (error); - - xenwatch_running = 1; - error = kproc_create(xenwatch_thread, NULL, &p, - RFHIGHPID, 0, "xenwatch"); - if (error) - 
return (error); - xenwatch_pid = p->p_pid; - - error = kproc_create(xenbus_thread, NULL, NULL, - RFHIGHPID, 0, "xenbus"); - - return (error); -} diff --git a/sys/xen/xenbus/xenbusb.c b/sys/xen/xenbus/xenbusb.c new file mode 100644 index 0000000..49facb6 --- /dev/null +++ b/sys/xen/xenbus/xenbusb.c @@ -0,0 +1,878 @@ +/****************************************************************************** + * Copyright (C) 2010 Spectra Logic Corporation + * Copyright (C) 2008 Doug Rabson + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005 XenSource Ltd + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * \file xenbusb.c + * + * \brief Shared support functions for managing the NewBus busses that contain + * Xen front and back end device instances. + * + * The NewBus implementation of XenBus attaches a xenbusb_front and xenbusb_back + * child bus to the xenstore device. This strategy allows the small differences + * in the handling of XenBus operations for front and back devices to be handled + * as overrides in xenbusb_front/back.c. Front and back specific device + * classes are also provided so device drivers can register for the devices they + * can handle without the need to filter within their probe routines. The + * net result is a device hierarchy that might look like this: + * + * xenstore0/ + * xenbusb_front0/ + * xn0 + * xbd0 + * xbd1 + * xenbusb_back0/ + * xbbd0 + * xnb0 + * xnb1 + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/sbuf.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/sx.h> +#include <sys/taskqueue.h> + +#include <machine/xen/xen-os.h> +#include <machine/stdarg.h> + +#include <xen/gnttab.h> +#include <xen/xenstore/xenstorevar.h> +#include <xen/xenbus/xenbusb.h> +#include <xen/xenbus/xenbusvar.h> + +/*------------------------- Private Functions --------------------------------*/ +/** + * \brief Deallocate XenBus device instance variables. + * + * \param ivars The instance variable block to free. 
+ */ +static void +xenbusb_free_child_ivars(struct xenbus_device_ivars *ivars) +{ + if (ivars->xd_otherend_watch.node != NULL) { + xs_unregister_watch(&ivars->xd_otherend_watch); + free(ivars->xd_otherend_watch.node, M_XENBUS); + ivars->xd_otherend_watch.node = NULL; + } + + if (ivars->xd_node != NULL) { + free(ivars->xd_node, M_XENBUS); + ivars->xd_node = NULL; + } + + if (ivars->xd_type != NULL) { + free(ivars->xd_type, M_XENBUS); + ivars->xd_type = NULL; + } + + if (ivars->xd_otherend_path != NULL) { + free(ivars->xd_otherend_path, M_XENBUS); + ivars->xd_otherend_path = NULL; + } + + free(ivars, M_XENBUS); +} + +/** + * XenBus watch callback registered against the "state" XenStore + * node of the other-end of a split device connection. + * + * This callback is invoked whenever the state of a device instance's + * peer changes. + * + * \param watch The xs_watch object used to register this callback + * function. + * \param vec An array of pointers to NUL terminated strings containing + * watch event data. The vector should be indexed via the + * xs_watch_type enum in xs_wire.h. + * \param vec_size The number of elements in vec. + * + * \return The device_t of the found device if any, or NULL. + * + * \note device_t is a pointer type, so it can be compared against + * NULL for validity. + */ +static void +xenbusb_otherend_changed(struct xs_watch *watch, const char **vec, + unsigned int vec_size __unused) +{ + struct xenbus_device_ivars *ivars; + device_t dev; + enum xenbus_state newstate; + + ivars = (struct xenbus_device_ivars *) watch; + dev = ivars->xd_dev; + + if (!ivars->xd_otherend_path + || strncmp(ivars->xd_otherend_path, vec[XS_WATCH_PATH], + strlen(ivars->xd_otherend_path))) + return; + + newstate = xenbus_read_driver_state(ivars->xd_otherend_path); + XENBUS_OTHEREND_CHANGED(dev, newstate); +} + +/** + * Search our internal record of configured devices (not the XenStore) + * to determine if the XenBus device indicated by \a node is known to + * the system. + * + * \param dev The XenBus bus instance to search for device children. + * \param node The XenStore node path for the device to find. + * + * \return The device_t of the found device if any, or NULL. + * + * \note device_t is a pointer type, so it can be compared against + * NULL for validity. + */ +static device_t +xenbusb_device_exists(device_t dev, const char *node) +{ + device_t *kids; + device_t result; + struct xenbus_device_ivars *ivars; + int i, count; + + if (device_get_children(dev, &kids, &count)) + return (FALSE); + + result = NULL; + for (i = 0; i < count; i++) { + ivars = device_get_ivars(kids[i]); + if (!strcmp(ivars->xd_node, node)) { + result = kids[i]; + break; + } + } + free(kids, M_TEMP); + + return (result); +} + +static void +xenbusb_delete_child(device_t dev, device_t child) +{ + struct xenbus_device_ivars *ivars; + + ivars = device_get_ivars(child); + + /* + * We no longer care about the otherend of the + * connection. Cancel the watch now so that we + * don't try to handle an event for a partially + * detached child. + */ + if (ivars->xd_otherend_watch.node != NULL) + xs_unregister_watch(&ivars->xd_otherend_watch); + + device_delete_child(dev, child); + xenbusb_free_child_ivars(ivars); +} + +/** + * \param dev The NewBus device representing this XenBus bus. + * \param child The NewBus device representing a child of dev%'s XenBus bus. 
+ */
+static void
+xenbusb_verify_device(device_t dev, device_t child)
+{
+        if (xs_exists(XST_NIL, xenbus_get_node(child), "") == 0) {
+
+                /*
+                 * Device tree has been removed from Xenbus.
+                 * Tear down the device.
+                 */
+                xenbusb_delete_child(dev, child);
+        }
+}
+
+/**
+ * \brief Enumerate the devices on a XenBus bus and register them with
+ *        the NewBus device tree.
+ *
+ * xenbusb_enumerate_bus() will create entries (in state DS_NOTPRESENT)
+ * for nodes that appear in the XenStore, but will not invoke probe/attach
+ * operations on drivers.  Probe/Attach processing must be separately
+ * performed via an invocation of xenbusb_probe_children().  This is usually
+ * done via the xbs_probe_children task.
+ *
+ * \param xbs  XenBus Bus device softc of the owner of the bus to enumerate.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xenbusb_enumerate_bus(struct xenbusb_softc *xbs)
+{
+        const char **types;
+        u_int type_idx;
+        u_int type_count;
+        int error;
+
+        error = xs_directory(XST_NIL, xbs->xbs_node, "", &type_count, &types);
+        if (error)
+                return (error);
+
+        for (type_idx = 0; type_idx < type_count; type_idx++)
+                XENBUSB_ENUMERATE_TYPE(xbs->xbs_dev, types[type_idx]);
+
+        free(types, M_XENSTORE);
+
+        return (0);
+}
+
+/**
+ * Handler for all generic XenBus device sysctl nodes.
+ */
+static int
+xenbusb_device_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+        device_t dev;
+        const char *value;
+
+        dev = (device_t)arg1;
+        switch (arg2) {
+        case XENBUS_IVAR_NODE:
+                value = xenbus_get_node(dev);
+                break;
+        case XENBUS_IVAR_TYPE:
+                value = xenbus_get_type(dev);
+                break;
+        case XENBUS_IVAR_STATE:
+                value = xenbus_strstate(xenbus_get_state(dev));
+                break;
+        case XENBUS_IVAR_OTHEREND_ID:
+                return (sysctl_handle_int(oidp, NULL,
+                    xenbus_get_otherend_id(dev),
+                    req));
+                /* NOTREACHED */
+        case XENBUS_IVAR_OTHEREND_PATH:
+                value = xenbus_get_otherend_path(dev);
+                break;
+        default:
+                return (EINVAL);
+        }
+        return (SYSCTL_OUT(req, value, strlen(value)));
+}
+
+/**
+ * Create read-only sysctl nodes for xenbusb device ivar data.
+ *
+ * \param dev  The XenBus device instance to register with sysctl.
+ */
+static void
+xenbusb_device_sysctl_init(device_t dev)
+{
+        struct sysctl_ctx_list *ctx;
+        struct sysctl_oid *tree;
+
+        ctx  = device_get_sysctl_ctx(dev);
+        tree = device_get_sysctl_tree(dev);
+
+        SYSCTL_ADD_PROC(ctx,
+            SYSCTL_CHILDREN(tree),
+            OID_AUTO,
+            "xenstore_path",
+            CTLFLAG_RD,
+            dev,
+            XENBUS_IVAR_NODE,
+            xenbusb_device_sysctl_handler,
+            "A",
+            "XenStore path to device");
+
+        SYSCTL_ADD_PROC(ctx,
+            SYSCTL_CHILDREN(tree),
+            OID_AUTO,
+            "xenbus_dev_type",
+            CTLFLAG_RD,
+            dev,
+            XENBUS_IVAR_TYPE,
+            xenbusb_device_sysctl_handler,
+            "A",
+            "XenBus device type");
+
+        SYSCTL_ADD_PROC(ctx,
+            SYSCTL_CHILDREN(tree),
+            OID_AUTO,
+            "xenbus_connection_state",
+            CTLFLAG_RD,
+            dev,
+            XENBUS_IVAR_STATE,
+            xenbusb_device_sysctl_handler,
+            "A",
+            "XenBus state of peer connection");
+
+        SYSCTL_ADD_PROC(ctx,
+            SYSCTL_CHILDREN(tree),
+            OID_AUTO,
+            "xenbus_peer_domid",
+            CTLFLAG_RD,
+            dev,
+            XENBUS_IVAR_OTHEREND_ID,
+            xenbusb_device_sysctl_handler,
+            "I",
+            "Xen domain ID of peer");
+
+        SYSCTL_ADD_PROC(ctx,
+            SYSCTL_CHILDREN(tree),
+            OID_AUTO,
+            "xenstore_peer_path",
+            CTLFLAG_RD,
+            dev,
+            XENBUS_IVAR_OTHEREND_PATH,
+            xenbusb_device_sysctl_handler,
+            "A",
+            "XenStore path to peer device");
+}
+
+/**
+ * \brief Verify the existence of attached device instances and perform
+ *        probe/attach processing for newly arrived devices.
+ *
+ * \param dev  The NewBus device representing this XenBus bus.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xenbusb_probe_children(device_t dev)
+{
+        device_t *kids;
+        struct xenbus_device_ivars *ivars;
+        int i, count;
+
+        if (device_get_children(dev, &kids, &count) == 0) {
+                for (i = 0; i < count; i++) {
+                        if (device_get_state(kids[i]) != DS_NOTPRESENT) {
+                                /*
+                                 * We already know about this one.
+                                 * Make sure it's still here.
+                                 */
+                                xenbusb_verify_device(dev, kids[i]);
+                                continue;
+                        }
+
+                        if (device_probe_and_attach(kids[i])) {
+                                /*
+                                 * Transition device to the closed state
+                                 * so the world knows that attachment will
+                                 * not occur.
+                                 */
+                                xenbus_set_state(kids[i], XenbusStateClosed);
+
+                                /*
+                                 * Remove our record of this device.
+                                 * So long as it remains in the closed
+                                 * state in the XenStore, we will not find
+                                 * it again.  The state will only change
+                                 * if the control domain actively reconfigures
+                                 * this device.
+                                 */
+                                xenbusb_delete_child(dev, kids[i]);
+
+                                continue;
+                        }
+                        /*
+                         * Augment default newbus provided dynamic sysctl
+                         * variables with the standard ivar contents of
+                         * XenBus devices.
+                         */
+                        xenbusb_device_sysctl_init(kids[i]);
+
+                        /*
+                         * Now that we have a driver managing this device
+                         * that can receive otherend state change events,
+                         * hook up a watch for them.
+                         */
+                        ivars = device_get_ivars(kids[i]);
+                        xs_register_watch(&ivars->xd_otherend_watch);
+                }
+                free(kids, M_TEMP);
+        }
+
+        return (0);
+}
+
+/**
+ * \brief Task callback function to perform XenBus probe operations
+ *        from a known safe context.
+ *
+ * \param arg      The NewBus device_t representing the bus instance
+ *                 on which to perform probe processing.
+ * \param pending  The number of times this task was queued before it could
+ *                 be run.
+ */
+static void
+xenbusb_probe_children_cb(void *arg, int pending __unused)
+{
+        device_t dev = (device_t)arg;
+
+        /*
+         * Hold Giant until the Giant free newbus changes are committed.
+         */
+        mtx_lock(&Giant);
+        xenbusb_probe_children(dev);
+        mtx_unlock(&Giant);
+}
+
+/**
+ * \brief XenStore watch callback for the root node of the XenStore
+ *        subtree representing a XenBus.
+ *
+ * This callback performs, or delegates to the xbs_probe_children task,
+ * all processing necessary to handle dynamic device arrival and departure
+ * events from a XenBus.
+ *
+ * \param watch  The XenStore watch object associated with this callback.
+ * \param vec    The XenStore watch event data.
+ * \param len    The number of fields in the event data stream.
+ */
+static void
+xenbusb_devices_changed(struct xs_watch *watch, const char **vec,
+                        unsigned int len)
+{
+        struct xenbusb_softc *xbs;
+        device_t dev;
+        char *node;
+        char *bus;
+        char *type;
+        char *id;
+        char *p;
+        u_int component;
+
+        xbs = (struct xenbusb_softc *)watch;
+        dev = xbs->xbs_dev;
+
+        if (len <= XS_WATCH_PATH) {
+                device_printf(dev, "xenbusb_devices_changed: "
+                    "Short Event Data.\n");
+                return;
+        }
+
+        node = strdup(vec[XS_WATCH_PATH], M_XENBUS);
+        p = strchr(node, '/');
+        if (p == NULL)
+                goto out;
+        bus = node;
+        *p = 0;
+        type = p + 1;
+
+        p = strchr(type, '/');
+        if (p == NULL)
+                goto out;
+        *p++ = 0;
+
+        /*
+         * Extract the device ID.  A device ID has one or more path
+         * components separated by the '/' character.
+         *
+         * e.g. "<frontend vm id>/<frontend dev id>" for backend devices.
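 *
 * Worked example (values illustrative): for a backend bus attached
 * with xbs_id_components == 2, the watch path
 * "backend/vbd/2/51713/state" decomposes as
 *
 *	bus  = "backend"
 *	type = "vbd"
 *	id   = "2/51713"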
+ */
+        id = p;
+        for (component = 0; component < xbs->xbs_id_components; component++) {
+                p = strchr(p, '/');
+                if (p == NULL)
+                        break;
+                p++;
+        }
+        if (p != NULL)
+                *p = 0;
+
+        if (*id != 0 && component >= xbs->xbs_id_components - 1) {
+                xenbusb_add_device(xbs->xbs_dev, type, id);
+                taskqueue_enqueue(taskqueue_thread, &xbs->xbs_probe_children);
+        }
+out:
+        free(node, M_XENBUS);
+}
+
+/**
+ * \brief Interrupt configuration hook callback associated with xbs_attach_ch.
+ *
+ * Since interrupts are always functional at the time of XenBus configuration,
+ * there is nothing to be done when the callback occurs.  This hook is only
+ * registered to hold up boot processing while XenBus devices come online.
+ *
+ * \param arg  Unused configuration hook callback argument.
+ */
+static void
+xenbusb_nop_confighook_cb(void *arg __unused)
+{
+}
+
+/**
+ * \brief Decrement the number of XenBus child devices in the
+ *        connecting state by one and release the xbs_attach_ch
+ *        interrupt configuration hook if the connecting count
+ *        drops to zero.
+ *
+ * \param xbs  XenBus Bus device softc of the owner of the bus.
+ */
+static void
+xenbusb_release_confighook(struct xenbusb_softc *xbs)
+{
+        mtx_lock(&xbs->xbs_lock);
+        KASSERT(xbs->xbs_connecting_children > 0,
+            ("Connecting device count error\n"));
+        xbs->xbs_connecting_children--;
+        if (xbs->xbs_connecting_children == 0
+         && (xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) {
+                xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE;
+                mtx_unlock(&xbs->xbs_lock);
+                config_intrhook_disestablish(&xbs->xbs_attach_ch);
+        } else {
+                mtx_unlock(&xbs->xbs_lock);
+        }
+}
+
+/*--------------------------- Public Functions -------------------------------*/
+/*--------- API comments for these methods can be found in xenbusb.h ---------*/
+void
+xenbusb_identify(driver_t *driver __unused, device_t parent)
+{
+        /*
+         * A single instance of each bus type for which we have a driver
+         * is always present in a system operating under Xen.
+         */
+        BUS_ADD_CHILD(parent, 0, driver->name, 0);
+}
+
+int
+xenbusb_add_device(device_t dev, const char *type, const char *id)
+{
+        struct xenbusb_softc *xbs;
+        struct sbuf *devpath_sbuf;
+        char *devpath;
+        struct xenbus_device_ivars *ivars;
+        int error;
+
+        xbs = device_get_softc(dev);
+        devpath_sbuf = sbuf_new_auto();
+        sbuf_printf(devpath_sbuf, "%s/%s/%s", xbs->xbs_node, type, id);
+        sbuf_finish(devpath_sbuf);
+        devpath = sbuf_data(devpath_sbuf);
+
+        ivars = malloc(sizeof(*ivars), M_XENBUS, M_ZERO|M_WAITOK);
+        error = ENXIO;
+
+        if (xs_exists(XST_NIL, devpath, "") != 0) {
+                device_t child;
+                enum xenbus_state state;
+                char *statepath;
+
+                child = xenbusb_device_exists(dev, devpath);
+                if (child != NULL) {
+                        /*
+                         * We are already tracking this node
+                         */
+                        error = 0;
+                        goto out;
+                }
+
+                state = xenbus_read_driver_state(devpath);
+                if (state != XenbusStateInitialising) {
+                        /*
+                         * Device is not new, so ignore it.  This can
+                         * happen if a device is going away after
+                         * switching to Closed.
+                         */
+                        printf("xenbusb_add_device: Device %s ignored. 
" + "State %d\n", devpath, state); + error = 0; + goto out; + } + + sx_init(&ivars->xd_lock, "xdlock"); + ivars->xd_flags = XDF_CONNECTING; + ivars->xd_node = strdup(devpath, M_XENBUS); + ivars->xd_type = strdup(type, M_XENBUS); + ivars->xd_state = XenbusStateInitialising; + + error = XENBUSB_GET_OTHEREND_NODE(dev, ivars); + if (error) { + printf("xenbus_update_device: %s no otherend id\n", + devpath); + goto out; + } + + statepath = malloc(strlen(ivars->xd_otherend_path) + + strlen("/state") + 1, M_XENBUS, M_WAITOK); + sprintf(statepath, "%s/state", ivars->xd_otherend_path); + + ivars->xd_otherend_watch.node = statepath; + ivars->xd_otherend_watch.callback = xenbusb_otherend_changed; + + mtx_lock(&xbs->xbs_lock); + xbs->xbs_connecting_children++; + mtx_unlock(&xbs->xbs_lock); + + child = device_add_child(dev, NULL, -1); + ivars->xd_dev = child; + device_set_ivars(child, ivars); + } + +out: + sbuf_delete(devpath_sbuf); + if (error != 0) + xenbusb_free_child_ivars(ivars); + + return (error); +} + +int +xenbusb_attach(device_t dev, char *bus_node, u_int id_components) +{ + struct xenbusb_softc *xbs; + + xbs = device_get_softc(dev); + mtx_init(&xbs->xbs_lock, "xenbusb softc lock", NULL, MTX_DEF); + xbs->xbs_node = bus_node; + xbs->xbs_id_components = id_components; + xbs->xbs_dev = dev; + + /* + * Since XenBus busses are attached to the XenStore, and + * the XenStore does not probe children until after interrupt + * services are available, this config hook is used solely + * to ensure that the remainder of the boot process (e.g. + * mount root) is deferred until child devices are adequately + * probed. We unblock the boot process as soon as the + * connecting child count in our softc goes to 0. + */ + xbs->xbs_attach_ch.ich_func = xenbusb_nop_confighook_cb; + xbs->xbs_attach_ch.ich_arg = dev; + config_intrhook_establish(&xbs->xbs_attach_ch); + xbs->xbs_flags |= XBS_ATTACH_CH_ACTIVE; + xbs->xbs_connecting_children = 1; + + /* + * The subtree for this bus type may not yet exist + * causing initial enumeration to fail. We still + * want to return success from our attach though + * so that we are ready to handle devices for this + * bus when they are dynamically attached to us + * by a Xen management action. + */ + (void)xenbusb_enumerate_bus(xbs); + xenbusb_probe_children(dev); + + xbs->xbs_device_watch.node = bus_node; + xbs->xbs_device_watch.callback = xenbusb_devices_changed; + + TASK_INIT(&xbs->xbs_probe_children, 0, xenbusb_probe_children_cb, dev); + + xs_register_watch(&xbs->xbs_device_watch); + + xenbusb_release_confighook(xbs); + + return (0); +} + +int +xenbusb_resume(device_t dev) +{ + device_t *kids; + struct xenbus_device_ivars *ivars; + int i, count, error; + char *statepath; + + /* + * We must re-examine each device and find the new path for + * its backend. + */ + if (device_get_children(dev, &kids, &count) == 0) { + for (i = 0; i < count; i++) { + if (device_get_state(kids[i]) == DS_NOTPRESENT) + continue; + + ivars = device_get_ivars(kids[i]); + + xs_unregister_watch(&ivars->xd_otherend_watch); + ivars->xd_state = XenbusStateInitialising; + + /* + * Find the new backend details and + * re-register our watch. 
+ */ + error = XENBUSB_GET_OTHEREND_NODE(dev, ivars); + if (error) + return (error); + + DEVICE_RESUME(kids[i]); + + statepath = malloc(strlen(ivars->xd_otherend_path) + + strlen("/state") + 1, M_XENBUS, M_WAITOK); + sprintf(statepath, "%s/state", ivars->xd_otherend_path); + + free(ivars->xd_otherend_watch.node, M_XENBUS); + ivars->xd_otherend_watch.node = statepath; + xs_register_watch(&ivars->xd_otherend_watch); + +#if 0 + /* + * Can't do this yet since we are running in + * the xenwatch thread and if we sleep here, + * we will stop delivering watch notifications + * and the device will never come back online. + */ + sx_xlock(&ivars->xd_lock); + while (ivars->xd_state != XenbusStateClosed + && ivars->xd_state != XenbusStateConnected) + sx_sleep(&ivars->xd_state, &ivars->xd_lock, + 0, "xdresume", 0); + sx_xunlock(&ivars->xd_lock); +#endif + } + free(kids, M_TEMP); + } + + return (0); +} + +int +xenbusb_print_child(device_t dev, device_t child) +{ + struct xenbus_device_ivars *ivars = device_get_ivars(child); + int retval = 0; + + retval += bus_print_child_header(dev, child); + retval += printf(" at %s", ivars->xd_node); + retval += bus_print_child_footer(dev, child); + + return (retval); +} + +int +xenbusb_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) +{ + struct xenbus_device_ivars *ivars = device_get_ivars(child); + + switch (index) { + case XENBUS_IVAR_NODE: + *result = (uintptr_t) ivars->xd_node; + return (0); + + case XENBUS_IVAR_TYPE: + *result = (uintptr_t) ivars->xd_type; + return (0); + + case XENBUS_IVAR_STATE: + *result = (uintptr_t) ivars->xd_state; + return (0); + + case XENBUS_IVAR_OTHEREND_ID: + *result = (uintptr_t) ivars->xd_otherend_id; + return (0); + + case XENBUS_IVAR_OTHEREND_PATH: + *result = (uintptr_t) ivars->xd_otherend_path; + return (0); + } + + return (ENOENT); +} + +int +xenbusb_write_ivar(device_t dev, device_t child, int index, uintptr_t value) +{ + struct xenbus_device_ivars *ivars = device_get_ivars(child); + enum xenbus_state newstate; + int currstate; + + switch (index) { + case XENBUS_IVAR_STATE: + { + int error; + + newstate = (enum xenbus_state) value; + sx_xlock(&ivars->xd_lock); + if (ivars->xd_state == newstate) { + error = 0; + goto out; + } + + error = xs_scanf(XST_NIL, ivars->xd_node, "state", + NULL, "%d", &currstate); + if (error) + goto out; + + do { + error = xs_printf(XST_NIL, ivars->xd_node, "state", + "%d", newstate); + } while (error == EAGAIN); + if (error) { + /* + * Avoid looping through xenbus_dev_fatal() + * which calls xenbus_write_ivar to set the + * state to closing. + */ + if (newstate != XenbusStateClosing) + xenbus_dev_fatal(dev, error, + "writing new state"); + goto out; + } + ivars->xd_state = newstate; + + if ((ivars->xd_flags & XDF_CONNECTING) != 0 + && (newstate == XenbusStateClosed + || newstate == XenbusStateConnected)) { + struct xenbusb_softc *xbs; + + ivars->xd_flags &= ~XDF_CONNECTING; + xbs = device_get_softc(dev); + xenbusb_release_confighook(xbs); + } + + wakeup(&ivars->xd_state); + out: + sx_xunlock(&ivars->xd_lock); + return (error); + } + + case XENBUS_IVAR_NODE: + case XENBUS_IVAR_TYPE: + case XENBUS_IVAR_OTHEREND_ID: + case XENBUS_IVAR_OTHEREND_PATH: + /* + * These variables are read-only. 
+ */ + return (EINVAL); + } + + return (ENOENT); +} diff --git a/sys/xen/xenbus/xenbusb.h b/sys/xen/xenbus/xenbusb.h new file mode 100644 index 0000000..75abb98 --- /dev/null +++ b/sys/xen/xenbus/xenbusb.h @@ -0,0 +1,272 @@ +/*- + * Core definitions and data structures shareable across OS platforms. + * + * Copyright (c) 2010 Spectra Logic Corporation + * Copyright (C) 2008 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * $FreeBSD$ + */ +#ifndef _XEN_XENBUS_XENBUSB_H +#define _XEN_XENBUS_XENBUSB_H + +/** + * \file xenbusb.h + * + * Datastructures and function declarations for use in implementing + * bus attachements (e.g. frontend and backend device busses) for XenBus. + */ +#include "xenbusb_if.h" + +/** + * Enumeration of state flag values for the xbs_flags field of + * the xenbusb_softc structure. + */ +typedef enum { + /** */ + XBS_ATTACH_CH_ACTIVE = 0x01 +} xenbusb_softc_flag; + +/** + * \brief Container for all state needed to manage a Xenbus Bus + * attachment. + */ +struct xenbusb_softc { + /** + * XenStore watch used to monitor the subtree of the + * XenStore where devices for this bus attachment arrive + * and depart. + * + * \note This field must be the first in the softc structure + * so that a simple cast can be used to retrieve the + * softc from within a XenStore watch event callback. + */ + struct xs_watch xbs_device_watch; + + /** Mutex used to protect fields of the xenbusb_softc. */ + struct mtx xbs_lock; + + /** State flags. */ + xenbusb_softc_flag xbs_flags; + + /** + * A dedicated task for processing child arrival and + * departure events. + */ + struct task xbs_probe_children; + + /** + * Config Hook used to block boot processing until + * XenBus devices complete their connection processing + * with other VMs. + */ + struct intr_config_hook xbs_attach_ch; + + /** + * The number of children for this bus that are still + * in the connecting (to other VMs) state. This variable + * is used to determine when to release xbs_attach_ch. + */ + u_int xbs_connecting_children; + + /** The NewBus device_t for this bus attachment. 
*/
+ device_t xbs_dev;
+
+ /**
+ * The VM relative path to the XenStore subtree this
+ * bus attachment manages.
+ */
+ const char *xbs_node;
+
+ /**
+ * The number of path components (strings separated by the '/'
+ * character) that make up the device ID on this bus.
+ */
+ u_int xbs_id_components;
+};
+
+/**
+ * Enumeration of state flag values for the xd_flags field of
+ * the xenbus_device_ivars structure.
+ */
+typedef enum {
+
+ /**
+ * This device is contributing to the xbs_connecting_children
+ * count of its parent bus.
+ */
+ XDF_CONNECTING = 0x01
+} xenbus_dev_flag;
+
+/** Instance variables for devices on a XenBus bus. */
+struct xenbus_device_ivars {
+ /**
+ * XenStore watch used to monitor the subtree of the
+ * XenStore where information about the other end of
+ * the split Xen device this device instance represents
+ * is published.
+ *
+ * \note This field must be the first in the instance
+ * variable structure so that a simple cast can be
+ * used to retrieve ivar data from within a XenStore
+ * watch event callback.
+ */
+ struct xs_watch xd_otherend_watch;
+
+ /** Sleepable lock used to protect instance data. */
+ struct sx xd_lock;
+
+ /** State flags. */
+ xenbus_dev_flag xd_flags;
+
+ /** The NewBus device_t for this XenBus device instance. */
+ device_t xd_dev;
+
+ /**
+ * The VM relative path to the XenStore subtree representing
+ * this VM's half of this device.
+ */
+ char *xd_node;
+
+ /** XenBus device type ("vbd", "vif", etc.). */
+ char *xd_type;
+
+ /**
+ * Cached version of the <xd_node>/state node in the XenStore.
+ */
+ enum xenbus_state xd_state;
+
+ /** The VM identifier of the other end of this split device. */
+ int xd_otherend_id;
+
+ /**
+ * The path to the subtree of the XenStore where information
+ * about the other end of this split device instance is stored.
+ */
+ char *xd_otherend_path;
+};
+
+/**
+ * \brief Identify instances of this device type in the system.
+ *
+ * \param driver The driver performing this identify action.
+ * \param parent The NewBus parent device for any devices this method adds.
+ */
+void xenbusb_identify(driver_t *driver, device_t parent);
+
+/**
+ * \brief Perform common XenBus bus attach processing.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param bus_node The XenStore path to the XenStore subtree for
+ * this XenBus bus.
+ * \param id_components The number of '/' separated path components that
+ * make up a unique device ID on this XenBus bus.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Initializes the softc for this bus, installs an interrupt driven
+ * configuration hook to block boot processing until XenBus devices fully
+ * configure, performs an initial probe/attach of the bus, and registers
+ * a XenStore watch so we are notified when the bus topology changes.
+ */
+int xenbusb_attach(device_t dev, char *bus_node, u_int id_components);
+
+/**
+ * \brief Perform common XenBus bus resume handling.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xenbusb_resume(device_t dev);
+
+/**
+ * \brief Pretty-prints information about a child of a XenBus bus.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param child The NewBus device representing a child of dev%'s XenBus bus.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */ +int xenbusb_print_child(device_t dev, device_t child); + +/** + * \brief Common XenBus child instance variable read access method. + * + * \param dev The NewBus device representing this XenBus bus. + * \param child The NewBus device representing a child of dev%'s XenBus bus. + * \param index The index of the instance variable to access. + * \param result The value of the instance variable accessed. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xenbusb_read_ivar(device_t dev, device_t child, int index, + uintptr_t *result); + +/** + * \brief Common XenBus child instance variable write access method. + * + * \param dev The NewBus device representing this XenBus bus. + * \param child The NewBus device representing a child of dev%'s XenBus bus. + * \param index The index of the instance variable to access. + * \param value The new value to set in the instance variable accessed. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xenbusb_write_ivar(device_t dev, device_t child, int index, + uintptr_t value); + +/** + * \brief Attempt to add a XenBus device instance to this XenBus bus. + * + * \param dev The NewBus device representing this XenBus bus. + * \param type The device type being added (e.g. "vbd", "vif"). + * \param id The device ID for this device. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. Failure indicates that either the + * path to this device no longer exists or insufficient + * information exists in the XenStore to create a new + * device. + * + * If successful, this routine will add a device_t with instance + * variable storage to the NewBus device topology. Probe/Attach + * processing is not performed by this routine, but must be scheduled + * via the xbs_probe_children task. This separation of responsibilities + * is required to avoid hanging up the XenStore event delivery thread + * with our probe/attach work in the event a device is added via + * a callback from the XenStore. + */ +int xenbusb_add_device(device_t dev, const char *type, const char *id); + +#endif /* _XEN_XENBUS_XENBUSB_H */ diff --git a/sys/xen/xenbus/xenbusb_back.c b/sys/xen/xenbus/xenbusb_back.c new file mode 100644 index 0000000..32bbc04 --- /dev/null +++ b/sys/xen/xenbus/xenbusb_back.c @@ -0,0 +1,295 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have. + * + * Copyright (C) 2009, 2010 Spectra Logic Corporation + * Copyright (C) 2008 Doug Rabson + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005 XenSource Ltd + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb_back.c
+ *
+ * XenBus management of the NewBus bus containing the backend instances of
+ * Xen split devices.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenbus/xenbusb.h>
+
+
+/*------------------ Private Device Attachment Functions --------------------*/
+/**
+ * \brief Probe for the existence of the XenBus back bus.
+ *
+ * \param dev NewBus device_t for this XenBus back bus instance.
+ *
+ * \return Always returns 0 indicating success.
+ */
+static int
+xenbusb_back_probe(device_t dev)
+{
+ device_set_desc(dev, "Xen Backend Devices");
+
+ return (0);
+}
+
+/**
+ * \brief Attach the XenBus back bus.
+ *
+ * \param dev NewBus device_t for this XenBus back bus instance.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_back_attach(device_t dev)
+{
+ struct xenbusb_softc *xbs;
+ int error;
+
+ xbs = device_get_softc(dev);
+ error = xenbusb_attach(dev, "backend", /*id_components*/2);
+
+ /*
+ * Backend devices operate to serve other domains,
+ * so there is no need to hold up boot processing
+ * while connections to foreign domains are made.
+ */
+ mtx_lock(&xbs->xbs_lock);
+ if ((xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) {
+ xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE;
+ mtx_unlock(&xbs->xbs_lock);
+ config_intrhook_disestablish(&xbs->xbs_attach_ch);
+ } else {
+ mtx_unlock(&xbs->xbs_lock);
+ }
+
+ return (error);
+}
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param dev NewBus device_t for this XenBus backend bus instance.
+ * \param type String indicating the device sub-tree (e.g. "vfb", "vif")
+ * to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Devices that are found are entered into the NewBus hierarchy via
+ * xenbusb_add_device(). xenbusb_add_device() detects and ignores
+ * duplicate devices, so it can be called unconditionally
+ * for any device found in the XenStore.
+ * + * The backend XenStore hierarchy has the following format: + * + * backend/<device type>/<frontend vm id>/<device id> + * + */ +static int +xenbusb_back_enumerate_type(device_t dev, const char *type) +{ + struct xenbusb_softc *xbs; + const char **vms; + u_int vm_idx; + u_int vm_count; + int error; + + xbs = device_get_softc(dev); + error = xs_directory(XST_NIL, xbs->xbs_node, type, &vm_count, &vms); + if (error) + return (error); + for (vm_idx = 0; vm_idx < vm_count; vm_idx++) { + struct sbuf *vm_path; + const char *vm; + const char **devs; + u_int dev_idx; + u_int dev_count; + + vm = vms[vm_idx]; + + vm_path = xs_join(type, vm); + error = xs_directory(XST_NIL, xbs->xbs_node, sbuf_data(vm_path), + &dev_count, &devs); + sbuf_delete(vm_path); + if (error) + break; + + for (dev_idx = 0; dev_idx < dev_count; dev_idx++) { + const char *dev_num; + struct sbuf *id; + + dev_num = devs[dev_idx]; + id = xs_join(vm, dev_num); + xenbusb_add_device(dev, type, sbuf_data(id)); + sbuf_delete(id); + } + free(devs, M_XENSTORE); + } + + free(vms, M_XENSTORE); + + return (0); +} + +/** + * \brief Determine and store the XenStore path for the other end of + * a split device whose local end is represented by ivars. + * + * \param dev NewBus device_t for this XenBus backend bus instance. + * \param ivars Instance variables from the XenBus child device for + * which to perform this function. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + * + * If successful, the xd_otherend_path field of the child's instance + * variables will be updated. + * + */ +static int +xenbusb_back_get_otherend_node(device_t dev, struct xenbus_device_ivars *ivars) +{ + char *otherend_path; + int error; + + if (ivars->xd_otherend_path != NULL) { + free(ivars->xd_otherend_path, M_XENBUS); + ivars->xd_otherend_path = NULL; + } + + error = xs_gather(XST_NIL, ivars->xd_node, + "frontend-id", "%i", &ivars->xd_otherend_id, + "frontend", NULL, &otherend_path, + NULL); + + if (error == 0) { + ivars->xd_otherend_path = strdup(otherend_path, M_XENBUS); + free(otherend_path, M_XENSTORE); + } + return (error); +} + +/** + * \brief Backend XenBus child instance variable write access method. + * + * \param dev The NewBus device representing this XenBus bus. + * \param child The NewBus device representing a child of dev%'s XenBus bus. + * \param index The index of the instance variable to access. + * \param value The new value to set in the instance variable accessed. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + * + * Xenbus_back overrides this method so that it can trap state transitions + * of local backend devices and clean up their XenStore entries as necessary + * during device instance teardown. + */ +static int +xenbusb_back_write_ivar(device_t dev, device_t child, int index, + uintptr_t value) +{ + int error; + + error = xenbusb_write_ivar(dev, child, index, value); + + if (index == XENBUS_IVAR_STATE + && (enum xenbus_state)value == XenbusStateClosed + && xenbus_dev_is_online(child) == 0) { + + /* + * Cleanup the hotplug entry in the XenStore if + * present. The control domain expects any userland + * component associated with this device to destroy + * this node in order to signify it is safe to + * teardown the device. However, not all backends + * rely on userland components, and those that + * do should either use a communication channel + * other than the XenStore, or ensure the hotplug + * data is already cleaned up. 
+ * + * This removal ensures that no matter what path + * is taken to mark a back-end closed, the control + * domain will understand that it is closed. + */ + xs_rm(XST_NIL, xenbus_get_node(child), "hotplug-status"); + } + + return (error); +} + +/*-------------------- Private Device Attachment Data -----------------------*/ +static device_method_t xenbusb_back_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, xenbusb_identify), + DEVMETHOD(device_probe, xenbusb_back_probe), + DEVMETHOD(device_attach, xenbusb_back_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus Interface */ + DEVMETHOD(bus_print_child, xenbusb_print_child), + DEVMETHOD(bus_read_ivar, xenbusb_read_ivar), + DEVMETHOD(bus_write_ivar, xenbusb_back_write_ivar), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + + /* XenBus Bus Interface */ + DEVMETHOD(xenbusb_enumerate_type, xenbusb_back_enumerate_type), + DEVMETHOD(xenbusb_get_otherend_node, xenbusb_back_get_otherend_node), + { 0, 0 } +}; + +DEFINE_CLASS_0(xenbusb_back, xenbusb_back_driver, xenbusb_back_methods, + sizeof(struct xenbusb_softc)); +devclass_t xenbusb_back_devclass; + +DRIVER_MODULE(xenbusb_back, xenstore, xenbusb_back_driver, + xenbusb_back_devclass, 0, 0); diff --git a/sys/xen/xenbus/xenbusb_front.c b/sys/xen/xenbus/xenbusb_front.c new file mode 100644 index 0000000..0bc06a4 --- /dev/null +++ b/sys/xen/xenbus/xenbusb_front.c @@ -0,0 +1,195 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have. + * + * Copyright (C) 2009, 2010 Spectra Logic Corporation + * Copyright (C) 2008 Doug Rabson + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005 XenSource Ltd + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * \file xenbusb_front.c + * + * XenBus management of the NewBus bus containing the frontend instances of + * Xen split devices. 
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenbus/xenbusb.h>
+
+
+/*------------------ Private Device Attachment Functions --------------------*/
+/**
+ * \brief Probe for the existence of the XenBus front bus.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ *
+ * \return Always returns 0 indicating success.
+ */
+static int
+xenbusb_front_probe(device_t dev)
+{
+ device_set_desc(dev, "Xen Frontend Devices");
+
+ return (0);
+}
+
+/**
+ * \brief Attach the XenBus front bus.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_front_attach(device_t dev)
+{
+ return (xenbusb_attach(dev, "device", /*id_components*/1));
+}
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ * \param type String indicating the device sub-tree (e.g. "vfb", "vif")
+ * to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Devices that are found are entered into the NewBus hierarchy via
+ * xenbusb_add_device(). xenbusb_add_device() detects and ignores
+ * duplicate devices, so it can be called unconditionally
+ * for any device found in the XenStore.
+ */
+static int
+xenbusb_front_enumerate_type(device_t dev, const char *type)
+{
+ struct xenbusb_softc *xbs;
+ const char **dir;
+ unsigned int i, count;
+ int error;
+
+ xbs = device_get_softc(dev);
+ error = xs_directory(XST_NIL, xbs->xbs_node, type, &count, &dir);
+ if (error)
+ return (error);
+ for (i = 0; i < count; i++)
+ xenbusb_add_device(dev, type, dir[i]);
+
+ free(dir, M_XENSTORE);
+
+ return (0);
+}
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ * a split device whose local end is represented by ivars.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables will be updated.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ * \param ivars Instance variables from the XenBus child device for
+ * which to perform this function.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */ +static int +xenbusb_front_get_otherend_node(device_t dev, struct xenbus_device_ivars *ivars) +{ + char *otherend_path; + int error; + + if (ivars->xd_otherend_path != NULL) { + free(ivars->xd_otherend_path, M_XENBUS); + ivars->xd_otherend_path = NULL; + } + + error = xs_gather(XST_NIL, ivars->xd_node, + "backend-id", "%i", &ivars->xd_otherend_id, + "backend", NULL, &otherend_path, + NULL); + + if (error == 0) { + ivars->xd_otherend_path = strdup(otherend_path, M_XENBUS); + free(otherend_path, M_XENSTORE); + } + return (error); +} + +/*-------------------- Private Device Attachment Data -----------------------*/ +static device_method_t xenbusb_front_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, xenbusb_identify), + DEVMETHOD(device_probe, xenbusb_front_probe), + DEVMETHOD(device_attach, xenbusb_front_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus Interface */ + DEVMETHOD(bus_print_child, xenbusb_print_child), + DEVMETHOD(bus_read_ivar, xenbusb_read_ivar), + DEVMETHOD(bus_write_ivar, xenbusb_write_ivar), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + + /* XenBus Bus Interface */ + DEVMETHOD(xenbusb_enumerate_type, xenbusb_front_enumerate_type), + DEVMETHOD(xenbusb_get_otherend_node, xenbusb_front_get_otherend_node), + { 0, 0 } +}; + +DEFINE_CLASS_0(xenbusb_front, xenbusb_front_driver, xenbusb_front_methods, + sizeof(struct xenbusb_softc)); +devclass_t xenbusb_front_devclass; + +DRIVER_MODULE(xenbusb_front, xenstore, xenbusb_front_driver, + xenbusb_front_devclass, 0, 0); diff --git a/sys/xen/xenbus/xenbusb_if.m b/sys/xen/xenbus/xenbusb_if.m new file mode 100644 index 0000000..a32e3f6 --- /dev/null +++ b/sys/xen/xenbus/xenbusb_if.m @@ -0,0 +1,78 @@ +#- +# Copyright (c) 2010 Spectra Logic Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions, and the following disclaimer, +# without modification. +# 2. Redistributions in binary form must reproduce at minimum a disclaimer +# substantially similar to the "NO WARRANTY" disclaimer below +# ("Disclaimer") and any redistribution must be conditioned upon +# including a substantially similar Disclaimer requirement for further +# binary redistribution. +# +# NO WARRANTY +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+# HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGES.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+HEADER {
+struct xenbus_device_ivars;
+}
+
+INTERFACE xenbusb;
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param _dev NewBus device_t for this XenBus (front/back) bus instance.
+ * \param _type String indicating the device sub-tree (e.g. "vfb", "vif")
+ * to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Devices that are found should be entered into the NewBus hierarchy via
+ * xenbusb_add_device(). xenbusb_add_device() detects and ignores
+ * duplicate devices, so it can be called unconditionally
+ * for any device found in the XenStore.
+ */
+METHOD int enumerate_type {
+ device_t _dev;
+ const char *_type;
+};
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ * a split device whose local end is represented by ivars.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables must be updated.
+ *
+ * \param _dev NewBus device_t for this XenBus (front/back) bus instance.
+ * \param _ivars Instance variables from the XenBus child device for
+ * which to perform this function.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+METHOD int get_otherend_node {
+ device_t _dev;
+ struct xenbus_device_ivars *_ivars;
+};
diff --git a/sys/xen/xenbus/xenbusvar.h b/sys/xen/xenbus/xenbusvar.h
index 6511664..55d7f29 100644
--- a/sys/xen/xenbus/xenbusvar.h
+++ b/sys/xen/xenbus/xenbusvar.h
@@ -1,8 +1,4 @@
 /******************************************************************************
- * xenbus.h
- *
- * Talks to Xen Store to figure out what devices we have.
- *
 * Copyright (C) 2005 Rusty Russell, IBM Corporation
 * Copyright (C) 2005 XenSource Ltd.
 *
@@ -30,46 +26,64 @@
 * $FreeBSD$
 */

+/**
+ * \file xenbusvar.h
+ *
+ * \brief Data structures and function declarations for use by device
+ * drivers operating on the XenBus.
+ */
+
 #ifndef _XEN_XENBUS_XENBUSVAR_H
 #define _XEN_XENBUS_XENBUSVAR_H

 #include <sys/queue.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>

 #include <machine/xen/xen-os.h>
+
+#include <xen/interface/grant_table.h>
 #include <xen/interface/io/xenbus.h>
 #include <xen/interface/io/xs_wire.h>

+#include <xen/xenstore/xenstorevar.h>
+
 #include "xenbus_if.h"

+/* XenBus allocations including XenStore data returned to clients. */
+MALLOC_DECLARE(M_XENBUS);
+
 enum {
- /*
+ /**
 * Path of this device node.
 */
 XENBUS_IVAR_NODE,

- /*
+ /**
 * The device type (e.g. vif, vbd).
 */
 XENBUS_IVAR_TYPE,

- /*
+ /**
 * The state of this device (not the otherend's state).
 */
 XENBUS_IVAR_STATE,

- /*
+ /**
 * Domain ID of the other end device.
 */
 XENBUS_IVAR_OTHEREND_ID,

- /*
+ /**
 * Path of the other end device.
*/ XENBUS_IVAR_OTHEREND_PATH }; -/* +/** * Simplified accessors for xenbus devices */ #define XENBUS_ACCESSOR(var, ivar, type) \ @@ -81,179 +95,184 @@ XENBUS_ACCESSOR(state, STATE, enum xenbus_state) XENBUS_ACCESSOR(otherend_id, OTHEREND_ID, int) XENBUS_ACCESSOR(otherend_path, OTHEREND_PATH, const char *) -/* Register callback to watch this node. */ -struct xenbus_watch -{ - LIST_ENTRY(xenbus_watch) list; - - /* Path being watched. */ - char *node; - - /* Callback (executed in a process context with no locks held). */ - void (*callback)(struct xenbus_watch *, - const char **vec, unsigned int len); -}; - -typedef int (*xenstore_event_handler_t)(void *); - -struct xenbus_transaction -{ - uint32_t id; -}; - -#define XBT_NIL ((struct xenbus_transaction) { 0 }) - -int xenbus_directory(struct xenbus_transaction t, const char *dir, - const char *node, unsigned int *num, char ***result); -int xenbus_read(struct xenbus_transaction t, const char *dir, - const char *node, unsigned int *len, void **result); -int xenbus_write(struct xenbus_transaction t, const char *dir, - const char *node, const char *string); -int xenbus_mkdir(struct xenbus_transaction t, const char *dir, - const char *node); -int xenbus_exists(struct xenbus_transaction t, const char *dir, - const char *node); -int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node); -int xenbus_transaction_start(struct xenbus_transaction *t); -int xenbus_transaction_end(struct xenbus_transaction t, int abort); - -/* - * Single read and scanf: returns errno or zero. If scancountp is - * non-null, then number of items scanned is returned in *scanncountp. - */ -int xenbus_scanf(struct xenbus_transaction t, - const char *dir, const char *node, int *scancountp, const char *fmt, ...) - __attribute__((format(scanf, 5, 6))); - -/* Single printf and write: returns errno or 0. */ -int xenbus_printf(struct xenbus_transaction t, - const char *dir, const char *node, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); - -/* - * Generic read function: NULL-terminated triples of name, - * sprintf-style type string, and pointer. Returns 0 or errno. +/** + * Return the state of a XenBus device. + * + * \param path The root XenStore path for the device. + * + * \return The current state of the device or XenbusStateClosed if no + * state can be read. */ -int xenbus_gather(struct xenbus_transaction t, const char *dir, ...); - -/* notifer routines for when the xenstore comes up */ -int register_xenstore_notifier(xenstore_event_handler_t func, void *arg, int priority); -#if 0 -void unregister_xenstore_notifier(); -#endif -int register_xenbus_watch(struct xenbus_watch *watch); -void unregister_xenbus_watch(struct xenbus_watch *watch); -void xs_suspend(void); -void xs_resume(void); - -/* Used by xenbus_dev to borrow kernel's store connection. */ -int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result); - -#if 0 - -#define XENBUS_IS_ERR_READ(str) ({ \ - if (!IS_ERR(str) && strlen(str) == 0) { \ - free(str, M_DEVBUF); \ - str = ERR_PTR(-ERANGE); \ - } \ - IS_ERR(str); \ -}) - -#endif - -#define XENBUS_EXIST_ERR(err) ((err) == ENOENT || (err) == ERANGE) - +XenbusState xenbus_read_driver_state(const char *path); /** - * Register a watch on the given path, using the given xenbus_watch structure - * for storage, and the given callback function as the callback. Return 0 on - * success, or errno on error. On success, the given path will be saved as - * watch->node, and remains the caller's to free. 
On error, watch->node will
- * be NULL, the device will switch to XenbusStateClosing, and the error will
- * be saved in the store.
+ * Initialize and register a watch on the given path (client supplied storage).
+ *
+ * \param dev The XenBus device requesting the watch service.
+ * \param path The XenStore path of the object to be watched. The
+ * storage for this string must be stable for the lifetime
+ * of the watch.
+ * \param watch The watch object to use for this request. This object
+ * must be stable for the lifetime of the watch.
+ * \param callback The function to call when XenStore objects at or below
+ * path are modified.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note On error, the device 'dev' will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for dev in the XenStore.
 */
 int xenbus_watch_path(device_t dev, char *path,
- struct xenbus_watch *watch,
- void (*callback)(struct xenbus_watch *,
- const char **, unsigned int));
-
+ struct xs_watch *watch,
+ xs_watch_cb_t *callback);
 /**
- * Register a watch on the given path/path2, using the given xenbus_watch
- * structure for storage, and the given callback function as the callback.
- * Return 0 on success, or errno on error. On success, the watched path
- * (path/path2) will be saved as watch->node, and becomes the caller's to
- * kfree(). On error, watch->node will be NULL, so the caller has nothing to
- * free, the device will switch to XenbusStateClosing, and the error will be
- * saved in the store.
+ * Initialize and register a watch at path/path2 in the XenStore.
+ *
+ * \param dev The XenBus device requesting the watch service.
+ * \param path The base XenStore path of the object to be watched.
+ * \param path2 The tail XenStore path of the object to be watched.
+ * \param watch The watch object to use for this request. This object
+ * must be stable for the lifetime of the watch.
+ * \param callback The function to call when XenStore objects at or below
+ * path are modified.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
+ *
+ * Similar to xenbus_watch_path, however the storage for the path to the
+ * watched object is allocated from the heap and filled with "path '/' path2".
+ * Should a call to this function succeed, it is the caller's responsibility
+ * to free watch->node using the M_XENBUS malloc type.
 */
 int xenbus_watch_path2(device_t dev, const char *path,
- const char *path2, struct xenbus_watch *watch,
- void (*callback)(struct xenbus_watch *,
- const char **, unsigned int));
-
+ const char *path2, struct xs_watch *watch,
+ xs_watch_cb_t *callback);
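As a hedged usage sketch, not part of this change, a frontend driver might use xenbus_watch_path2() to watch its backend's state node; the softc layout and callback body below are assumptions for illustration, and per the comment above, watch->node must later be freed with the M_XENBUS malloc type:

struct example_softc {
	/* First member, so the callback can cast watch back to the softc. */
	struct xs_watch watch;
};

static void
example_backend_changed(struct xs_watch *watch, const char **vec,
    unsigned int len)
{
	struct example_softc *sc;

	sc = (struct example_softc *)watch;
	/* React here to the otherend state transition reported in vec. */
	(void)sc;
}

static int
example_register_watch(device_t dev, struct example_softc *sc)
{
	return (xenbus_watch_path2(dev, xenbus_get_otherend_path(dev),
	    "state", &sc->watch, example_backend_changed));
}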
 /**
- * Advertise in the store a change of the given driver to the given new_state.
- * which case this is performed inside its own transaction. Return 0 on
- * success, or errno on error. On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
+ * Grant access to the given ring_mfn to the peer of the given device.
+ *
+ * \param dev The device granting access to the ring page.
+ * \param ring_mfn The guest machine page number of the page to grant
+ * peer access rights.
+ * \param refp[out] The grant reference for the page.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * A successful call to xenbus_grant_ring should be paired with a call
+ * to gnttab_end_foreign_access() when foreign access to this page is no
+ * longer required.
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
 */
-int xenbus_switch_state(device_t dev,
- XenbusState new_state);
-
+int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp);
 /**
- * Grant access to the given ring_mfn to the peer of the given device.
- * Return 0 on success, or errno on error. On error, the device will
- * switch to XenbusStateClosing, and the error will be saved in the
- * store. The grant ring reference is returned in *refp.
+ * Allocate an event channel for the given XenBus device.
+ *
+ * \param dev The device for which to allocate the event channel.
+ * \param port[out] The port identifier for the allocated event channel.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * A successfully allocated event channel should be freed using
+ * xenbus_free_evtchn().
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
 */
-int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, int *refp);
-
+int xenbus_alloc_evtchn(device_t dev, evtchn_port_t *port);
 /**
- * Allocate an event channel for the given xenbus_device, assigning the newly
- * created local port to *port. Return 0 on success, or errno on error. On
- * error, the device will switch to XenbusStateClosing, and the error will be
- * saved in the store.
+ * Free an existing event channel.
+ *
+ * \param dev The device which allocated this event channel.
+ * \param port The port identifier for the event channel to free.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
 */
-int xenbus_alloc_evtchn(device_t dev, int *port);
-
+int xenbus_free_evtchn(device_t dev, evtchn_port_t port);
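A hedged sketch of how the two allocators above are typically paired in a frontend's connection path; this is editorial, not part of this change, and error unwinding of the grant via gnttab_end_foreign_access() is left to the caller, as the pairing note above describes:

static int
example_ring_init(device_t dev, unsigned long ring_mfn,
    grant_ref_t *ref, evtchn_port_t *port)
{
	int error;

	/* Offer the shared ring page to the peer domain. */
	error = xenbus_grant_ring(dev, ring_mfn, ref);
	if (error != 0)
		return (error);

	/* Allocate the event channel used to signal ring activity. */
	return (xenbus_alloc_evtchn(dev, port));
}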
 /**
- * Free an existing event channel. Returns 0 on success or errno on error.
+ * Record the given errno, along with the given, printf-style, formatted
+ * message in dev's device specific error node in the XenStore.
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string followed by a variable number of
+ * printf arguments.
 */
-int xenbus_free_evtchn(device_t dev, int port);
-
+void xenbus_dev_error(device_t dev, int err, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
 /**
- * Return the state of the driver rooted at the given store path, or
- * XenbusStateClosed if no state can be read.
+ * va_list version of xenbus_dev_error().
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string.
+ * \param ap Va_list of printf arguments.
 */
-XenbusState xenbus_read_driver_state(const char *path);
+void xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap)
+ __attribute__((format(printf, 3, 0)));

-
-/***
- * Report the given negative errno into the store, along with the given
- * formatted message.
+/**
+ * Equivalent to xenbus_dev_error(), followed by
+ * xenbus_set_state(dev, XenbusStateClosing).
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string followed by a variable number of
+ * printf arguments.
 */
-void xenbus_dev_error(device_t dev, int err, const char *fmt,
- ...);
-
+void xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));

-/***
- * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
- * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
- * closedown of this driver and its peer.
+/**
+ * va_list version of xenbus_dev_fatal().
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string.
+ * \param ap Va_list of printf arguments.
 */
-void xenbus_dev_fatal(device_t dev, int err, const char *fmt,
- ...);
-
-int xenbus_dev_init(void);
+void xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list ap)
+ __attribute__((format(printf, 3, 0)));

+/**
+ * Convert a member of the xenbus_state enum into an ASCII string.
+ *
+ * \param state The XenBus state to lookup.
+ *
+ * \return A string representing state or, for unrecognized states,
+ * the string "Unknown".
+ */
 const char *xenbus_strstate(enum xenbus_state state);
+
+/**
+ * Return the value of a XenBus device's "online" node within the XenStore.
+ *
+ * \param dev The XenBus device to query.
+ *
+ * \return The value of the "online" node for the device. If the node
+ * does not exist, 0 (offline) is returned.
+ */
 int xenbus_dev_is_online(device_t dev);
-int xenbus_frontend_closed(device_t dev);

 #endif /* _XEN_XENBUS_XENBUSVAR_H */
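To illustrate the reworked error reporting entry points declared above, here is a minimal editorial sketch, not part of this change, combining xenbus_dev_fatal() with the xenbus_get_state() accessor and xenbus_strstate() from this header; the message text is illustrative:

static void
example_report_fatal(device_t dev, int error)
{
	/*
	 * Records error in the device's XenStore error node and
	 * moves the device toward XenbusStateClosing.
	 */
	xenbus_dev_fatal(dev, error, "ring setup failed (state %s)",
	    xenbus_strstate(xenbus_get_state(dev)));
}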
diff --git a/sys/xen/xenstore/xenstore.c b/sys/xen/xenstore/xenstore.c
new file mode 100644
index 0000000..76dfb5a
--- /dev/null
+++ b/sys/xen/xenstore/xenstore.c
@@ -0,0 +1,1654 @@
+/******************************************************************************
+ * xenstore.c
+ *
+ * Low-level kernel interface to the XenStore.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2009,2010 Spectra Logic Corporation
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/syslog.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/hvm/params.h>
+
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenstore/xenstore_internal.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+/**
+ * \file xenstore.c
+ * \brief XenStore interface
+ *
+ * The XenStore interface is a simple storage system that is a means of
+ * communicating state and configuration data between the Xen Domain 0
+ * and the various guest domains. All configuration data, other than
+ * the small amount of essential information required during the early
+ * boot process of launching a Xen aware guest, is managed using the
+ * XenStore.
+ *
+ * The XenStore is ASCII string based, and has a structure and semantics
+ * similar to a filesystem. There are files and directories, and
+ * directories may contain files or other directories. The depth of the
+ * hierarchy is only limited by the XenStore's maximum path length.
+ *
+ * The communication channel between the XenStore service and other
+ * domains is via two guest specific ring buffers in a shared memory
+ * area. One ring buffer is used for communicating in each direction.
+ * The grant table references for this shared memory are given to the
+ * guest either via the xen_start_info structure for a fully para-
+ * virtualized guest, or via HVM hypercalls for a hardware virtualized
+ * guest.
+ *
+ * The XenStore communication relies on an event channel and thus
+ * interrupts. For this reason, the attachment of the XenStore
+ * relies on an interrupt driven configuration hook to hold off
+ * boot processing until communication with the XenStore service
+ * can be established.
+ *
+ * Several Xen services depend on the XenStore, most notably the
+ * XenBus used to discover and manage Xen devices. These services
+ * are implemented as NewBus child attachments to a bus exported
+ * by this XenStore driver.
+ */
+
+static struct xs_watch *find_watch(const char *token);
+
+MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results");
+
+/**
+ * Pointer to shared memory communication structures allowing us
+ * to communicate with the XenStore service.
+ *
+ * When operating in full PV mode, this pointer is set early in kernel
+ * startup from within xen_machdep.c. In HVM mode, we use hypercalls
+ * to get the guest frame number for the shared page and then map it
+ * into kva. See xs_init() for details.
+ */
+struct xenstore_domain_interface *xen_store;
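Given the filesystem-like semantics described in the comment above, a typical consumer reads a single leaf node below a device path. A minimal editorial sketch using xs_scanf(), whose calling convention appears elsewhere in this change; the path and node names are illustrative only:

static int
example_read_online(const char *devpath, int *online)
{
	/* E.g. devpath = "backend/vbd/1/51712" (illustrative). */
	return (xs_scanf(XST_NIL, devpath, "online", NULL, "%d", online));
}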
+
+/*-------------------------- Private Data Structures ------------------------*/
+
+/**
+ * Structure capturing messages received from the XenStore service.
+ */
+struct xs_stored_msg {
+ TAILQ_ENTRY(xs_stored_msg) list;
+
+ struct xsd_sockmsg hdr;
+
+ union {
+ /* Queued replies. */
+ struct {
+ char *body;
+ } reply;
+
+ /* Queued watch events. */
+ struct {
+ struct xs_watch *handle;
+ const char **vec;
+ u_int vec_size;
+ } watch;
+ } u;
+};
+TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg);
+
+/**
+ * Container for all XenStore related state.
+ */
+struct xs_softc {
+ /** Newbus device for the XenStore. */
+ device_t xs_dev;
+
+ /**
+ * Lock serializing access to ring producer/consumer
+ * indexes. Use of this lock guarantees that wakeups
+ * of blocking readers/writers are not missed due to
+ * races with the XenStore service.
+ */
+ struct mtx ring_lock;
+
+ /*
+ * Mutex used to ensure exclusive access to the outgoing
+ * communication ring. We use a lock type that can be
+ * held while sleeping so that xs_write() can block waiting
+ * for space in the ring to free up, without allowing another
+ * writer to come in and corrupt a partial message write.
+ */
+ struct sx request_mutex;
+
+ /**
+ * A list of replies to our requests.
+ *
+ * The reply list is filled by xs_rcv_thread(). It
+ * is consumed by the context that issued the request
+ * to which a reply is made. The requester blocks in
+ * xs_read_reply().
+ *
+ * \note Only one requesting context can be active at a time.
+ * This is guaranteed by the request_mutex and ensures
+ * that the requester sees replies matching the order
+ * of its requests.
+ */
+ struct xs_stored_msg_list reply_list;
+
+ /** Lock protecting the reply list. */
+ struct mtx reply_lock;
+
+ /**
+ * List of registered watches.
+ */
+ struct xs_watch_list registered_watches;
+
+ /** Lock protecting the registered watches list. */
+ struct mtx registered_watches_lock;
+
+ /**
+ * List of pending watch callback events.
+ */
+ struct xs_stored_msg_list watch_events;
+
+ /** Lock protecting the watch callback list. */
+ struct mtx watch_events_lock;
+
+ /**
+ * Sleepable lock used to prevent VM suspension while a
+ * xenstore transaction is outstanding.
+ *
+ * Each active transaction holds a shared lock on the
+ * suspend mutex. Our suspend method blocks waiting
+ * to acquire an exclusive lock. This guarantees that
+ * suspend processing will only proceed once all active
+ * transactions have been retired.
+ */
+ struct sx suspend_mutex;
+
+ /**
+ * The process ID of the xenwatch thread.
+ */
+ pid_t xenwatch_pid;
+
+ /**
+ * Sleepable mutex used to gate the execution of XenStore
+ * watch event callbacks.
+ *
+ * xenwatch_thread holds an exclusive lock on this mutex
+ * while delivering event callbacks, and xenstore_unregister_watch()
+ * uses an exclusive lock of this mutex to guarantee that no
+ * callbacks of the just unregistered watch are pending
+ * before returning to its caller.
+ */
+ struct sx xenwatch_mutex;
+
+#ifdef XENHVM
+ /**
+ * The HVM guest pseudo-physical frame number. This is Xen's mapping
+ * of the true machine frame number into our "physical address space".
+ */
+ unsigned long gpfn;
+#endif
+
+ /**
+ * The event channel for communicating with the
+ * XenStore service.
+ */
+ int evtchn;
+
+ /** Interrupt number for our event channel. */
+ u_int irq;
+
+ /**
+ * Interrupt driven config hook allowing us to defer
+ * attaching children until interrupts (and thus communication
+ * with the XenStore service) are available.
+ */ + struct intr_config_hook xs_attachcb; +}; + +/*-------------------------------- Global Data ------------------------------*/ +static struct xs_softc xs; + +/*------------------------- Private Utility Functions -----------------------*/ + +/** + * Count and optionally record pointers to a number of NUL terminated + * strings in a buffer. + * + * \param strings A pointer to a contiguous buffer of NUL terminated strings. + * \param dest An array to store pointers to each string found in strings. + * \param len The length of the buffer pointed to by strings. + * + * \return A count of the number of strings found. + */ +static u_int +extract_strings(const char *strings, const char **dest, u_int len) +{ + u_int num; + const char *p; + + for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) { + if (dest != NULL) + *dest++ = p; + num++; + } + + return (num); +} + +/** + * Convert a contiguous buffer containing a series of NUL terminated + * strings into an array of pointers to strings. + * + * The returned pointer references the array of string pointers which + * is followed by the storage for the string data. It is the client's + * responsibility to free this storage. + * + * The storage addressed by strings is free'd prior to split returning. + * + * \param strings A pointer to a contiguous buffer of NUL terminated strings. + * \param len The length of the buffer pointed to by strings. + * \param num The number of strings found and returned in the strings + * array. + * + * \return An array of pointers to the strings found in the input buffer. + */ +static const char ** +split(char *strings, u_int len, u_int *num) +{ + const char **ret; + + /* Protect against unterminated buffers. */ + strings[len - 1] = '\0'; + + /* Count the strings. */ + *num = extract_strings(strings, /*dest*/NULL, len); + + /* Transfer to one big alloc for easy freeing by the caller. */ + ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK); + memcpy(&ret[*num], strings, len); + free(strings, M_XENSTORE); + + /* Extract pointers to newly allocated array. */ + strings = (char *)&ret[*num]; + (void)extract_strings(strings, /*dest*/ret, len); + + return (ret); +} + +/*------------------------- Public Utility Functions -------------------------*/ +/*------- API comments for these methods can be found in xenstorevar.h -------*/ +struct sbuf * +xs_join(const char *dir, const char *name) +{ + struct sbuf *sb; + + sb = sbuf_new_auto(); + sbuf_cat(sb, dir); + if (name[0] != '\0') { + sbuf_putc(sb, '/'); + sbuf_cat(sb, name); + } + sbuf_finish(sb); + + return (sb); +} + +/*-------------------- Low Level Communication Management --------------------*/ +/** + * Interrupt handler for the XenStore event channel. + * + * XenStore reads and writes block on "xen_store" for buffer + * space. Wakeup any blocking operations when the XenStore + * service has modified the queues. + */ +static void +xs_intr(void * arg __unused /*__attribute__((unused))*/) +{ + + /* + * Hold ring lock across wakeup so that clients + * cannot miss a wakeup. + */ + mtx_lock(&xs.ring_lock); + wakeup(xen_store); + mtx_unlock(&xs.ring_lock); +} + +/** + * Verify that the indexes for a ring are valid. + * + * The difference between the producer and consumer cannot + * exceed the size of the ring. + * + * \param cons The consumer index for the ring to test. + * \param prod The producer index for the ring to test. + * + * \retval 1 If indexes are in range. + * \retval 0 If the indexes are out of range. 
+ */ +static int +xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) +{ + + return ((prod - cons) <= XENSTORE_RING_SIZE); +} + +/** + * Return a pointer to, and the length of, the contiguous + * free region available for output in a ring buffer. + * + * \param cons The consumer index for the ring. + * \param prod The producer index for the ring. + * \param buf The base address of the ring's storage. + * \param len The amount of contiguous storage available. + * + * \return A pointer to the start location of the free region. + */ +static void * +xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, + char *buf, uint32_t *len) +{ + + *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); + if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) + *len = XENSTORE_RING_SIZE - (prod - cons); + return (buf + MASK_XENSTORE_IDX(prod)); +} + +/** + * Return a pointer to, and the length of, the contiguous + * data available to read from a ring buffer. + * + * \param cons The consumer index for the ring. + * \param prod The producer index for the ring. + * \param buf The base address of the ring's storage. + * \param len The amount of contiguous data available to read. + * + * \return A pointer to the start location of the available data. + */ +static const void * +xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, + const char *buf, uint32_t *len) +{ + + *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); + if ((prod - cons) < *len) + *len = prod - cons; + return (buf + MASK_XENSTORE_IDX(cons)); +} + +/** + * Transmit data to the XenStore service. + * + * \param tdata A pointer to the contiguous data to send. + * \param len The amount of data to send. + * + * \return On success 0, otherwise an errno value indicating the + * cause of failure. + * + * \invariant Called from thread context. + * \invariant The buffer pointed to by tdata is at least len bytes + * in length. + * \invariant xs.request_mutex exclusively locked. + */ +static int +xs_write_store(const void *tdata, unsigned len) +{ + XENSTORE_RING_IDX cons, prod; + const char *data = (const char *)tdata; + int error; + + sx_assert(&xs.request_mutex, SX_XLOCKED); + while (len != 0) { + void *dst; + u_int avail; + + /* Hold lock so we can't miss wakeups should we block. */ + mtx_lock(&xs.ring_lock); + cons = xen_store->req_cons; + prod = xen_store->req_prod; + if ((prod - cons) == XENSTORE_RING_SIZE) { + /* + * Output ring is full. Wait for a ring event. + * + * Note that the events from both queues + * are combined, so being woken does not + * guarantee that data exist in the read + * ring. + * + * To simplify error recovery and the retry, + * we specify PDROP so our lock is *not* held + * when msleep returns. + */ + error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, + "xbwrite", /*timeout*/0); + if (error && error != EWOULDBLOCK) + return (error); + + /* Try again. */ + continue; + } + mtx_unlock(&xs.ring_lock); + + /* Verify queue sanity. */ + if (!xs_check_indexes(cons, prod)) { + xen_store->req_cons = xen_store->req_prod = 0; + return (EIO); + } + + dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail); + if (avail > len) + avail = len; + + memcpy(dst, data, avail); + data += avail; + len -= avail; + + /* + * The store to the producer index, which indicates + * to the other side that new data has arrived, must + * be visible only after our copy of the data into the + * ring has completed. + */ + wmb(); + xen_store->req_prod += avail; + + /* + * notify_remote_via_evtchn implies mb(). 
The other side
+ * will see the change to req_prod at the time of the
+ * interrupt.
+ */
+ notify_remote_via_evtchn(xs.evtchn);
+ }
+
+ return (0);
+}
+
+/**
+ * Receive data from the XenStore service.
+ *
+ * \param tdata A pointer to the contiguous buffer to receive the data.
+ * \param len The amount of data to receive.
+ *
+ * \return On success 0, otherwise an errno value indicating the
+ * cause of failure.
+ *
+ * \invariant Called from thread context.
+ * \invariant The buffer pointed to by tdata is at least len bytes
+ * in length.
+ *
+ * \note xs_read_store does not perform any internal locking to guarantee
+ * serial access to the incoming ring buffer. However, there
+ * is only one context processing reads: xs_rcv_thread().
+ */
+static int
+xs_read_store(void *tdata, unsigned len)
+{
+ XENSTORE_RING_IDX cons, prod;
+ char *data = (char *)tdata;
+ int error;
+
+ while (len != 0) {
+ u_int avail;
+ const char *src;
+
+ /* Hold lock so we can't miss wakeups should we block. */
+ mtx_lock(&xs.ring_lock);
+ cons = xen_store->rsp_cons;
+ prod = xen_store->rsp_prod;
+ if (cons == prod) {
+ /*
+ * Nothing to read. Wait for a ring event.
+ *
+ * Note that the events from both queues
+ * are combined, so being woken does not
+ * guarantee that data exist in the read
+ * ring.
+ *
+ * To simplify error recovery and the retry,
+ * we specify PDROP so our lock is *not* held
+ * when msleep returns.
+ */
+ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
+ "xbread", /*timeout*/0);
+ if (error && error != EWOULDBLOCK)
+ return (error);
+ continue;
+ }
+ mtx_unlock(&xs.ring_lock);
+
+ /* Verify queue sanity. */
+ if (!xs_check_indexes(cons, prod)) {
+ xen_store->rsp_cons = xen_store->rsp_prod = 0;
+ return (EIO);
+ }
+
+ src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail);
+ if (avail > len)
+ avail = len;
+
+ /*
+ * Ensure the data we read is related to the indexes
+ * we read above.
+ */
+ rmb();
+
+ memcpy(data, src, avail);
+ data += avail;
+ len -= avail;
+
+ /*
+ * Ensure that the producer of this ring does not see
+ * the ring space as free until after we have copied it
+ * out.
+ */
+ mb();
+ xen_store->rsp_cons += avail;
+
+ /*
+ * notify_remote_via_evtchn implies mb(). The producer
+ * will see the updated consumer index when the event
+ * is delivered.
+ */
+ notify_remote_via_evtchn(xs.evtchn);
+ }
+
+ return (0);
+}
+
+/*----------------------- Received Message Processing ------------------------*/
+/**
+ * Block reading the next message from the XenStore service and
+ * process the result.
+ *
+ * \param type The returned type of the XenStore message received.
+ *
+ * \return 0 on success. Otherwise an errno value indicating the
+ * type of failure encountered.
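+ *
+ * Framing sketch (editor's illustration; field layout follows struct
+ * xsd_sockmsg): every message is a fixed-size header carrying type,
+ * req_id, tx_id, and len, immediately followed by a len-byte body,
+ * which is exactly what the two xs_read_store() calls below collect.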
+ */
+static int
+xs_process_msg(enum xsd_sockmsg_type *type)
+{
+ struct xs_stored_msg *msg;
+ char *body;
+ int error;
+
+ msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK);
+ error = xs_read_store(&msg->hdr, sizeof(msg->hdr));
+ if (error) {
+ free(msg, M_XENSTORE);
+ return (error);
+ }
+
+ body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK);
+ error = xs_read_store(body, msg->hdr.len);
+ if (error) {
+ free(body, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ return (error);
+ }
+ body[msg->hdr.len] = '\0';
+
+ *type = msg->hdr.type;
+ if (msg->hdr.type == XS_WATCH_EVENT) {
+ msg->u.watch.vec = split(body, msg->hdr.len,
+ &msg->u.watch.vec_size);
+
+ mtx_lock(&xs.registered_watches_lock);
+ msg->u.watch.handle = find_watch(
+ msg->u.watch.vec[XS_WATCH_TOKEN]);
+ if (msg->u.watch.handle != NULL) {
+ mtx_lock(&xs.watch_events_lock);
+ TAILQ_INSERT_TAIL(&xs.watch_events, msg, list);
+ wakeup(&xs.watch_events);
+ mtx_unlock(&xs.watch_events_lock);
+ } else {
+ free(msg->u.watch.vec, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ }
+ mtx_unlock(&xs.registered_watches_lock);
+ } else {
+ msg->u.reply.body = body;
+ mtx_lock(&xs.reply_lock);
+ TAILQ_INSERT_TAIL(&xs.reply_list, msg, list);
+ wakeup(&xs.reply_list);
+ mtx_unlock(&xs.reply_lock);
+ }
+
+ return (0);
+}
+
+/**
+ * Thread body of the XenStore receive thread.
+ *
+ * This thread blocks waiting for data from the XenStore service
+ * and processes any received messages.
+ */
+static void
+xs_rcv_thread(void *arg __unused)
+{
+ int error;
+ enum xsd_sockmsg_type type;
+
+ for (;;) {
+ error = xs_process_msg(&type);
+ if (error)
+ printf("XENSTORE error %d while reading message\n",
+ error);
+ }
+}
+
+/*---------------- XenStore Message Request/Reply Processing -----------------*/
+/**
+ * Filter invoked before transmitting any message to the XenStore service.
+ *
+ * The role of the filter may expand, but currently serves to manage
+ * the interactions of messages with transaction state.
+ *
+ * \param request_msg_type The message type for the request.
+ */
+static inline void
+xs_request_filter(uint32_t request_msg_type)
+{
+ if (request_msg_type == XS_TRANSACTION_START)
+ sx_slock(&xs.suspend_mutex);
+}
+
+/**
+ * Filter invoked after transmitting any message to the XenStore service.
+ *
+ * The role of the filter may expand, but currently serves to manage
+ * the interactions of messages with transaction state.
+ *
+ * \param request_msg_type The message type for the original request.
+ * \param reply_msg_type The message type for any received reply.
+ * \param request_reply_error The error status from the attempt to send
+ * the request or retrieve the reply.
+ */
+static inline void
+xs_reply_filter(uint32_t request_msg_type,
+ uint32_t reply_msg_type, int request_reply_error)
+{
+ /*
+ * The count of active transactions drops if we attempted
+ * to end a transaction (even if that attempt failed),
+ * if we receive a transaction end acknowledgement, or
+ * if our attempt to begin a transaction fails.
+ */
+ if (request_msg_type == XS_TRANSACTION_END
+ || (request_reply_error == 0 && reply_msg_type == XS_TRANSACTION_END)
+ || (request_msg_type == XS_TRANSACTION_START
+ && (request_reply_error != 0 || reply_msg_type == XS_ERROR)))
+ sx_sunlock(&xs.suspend_mutex);
+}
+
+#define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0]))
+
+/**
+ * Convert a XenStore error string into an errno number.
+ *
+ * \param errorstring The error string to convert.
+ *
+ * \return The errno best matching the input string.
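+ *
+ * For example (editor's illustration; the mapping table is the
+ * xsd_errors array shared with the XenStore wire protocol headers):
+ * a reply body of "EACCES" is converted back into the numeric
+ * EACCES value before being returned to the caller.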
+ *
+ * \note Unknown error strings are converted to EINVAL.
+ */
+static int
+xs_get_error(const char *errorstring)
+{
+ u_int i;
+
+ for (i = 0; i < xsd_error_count; i++) {
+ if (!strcmp(errorstring, xsd_errors[i].errstring))
+ return (xsd_errors[i].errnum);
+ }
+ log(LOG_WARNING, "XENSTORE unknown error: %s\n",
+ errorstring);
+ return (EINVAL);
+}
+
+/**
+ * Block waiting for a reply to a message request.
+ *
+ * \param type The returned type of the reply.
+ * \param len The returned body length of the reply.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating the
+ * cause of failure.
+ */
+static int
+xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result)
+{
+ struct xs_stored_msg *msg;
+ char *body;
+ int error;
+
+ mtx_lock(&xs.reply_lock);
+ while (TAILQ_EMPTY(&xs.reply_list)) {
+ error = mtx_sleep(&xs.reply_list, &xs.reply_lock,
+ PCATCH, "xswait", hz/10);
+ if (error && error != EWOULDBLOCK) {
+ mtx_unlock(&xs.reply_lock);
+ return (error);
+ }
+ }
+ msg = TAILQ_FIRST(&xs.reply_list);
+ TAILQ_REMOVE(&xs.reply_list, msg, list);
+ mtx_unlock(&xs.reply_lock);
+
+ *type = msg->hdr.type;
+ if (len)
+ *len = msg->hdr.len;
+ body = msg->u.reply.body;
+
+ free(msg, M_XENSTORE);
+ *result = body;
+ return (0);
+}
+
+/**
+ * Pass-thru interface for XenStore access by userland processes
+ * via the XenStore device.
+ *
+ * Reply type and length data are returned by overwriting these
+ * fields in the passed in request message.
+ *
+ * \param msg A properly formatted message to transmit to
+ * the XenStore service.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating the cause
+ * of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ * must be free'd by the caller with 'free(result, M_XENSTORE)'.
+ */
+int
+xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
+{
+ uint32_t request_type;
+ int error;
+
+ request_type = msg->type;
+ xs_request_filter(request_type);
+
+ sx_xlock(&xs.request_mutex);
+ if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0)
+ error = xs_read_reply(&msg->type, &msg->len, result);
+ sx_xunlock(&xs.request_mutex);
+
+ xs_reply_filter(request_type, msg->type, error);
+
+ return (error);
+}
+
+/**
+ * Send a message with an optionally multi-part body to the XenStore service.
+ *
+ * \param t The transaction to use for this request.
+ * \param request_type The type of message to send.
+ * \param iovec Pointers to the body sections of the request.
+ * \param num_vecs The number of body sections in the request.
+ * \param len The returned length of the reply.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating
+ * the cause of failure.
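+ *
+ * Wire format sketch (editor's illustration based on the code below,
+ * not a protocol reference): an XS_WRITE of value "1" to path "a/b"
+ * goes out as a single xsd_sockmsg header with len == 5, followed by
+ * the two body sections "a/b" plus its NUL (4 bytes) and "1"
+ * (1 byte, no terminator).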
+ *
+ * \note The returned result is provided in malloced storage and thus
+ * must be free'd by the caller with 'free(*result, M_XENSTORE)'.
+ */
+static int
+xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type,
+ const struct iovec *iovec, u_int num_vecs, u_int *len, void **result)
+{
+ struct xsd_sockmsg msg;
+ void *ret = NULL;
+ u_int i;
+ int error;
+
+ msg.tx_id = t.id;
+ msg.req_id = 0;
+ msg.type = request_type;
+ msg.len = 0;
+ for (i = 0; i < num_vecs; i++)
+ msg.len += iovec[i].iov_len;
+
+ xs_request_filter(request_type);
+
+ sx_xlock(&xs.request_mutex);
+ error = xs_write_store(&msg, sizeof(msg));
+ if (error) {
+ printf("xs_talkv failed %d\n", error);
+ goto error_lock_held;
+ }
+
+ for (i = 0; i < num_vecs; i++) {
+ error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len);
+ if (error) {
+ printf("xs_talkv failed %d\n", error);
+ goto error_lock_held;
+ }
+ }
+
+ error = xs_read_reply(&msg.type, len, &ret);
+
+error_lock_held:
+ sx_xunlock(&xs.request_mutex);
+ xs_reply_filter(request_type, msg.type, error);
+ if (error)
+ return (error);
+
+ if (msg.type == XS_ERROR) {
+ error = xs_get_error(ret);
+ free(ret, M_XENSTORE);
+ return (error);
+ }
+
+ /* Reply is either error or an echo of our request message type. */
+ KASSERT(msg.type == request_type, ("bad xenstore message type"));
+
+ if (result)
+ *result = ret;
+ else
+ free(ret, M_XENSTORE);
+
+ return (0);
+}
+
+/**
+ * Wrapper for xs_talkv allowing easy transmission of a message with
+ * a single, contiguous, message body.
+ *
+ * \param t The transaction to use for this request.
+ * \param request_type The type of message to send.
+ * \param body The body of the request.
+ * \param len The returned length of the reply.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating
+ * the cause of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ * must be free'd by the caller with 'free(*result, M_XENSTORE)'.
+ */
+static int
+xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type,
+ const char *body, u_int *len, void **result)
+{
+ struct iovec iovec;
+
+ iovec.iov_base = (void *)(uintptr_t)body;
+ iovec.iov_len = strlen(body) + 1;
+
+ return (xs_talkv(t, request_type, &iovec, 1, len, result));
+}
+
+/*------------------------- XenStore Watch Support ---------------------------*/
+/**
+ * Transmit a watch request to the XenStore service.
+ *
+ * \param path The path in the XenStore to watch.
+ * \param token A unique identifier for this watch.
+ *
+ * \return 0 on success. Otherwise an errno indicating the
+ * cause of failure.
+ */
+static int
+xs_watch(const char *path, const char *token)
+{
+ struct iovec iov[2];
+
+ iov[0].iov_base = (void *)(uintptr_t) path;
+ iov[0].iov_len = strlen(path) + 1;
+ iov[1].iov_base = (void *)(uintptr_t) token;
+ iov[1].iov_len = strlen(token) + 1;
+
+ return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL));
+}
+
+/**
+ * Transmit an unwatch request to the XenStore service.
+ *
+ * \param path The path in the XenStore to stop watching.
+ * \param token The unique identifier for the watch to remove.
+ *
+ * \return 0 on success. Otherwise an errno indicating the
+ * cause of failure.
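+ *
+ * Token format note (editor's gloss, inferred from find_watch() and
+ * xs_register_watch() below): tokens are the watch structure's kernel
+ * pointer printed as uppercase hex without a "0x" prefix, so a watch
+ * at address 0xc4001200 is registered and unregistered with the
+ * token string "C4001200".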
+ */
+static int
+xs_unwatch(const char *path, const char *token)
+{
+ struct iovec iov[2];
+
+ iov[0].iov_base = (void *)(uintptr_t) path;
+ iov[0].iov_len = strlen(path) + 1;
+ iov[1].iov_base = (void *)(uintptr_t) token;
+ iov[1].iov_len = strlen(token) + 1;
+
+ return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL));
+}
+
+/**
+ * Convert from watch token (unique identifier) to the associated
+ * internal tracking structure for this watch.
+ *
+ * \param token The unique identifier for the watch to find.
+ *
+ * \return A pointer to the found watch structure or NULL.
+ */
+static struct xs_watch *
+find_watch(const char *token)
+{
+ struct xs_watch *i, *cmp;
+
+ cmp = (void *)strtoul(token, NULL, 16);
+
+ LIST_FOREACH(i, &xs.registered_watches, list)
+ if (i == cmp)
+ return (i);
+
+ return (NULL);
+}
+
+/**
+ * Thread body of the XenStore watch event dispatch thread.
+ */
+static void
+xenwatch_thread(void *unused)
+{
+ struct xs_stored_msg *msg;
+
+ for (;;) {
+ mtx_lock(&xs.watch_events_lock);
+ while (TAILQ_EMPTY(&xs.watch_events))
+ mtx_sleep(&xs.watch_events,
+ &xs.watch_events_lock,
+ PWAIT | PCATCH, "waitev", hz/10);
+
+ mtx_unlock(&xs.watch_events_lock);
+ sx_xlock(&xs.xenwatch_mutex);
+
+ mtx_lock(&xs.watch_events_lock);
+ msg = TAILQ_FIRST(&xs.watch_events);
+ if (msg)
+ TAILQ_REMOVE(&xs.watch_events, msg, list);
+ mtx_unlock(&xs.watch_events_lock);
+
+ if (msg != NULL) {
+ /*
+ * XXX There are messages coming in with a NULL
+ * XXX callback. This deserves further investigation;
+ * XXX the workaround here simply prevents the kernel
+ * XXX from panic'ing on startup.
+ */
+ if (msg->u.watch.handle->callback != NULL)
+ msg->u.watch.handle->callback(
+ msg->u.watch.handle,
+ (const char **)msg->u.watch.vec,
+ msg->u.watch.vec_size);
+ free(msg->u.watch.vec, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ }
+
+ sx_xunlock(&xs.xenwatch_mutex);
+ }
+}
+
+/*----------- XenStore Configuration, Initialization, and Control ------------*/
+/**
+ * Set up communication channels with the XenStore service.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xs_init_comms(void)
+{
+ int error;
+
+ if (xen_store->rsp_prod != xen_store->rsp_cons) {
+ log(LOG_WARNING, "XENSTORE response ring is not quiescent "
+ "(%08x:%08x): fixing up\n",
+ xen_store->rsp_cons, xen_store->rsp_prod);
+ xen_store->rsp_cons = xen_store->rsp_prod;
+ }
+
+ if (xs.irq)
+ unbind_from_irqhandler(xs.irq);
+
+ error = bind_caller_port_to_irqhandler(xs.evtchn, "xenstore",
+ xs_intr, NULL, INTR_TYPE_NET, &xs.irq);
+ if (error) {
+ log(LOG_WARNING, "XENSTORE request irq failed %i\n", error);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*------------------ Private Device Attachment Functions --------------------*/
+static void
+xs_identify(driver_t *driver, device_t parent)
+{
+
+ BUS_ADD_CHILD(parent, 0, "xenstore", 0);
+}
+
+/**
+ * Probe for the existence of the XenStore.
+ *
+ * \param dev The newbus device instance being probed.
+ */
+static int
+xs_probe(device_t dev)
+{
+ /*
+ * We are either operating within a PV kernel or being probed
+ * as the child of the successfully attached xenpci device.
+ * Thus we are in a Xen environment and there will be a XenStore.
+ * Unconditionally return success.
+ */
+ device_set_desc(dev, "XenStore");
+ printf("xs_probe: Probe returns 0\n");
+ return (0);
+}
+
+static void
+xs_attach_deferred(void *arg)
+{
+ xs_dev_init();
+
+ bus_generic_probe(xs.xs_dev);
+ bus_generic_attach(xs.xs_dev);
+
+ config_intrhook_disestablish(&xs.xs_attachcb);
+}
+
+/**
+ * Attach to the XenStore.
+ *
+ * This routine also prepares for the probe/attach of drivers that rely
+ * on the XenStore.
+ */
+static int
+xs_attach(device_t dev)
+{
+ struct proc *p;
+ int error;
+
+ /* Allow us to get device_t from softc and vice-versa. */
+ xs.xs_dev = dev;
+ device_set_softc(dev, &xs);
+
+ /*
+ * This seems to be a layering violation. The XenStore is just
+ * one of many clients of the Grant Table facility. It happens
+ * to be the first and a gating consumer to all other devices,
+ * so this does work. A better place would be in the PV support
+ * code for fully PV kernels and the xenpci driver for HVM kernels.
+ */
+ error = gnttab_init();
+ if (error != 0) {
+ log(LOG_WARNING,
+ "XENSTORE: Error initializing grant tables: %d\n", error);
+ return (ENXIO);
+ }
+
+ /* Initialize the interface to xenstore. */
+#ifdef XENHVM
+ xs.evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
+ xs.gpfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
+ xen_store = pmap_mapdev(xs.gpfn * PAGE_SIZE, PAGE_SIZE);
+#else
+ xs.evtchn = xen_start_info->store_evtchn;
+#endif
+
+ TAILQ_INIT(&xs.reply_list);
+ TAILQ_INIT(&xs.watch_events);
+
+ mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF);
+ mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF);
+ sx_init(&xs.xenwatch_mutex, "xenwatch");
+ sx_init(&xs.request_mutex, "xenstore request");
+ sx_init(&xs.suspend_mutex, "xenstore suspend");
+ mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF);
+ mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF);
+ xs.irq = 0;
+
+ /* Initialize the shared memory rings to talk to xenstored */
+ error = xs_init_comms();
+ if (error)
+ return (error);
+
+ error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID,
+ 0, "xenwatch");
+ if (error)
+ return (error);
+ xs.xenwatch_pid = p->p_pid;
+
+ error = kproc_create(xs_rcv_thread, NULL, NULL,
+ RFHIGHPID, 0, "xenstore_rcv");
+
+ xs.xs_attachcb.ich_func = xs_attach_deferred;
+ xs.xs_attachcb.ich_arg = NULL;
+ config_intrhook_establish(&xs.xs_attachcb);
+
+ return (error);
+}
+
+/**
+ * Prepare for suspension of this VM by halting XenStore access after
+ * all transactions and individual requests have completed.
+ */
+static int
+xs_suspend(device_t dev __unused)
+{
+
+ sx_xlock(&xs.suspend_mutex);
+ sx_xlock(&xs.request_mutex);
+
+ return (0);
+}
+
+/**
+ * Resume XenStore operations after this VM is resumed.
+ */
+static int
+xs_resume(device_t dev __unused)
+{
+ struct xs_watch *watch;
+ char token[sizeof(watch) * 2 + 1];
+
+ xs_init_comms();
+
+ sx_xunlock(&xs.request_mutex);
+
+ /*
+ * No need for registered_watches_lock: the suspend_mutex
+ * is sufficient.
+ */
+ LIST_FOREACH(watch, &xs.registered_watches, list) {
+ sprintf(token, "%lX", (long)watch);
+ xs_watch(watch->node, token);
+ }
+
+ sx_xunlock(&xs.suspend_mutex);
+
+ return (0);
+}
+
+/*-------------------- Private Device Attachment Data -----------------------*/
+static device_method_t xenstore_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, xs_identify),
+ DEVMETHOD(device_probe, xs_probe),
+ DEVMETHOD(device_attach, xs_attach),
+ DEVMETHOD(device_detach, bus_generic_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, xs_suspend),
+ DEVMETHOD(device_resume, xs_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_add_child, bus_generic_add_child),
+ DEVMETHOD(bus_print_child, bus_generic_print_child),
+ DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource),
+ DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+
+ { 0, 0 }
+};
+
+DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0);
+static devclass_t xenstore_devclass;
+
+#ifdef XENHVM
+DRIVER_MODULE(xenstore, xenpci, xenstore_driver, xenstore_devclass, 0, 0);
+#else
+DRIVER_MODULE(xenstore, nexus, xenstore_driver, xenstore_devclass, 0, 0);
+#endif
+
+/*------------------------------- Sysctl Data --------------------------------*/
+/* XXX Shouldn't the node be somewhere else? */
+SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
+SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, "");
+SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
+
+/*-------------------------------- Public API --------------------------------*/
+/*------- API comments for these methods can be found in xenstorevar.h -------*/
+int
+xs_directory(struct xs_transaction t, const char *dir, const char *node,
+ u_int *num, const char ***result)
+{
+ struct sbuf *path;
+ char *strings;
+ u_int len = 0;
+ int error;
+
+ path = xs_join(dir, node);
+ error = xs_single(t, XS_DIRECTORY, sbuf_data(path), &len,
+ (void **)&strings);
+ sbuf_delete(path);
+ if (error)
+ return (error);
+
+ *result = split(strings, len, num);
+
+ return (0);
+}
+
+int
+xs_exists(struct xs_transaction t, const char *dir, const char *node)
+{
+ const char **d;
+ int error;
+ u_int dir_n;
+
+ error = xs_directory(t, dir, node, &dir_n, &d);
+ if (error)
+ return (0);
+ free(d, M_XENSTORE);
+ return (1);
+}
+
+int
+xs_read(struct xs_transaction t, const char *dir, const char *node,
+ u_int *len, void **result)
+{
+ struct sbuf *path;
+ void *ret;
+ int error;
+
+ path = xs_join(dir, node);
+ error = xs_single(t, XS_READ, sbuf_data(path), len, &ret);
+ sbuf_delete(path);
+ if (error)
+ return (error);
+ *result = ret;
+ return (0);
+}
+
+int
+xs_write(struct xs_transaction t, const char *dir, const char *node,
+ const char *string)
+{
+ struct sbuf *path;
+ struct iovec iovec[2];
+ int error;
+
+ path = xs_join(dir, node);
+
+ iovec[0].iov_base = (void *)(uintptr_t) sbuf_data(path);
+ iovec[0].iov_len = sbuf_len(path) + 1;
+ iovec[1].iov_base = (void *)(uintptr_t) string;
+ iovec[1].iov_len = strlen(string);
+
+ error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL);
+ sbuf_delete(path);
+
+ return (error);
+}
+
+int
+xs_mkdir(struct xs_transaction t, const char *dir, const char *node)
+{
+ struct sbuf *path;
+ int ret;
+
+ path = xs_join(dir, node);
+ ret = xs_single(t, XS_MKDIR, sbuf_data(path), NULL, NULL);
+ sbuf_delete(path);
+
+ 
return (ret);
+}
+
+int
+xs_rm(struct xs_transaction t, const char *dir, const char *node)
+{
+ struct sbuf *path;
+ int ret;
+
+ path = xs_join(dir, node);
+ ret = xs_single(t, XS_RM, sbuf_data(path), NULL, NULL);
+ sbuf_delete(path);
+
+ return (ret);
+}
+
+int
+xs_rm_tree(struct xs_transaction xbt, const char *base, const char *node)
+{
+ struct xs_transaction local_xbt;
+ struct sbuf *root_path_sbuf;
+ struct sbuf *cur_path_sbuf;
+ char *root_path;
+ char *cur_path;
+ const char **dir;
+ int error;
+
+retry:
+ root_path_sbuf = xs_join(base, node);
+ cur_path_sbuf = xs_join(base, node);
+ root_path = sbuf_data(root_path_sbuf);
+ cur_path = sbuf_data(cur_path_sbuf);
+ dir = NULL;
+ local_xbt.id = 0;
+
+ if (xbt.id == 0) {
+ error = xs_transaction_start(&local_xbt);
+ if (error != 0)
+ goto out;
+ xbt = local_xbt;
+ }
+
+ while (1) {
+ u_int count;
+ u_int i;
+
+ error = xs_directory(xbt, cur_path, "", &count, &dir);
+ if (error)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ error = xs_rm(xbt, cur_path, dir[i]);
+ if (error == ENOTEMPTY) {
+ struct sbuf *push_dir;
+
+ /*
+ * Descend to clear out this subdirectory.
+ * We'll return to cur_path once push_dir
+ * is empty.
+ */
+ push_dir = xs_join(cur_path, dir[i]);
+ sbuf_delete(cur_path_sbuf);
+ cur_path_sbuf = push_dir;
+ cur_path = sbuf_data(cur_path_sbuf);
+ break;
+ } else if (error != 0) {
+ goto out;
+ }
+ }
+
+ free(dir, M_XENSTORE);
+ dir = NULL;
+
+ if (i == count) {
+ char *last_slash;
+
+ /* Directory is empty. It is now safe to remove. */
+ error = xs_rm(xbt, cur_path, "");
+ if (error != 0)
+ goto out;
+
+ if (!strcmp(cur_path, root_path))
+ break;
+
+ /* Return to processing the parent directory. */
+ last_slash = strrchr(cur_path, '/');
+ KASSERT(last_slash != NULL,
+ ("xs_rm_tree: mangled path %s", cur_path));
+ *last_slash = '\0';
+ }
+ }
+
+out:
+ sbuf_delete(cur_path_sbuf);
+ sbuf_delete(root_path_sbuf);
+ if (dir != NULL)
+ free(dir, M_XENSTORE);
+
+ if (local_xbt.id != 0) {
+ int terror;
+
+ terror = xs_transaction_end(local_xbt, /*abort*/error != 0);
+ xbt.id = 0;
+ if (terror == EAGAIN && error == 0)
+ goto retry;
+ }
+ return (error);
+}
+
+int
+xs_transaction_start(struct xs_transaction *t)
+{
+ char *id_str;
+ int error;
+
+ error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL,
+ (void **)&id_str);
+ if (error == 0) {
+ t->id = strtoul(id_str, NULL, 0);
+ free(id_str, M_XENSTORE);
+ }
+ return (error);
+}
+
+int
+xs_transaction_end(struct xs_transaction t, int abort)
+{
+ char abortstr[2];
+
+ if (abort)
+ strcpy(abortstr, "F");
+ else
+ strcpy(abortstr, "T");
+
+ return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL));
+}
+
+int
+xs_scanf(struct xs_transaction t, const char *dir, const char *node,
+ int *scancountp, const char *fmt, ...)
+{
+ va_list ap;
+ int error, ns;
+ char *val;
+
+ error = xs_read(t, dir, node, NULL, (void **) &val);
+ if (error)
+ return (error);
+
+ va_start(ap, fmt);
+ ns = vsscanf(val, fmt, ap);
+ va_end(ap);
+ free(val, M_XENSTORE);
+ /* Distinctive errno. 
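+ * vsscanf() returns the number of successful conversions, so a
+ * zero result means the node's contents did not match the format;
+ * ERANGE distinguishes this parse failure from read failures.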
*/ + if (ns == 0) + return (ERANGE); + if (scancountp) + *scancountp = ns; + return (0); +} + +int +xs_vprintf(struct xs_transaction t, + const char *dir, const char *node, const char *fmt, va_list ap) +{ + struct sbuf *sb; + int error; + + sb = sbuf_new_auto(); + sbuf_vprintf(sb, fmt, ap); + sbuf_finish(sb); + error = xs_write(t, dir, node, sbuf_data(sb)); + sbuf_delete(sb); + + return (error); +} + +int +xs_printf(struct xs_transaction t, const char *dir, const char *node, + const char *fmt, ...) +{ + va_list ap; + int error; + + va_start(ap, fmt); + error = xs_vprintf(t, dir, node, fmt, ap); + va_end(ap); + + return (error); +} + +int +xs_gather(struct xs_transaction t, const char *dir, ...) +{ + va_list ap; + const char *name; + int error; + + va_start(ap, dir); + error = 0; + while (error == 0 && (name = va_arg(ap, char *)) != NULL) { + const char *fmt = va_arg(ap, char *); + void *result = va_arg(ap, void *); + char *p; + + error = xs_read(t, dir, name, NULL, (void **) &p); + if (error) + break; + + if (fmt) { + if (sscanf(p, fmt, result) == 0) + error = EINVAL; + free(p, M_XENSTORE); + } else + *(char **)result = p; + } + va_end(ap); + + return (error); +} + +int +xs_register_watch(struct xs_watch *watch) +{ + /* Pointer in ascii is the token. */ + char token[sizeof(watch) * 2 + 1]; + int error; + + sprintf(token, "%lX", (long)watch); + + sx_slock(&xs.suspend_mutex); + + mtx_lock(&xs.registered_watches_lock); + KASSERT(find_watch(token) == NULL, ("watch already registered")); + LIST_INSERT_HEAD(&xs.registered_watches, watch, list); + mtx_unlock(&xs.registered_watches_lock); + + error = xs_watch(watch->node, token); + + /* Ignore errors due to multiple registration. */ + if (error == EEXIST) + error = 0; + + if (error != 0) { + mtx_lock(&xs.registered_watches_lock); + LIST_REMOVE(watch, list); + mtx_unlock(&xs.registered_watches_lock); + } + + sx_sunlock(&xs.suspend_mutex); + + return (error); +} + +void +xs_unregister_watch(struct xs_watch *watch) +{ + struct xs_stored_msg *msg, *tmp; + char token[sizeof(watch) * 2 + 1]; + int error; + + sprintf(token, "%lX", (long)watch); + + sx_slock(&xs.suspend_mutex); + + mtx_lock(&xs.registered_watches_lock); + if (find_watch(token) == NULL) { + mtx_unlock(&xs.registered_watches_lock); + sx_sunlock(&xs.suspend_mutex); + return; + } + LIST_REMOVE(watch, list); + mtx_unlock(&xs.registered_watches_lock); + + error = xs_unwatch(watch->node, token); + if (error) + log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n", + watch->node, error); + + sx_sunlock(&xs.suspend_mutex); + + /* Cancel pending watch events. */ + mtx_lock(&xs.watch_events_lock); + TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) { + if (msg->u.watch.handle != watch) + continue; + TAILQ_REMOVE(&xs.watch_events, msg, list); + free(msg->u.watch.vec, M_XENSTORE); + free(msg, M_XENSTORE); + } + mtx_unlock(&xs.watch_events_lock); + + /* Flush any currently-executing callback, unless we are it. :-) */ + if (curproc->p_pid != xs.xenwatch_pid) { + sx_xlock(&xs.xenwatch_mutex); + sx_xunlock(&xs.xenwatch_mutex); + } +} diff --git a/sys/xen/xenbus/xenbus_dev.c b/sys/xen/xenstore/xenstore_dev.c index ac3f103..1fa4197 100644 --- a/sys/xen/xenbus/xenbus_dev.c +++ b/sys/xen/xenstore/xenstore_dev.c @@ -1,8 +1,8 @@ /* - * xenbus_dev.c + * xenstore_dev.c * - * Driver giving user-space access to the kernel's xenbus connection - * to xenstore. + * Driver giving user-space access to the kernel's connection to the + * XenStore service. 
* * Copyright (c) 2005, Christian Limpach * Copyright (c) 2005, Rusty Russell, IBM Corporation @@ -45,18 +45,19 @@ __FBSDID("$FreeBSD$"); #include <sys/conf.h> #include <machine/xen/xen-os.h> + #include <xen/hypervisor.h> -#include <xen/xenbus/xenbusvar.h> -#include <xen/xenbus/xenbus_comms.h> +#include <xen/xenstore/xenstorevar.h> +#include <xen/xenstore/xenstore_internal.h> -struct xenbus_dev_transaction { - LIST_ENTRY(xenbus_dev_transaction) list; - struct xenbus_transaction handle; +struct xs_dev_transaction { + LIST_ENTRY(xs_dev_transaction) list; + struct xs_transaction handle; }; -struct xenbus_dev_data { +struct xs_dev_data { /* In-progress transaction. */ - LIST_HEAD(xdd_list_head, xenbus_dev_transaction) transactions; + LIST_HEAD(xdd_list_head, xs_dev_transaction) transactions; /* Partial request. */ unsigned int len; @@ -72,13 +73,13 @@ struct xenbus_dev_data { }; static int -xenbus_dev_read(struct cdev *dev, struct uio *uio, int ioflag) +xs_dev_read(struct cdev *dev, struct uio *uio, int ioflag) { int error; - struct xenbus_dev_data *u = dev->si_drv1; + struct xs_dev_data *u = dev->si_drv1; while (u->read_prod == u->read_cons) { - error = tsleep(u, PCATCH, "xbdread", hz/10); + error = tsleep(u, PCATCH, "xsdread", hz/10); if (error && error != EWOULDBLOCK) return (error); } @@ -96,7 +97,7 @@ xenbus_dev_read(struct cdev *dev, struct uio *uio, int ioflag) } static void -queue_reply(struct xenbus_dev_data *u, char *data, unsigned int len) +xs_queue_reply(struct xs_dev_data *u, char *data, unsigned int len) { int i; @@ -110,11 +111,11 @@ queue_reply(struct xenbus_dev_data *u, char *data, unsigned int len) } static int -xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag) +xs_dev_write(struct cdev *dev, struct uio *uio, int ioflag) { int error; - struct xenbus_dev_data *u = dev->si_drv1; - struct xenbus_dev_transaction *trans; + struct xs_dev_data *u = dev->si_drv1; + struct xs_dev_transaction *trans; void *reply; int len = uio->uio_resid; @@ -141,10 +142,10 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag) case XS_MKDIR: case XS_RM: case XS_SET_PERMS: - error = xenbus_dev_request_and_reply(&u->u.msg, &reply); + error = xs_dev_request_and_reply(&u->u.msg, &reply); if (!error) { if (u->u.msg.type == XS_TRANSACTION_START) { - trans = malloc(sizeof(*trans), M_DEVBUF, + trans = malloc(sizeof(*trans), M_XENSTORE, M_WAITOK); trans->handle.id = strtoul(reply, NULL, 0); LIST_INSERT_HEAD(&u->transactions, trans, list); @@ -156,11 +157,11 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag) BUG_ON(&trans->list == &u->transactions); #endif LIST_REMOVE(trans, list); - free(trans, M_DEVBUF); + free(trans, M_XENSTORE); } - queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg)); - queue_reply(u, (char *)reply, u->u.msg.len); - free(reply, M_DEVBUF); + xs_queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg)); + xs_queue_reply(u, (char *)reply, u->u.msg.len); + free(reply, M_XENSTORE); } break; @@ -176,16 +177,14 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag) } static int -xenbus_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +xs_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { - struct xenbus_dev_data *u; + struct xs_dev_data *u; - if (xen_store_evtchn == 0) - return (ENOENT); #if 0 /* XXX figure out if equiv needed */ nonseekable_open(inode, filp); #endif - u = malloc(sizeof(*u), M_DEVBUF, M_WAITOK|M_ZERO); + u = malloc(sizeof(*u), M_XENSTORE, M_WAITOK|M_ZERO); LIST_INIT(&u->transactions); 
dev->si_drv1 = u; @@ -193,37 +192,33 @@ xenbus_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) } static int -xenbus_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +xs_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { - struct xenbus_dev_data *u = dev->si_drv1; - struct xenbus_dev_transaction *trans, *tmp; + struct xs_dev_data *u = dev->si_drv1; + struct xs_dev_transaction *trans, *tmp; LIST_FOREACH_SAFE(trans, &u->transactions, list, tmp) { - xenbus_transaction_end(trans->handle, 1); + xs_transaction_end(trans->handle, 1); LIST_REMOVE(trans, list); - free(trans, M_DEVBUF); + free(trans, M_XENSTORE); } - free(u, M_DEVBUF); + free(u, M_XENSTORE); return (0); } -static struct cdevsw xenbus_dev_cdevsw = { +static struct cdevsw xs_dev_cdevsw = { .d_version = D_VERSION, - .d_read = xenbus_dev_read, - .d_write = xenbus_dev_write, - .d_open = xenbus_dev_open, - .d_close = xenbus_dev_close, - .d_name = "xenbus_dev", + .d_read = xs_dev_read, + .d_write = xs_dev_write, + .d_open = xs_dev_open, + .d_close = xs_dev_close, + .d_name = "xs_dev", }; -static int -xenbus_dev_sysinit(void) +void +xs_dev_init() { - make_dev(&xenbus_dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0400, - "xen/xenbus"); - - return (0); + make_dev(&xs_dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0400, + "xen/xenstore"); } -SYSINIT(xenbus_dev_sysinit, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, - xenbus_dev_sysinit, NULL); diff --git a/sys/xen/xenstore/xenstore_internal.h b/sys/xen/xenstore/xenstore_internal.h new file mode 100644 index 0000000..0398aef --- /dev/null +++ b/sys/xen/xenstore/xenstore_internal.h @@ -0,0 +1,39 @@ +/*- + * Core definitions and data structures shareable across OS platforms. + * + * Copyright (c) 2010 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * $FreeBSD$ + */ + +/* Initialize support for userspace access to the XenStore. */ +void xs_dev_init(void); + +/* Used by the XenStore character device to borrow kernel's store connection. 
*/
+int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result);
diff --git a/sys/xen/xenstore/xenstorevar.h b/sys/xen/xenstore/xenstorevar.h
new file mode 100644
index 0000000..df41e31
--- /dev/null
+++ b/sys/xen/xenstore/xenstorevar.h
@@ -0,0 +1,338 @@
+/******************************************************************************
+ * xenstorevar.h
+ *
+ * Method declarations and structures for accessing the XenStore.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ * Copyright (C) 2009,2010 Spectra Logic Corporation
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XEN_XENSTORE_XENSTOREVAR_H
+#define _XEN_XENSTORE_XENSTOREVAR_H
+
+#include <sys/queue.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen/xen-os.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/xenbus.h>
+#include <xen/interface/io/xs_wire.h>
+
+#include "xenbus_if.h"
+
+/* XenStore allocations including XenStore data returned to clients. */
+MALLOC_DECLARE(M_XENSTORE);
+
+struct xenstore_domain_interface;
+struct xs_watch;
+extern struct xenstore_domain_interface *xen_store;
+
+typedef void (xs_watch_cb_t)(struct xs_watch *,
+ const char **vec, unsigned int len);
+
+/* Register callback to watch subtree (node) in the XenStore. */
+struct xs_watch
+{
+ LIST_ENTRY(xs_watch) list;
+
+ /* Path being watched. */
+ char *node;
+
+ /* Callback (executed in a process context with no locks held). */
+ xs_watch_cb_t *callback;
+};
+LIST_HEAD(xs_watch_list, xs_watch);
+
+typedef int (*xs_event_handler_t)(void *);
+
+struct xs_transaction
+{
+ uint32_t id;
+};
+
+#define XST_NIL ((struct xs_transaction) { 0 })
+
+/**
+ * Fetch the contents of a directory in the XenStore.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ * \param num The returned number of directory entries.
+ * \param result An array of directory entry strings.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note The results buffer is malloced and should be free'd by the
+ * caller with 'free(*result, M_XENSTORE)'.
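+ *
+ * Usage sketch (editor's illustration; the "device/vbd" path and the
+ * variable names are assumptions, not values mandated by this API):
+ *
+ *	const char **entries;
+ *	u_int count, i;
+ *
+ *	if (xs_directory(XST_NIL, "device", "vbd", &count, &entries) == 0) {
+ *		for (i = 0; i < count; i++)
+ *			printf("vbd instance: %s\n", entries[i]);
+ *		free(entries, M_XENSTORE);
+ *	}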
+ */ +int xs_directory(struct xs_transaction t, const char *dir, + const char *node, unsigned int *num, const char ***result); + +/** + * Determine if a path exists in the XenStore. + * + * \param t The XenStore transaction covering this request. + * \param dir The dirname of the path to read. + * \param node The basename of the path to read. + * + * \retval 1 The path exists. + * \retval 0 The path does not exist or an error occurred attempting + * to make that determination. + */ +int xs_exists(struct xs_transaction t, const char *dir, const char *node); + +/** + * Get the contents of a single "file". Returns the contents in + * *result which should be freed with free(*result, M_XENSTORE) after + * use. The length of the value in bytes is returned in *len. + * + * \param t The XenStore transaction covering this request. + * \param dir The dirname of the file to read. + * \param node The basename of the file to read. + * \param len The amount of data read. + * \param result The returned contents from this file. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + * + * \note The results buffer is malloced and should be free'd by the + * caller with 'free(*result, M_XENSTORE)'. + */ +int xs_read(struct xs_transaction t, const char *dir, + const char *node, unsigned int *len, void **result); + +/** + * Write to a single file. + * + * \param t The XenStore transaction covering this request. + * \param dir The dirname of the file to write. + * \param node The basename of the file to write. + * \param string The NUL terminated string of data to write. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xs_write(struct xs_transaction t, const char *dir, + const char *node, const char *string); + +/** + * Create a new directory. + * + * \param t The XenStore transaction covering this request. + * \param dir The dirname of the directory to create. + * \param node The basename of the directory to create. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xs_mkdir(struct xs_transaction t, const char *dir, + const char *node); + +/** + * Remove a file or directory (directories must be empty). + * + * \param t The XenStore transaction covering this request. + * \param dir The dirname of the directory to remove. + * \param node The basename of the directory to remove. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xs_rm(struct xs_transaction t, const char *dir, const char *node); + +/** + * Destroy a tree of files rooted at dir/node. + * + * \param t The XenStore transaction covering this request. + * \param dir The dirname of the directory to remove. + * \param node The basename of the directory to remove. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xs_rm_tree(struct xs_transaction t, const char *dir, + const char *node); + +/** + * Start a transaction. + * + * Changes by others will not be seen during the lifetime of this + * transaction, and changes will not be visible to others until it + * is committed (xs_transaction_end). + * + * \param t The returned transaction. + * + * \return On success, 0. Otherwise an errno value indicating the + * type of failure. + */ +int xs_transaction_start(struct xs_transaction *t); + +/** + * End a transaction. + * + * \param t The transaction to end/commit. 
+ * \param abort If non-zero, the transaction is discarded
+ * instead of committed.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_transaction_end(struct xs_transaction t, int abort);
+
+/**
+ * Single file read and scanf parsing of the result.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ * \param scancountp The number of input values assigned (i.e. the result
+ * of scanf).
+ * \param fmt Scanf format string followed by a variable number of
+ * scanf input arguments.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_scanf(struct xs_transaction t,
+ const char *dir, const char *node, int *scancountp, const char *fmt, ...)
+ __attribute__((format(scanf, 5, 6)));
+
+/**
+ * Printf formatted write to a XenStore file.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to write.
+ * \param node The basename of the path to write.
+ * \param fmt Printf format string followed by a variable number of
+ * printf arguments.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of write failure.
+ */
+int xs_printf(struct xs_transaction t, const char *dir,
+ const char *node, const char *fmt, ...)
+ __attribute__((format(printf, 4, 5)));
+
+/**
+ * va_list version of xs_printf().
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to write.
+ * \param node The basename of the path to write.
+ * \param fmt Printf format string.
+ * \param ap Va_list of printf arguments.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of write failure.
+ */
+int xs_vprintf(struct xs_transaction t, const char *dir,
+ const char *node, const char *fmt, va_list ap);
+
+/**
+ * Multi-file read within a single directory and scanf parsing of
+ * the results.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the paths to read.
+ * \param ... A variable number of argument triples specifying
+ * the file name, scanf-style format string, and
+ * output variable (pointer to storage of the results).
+ * The list of triples must be terminated
+ * with a final NULL argument. A NULL format string
+ * will cause the entire contents of the given file
+ * to be assigned as a NUL terminated, M_XENSTORE heap
+ * backed, string to the output parameter of that tuple.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of read failure.
+ *
+ * Example:
+ * char protocol_abi[64];
+ * uint32_t ring_ref;
+ * char *dev_type;
+ * int error;
+ *
+ * error = xs_gather(XST_NIL, xenbus_get_node(dev),
+ * "ring-ref", "%" PRIu32, &ring_ref,
+ * "protocol", "%63s", protocol_abi,
+ * "device-type", NULL, &dev_type,
+ * NULL);
+ *
+ * ...
+ *
+ * free(dev_type, M_XENSTORE);
+ */
+int xs_gather(struct xs_transaction t, const char *dir, ...);
+
+/**
+ * Register a XenStore watch.
+ *
+ * XenStore watches allow a client to be notified via a callback (embedded
+ * within the watch object) of changes to an object in the XenStore.
+ *
+ * \param watch An xs_watch struct with its node and callback fields
+ * properly initialized.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of write failure.
EEXIST errors from the XenStore
+ * are suppressed, allowing multiple, physically different,
+ * xs_watch objects to watch the same path in the XenStore.
+ */
+int xs_register_watch(struct xs_watch *watch);
+
+/**
+ * Unregister a XenStore watch.
+ *
+ * \param watch An xs_watch object previously used in a successful call
+ * to xs_register_watch().
+ *
+ * The xs_watch object's node field is not altered by this call.
+ * It is the caller's responsibility to properly dispose of both the
+ * watch object and the data pointed to by watch->node.
+ */
+void xs_unregister_watch(struct xs_watch *watch);
+
+/**
+ * Allocate and return an sbuf containing the XenStore path string
+ * <dir>/<name>. If name is the empty string, the returned sbuf contains
+ * the path string <dir>.
+ *
+ * \param dir The NUL terminated directory prefix for the new path.
+ * \param name The NUL terminated basename for the new path.
+ *
+ * \return A buffer containing the joined path.
+ */
+struct sbuf *xs_join(const char *, const char *);
+
+#endif /* _XEN_XENSTORE_XENSTOREVAR_H */
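+
+/*
+ * End-to-end usage sketch (editor's illustration, not part of the
+ * original header; the XenStore paths and names used here are
+ * assumptions, not values mandated by this API):
+ *
+ *	static void
+ *	my_watch_cb(struct xs_watch *w, const char **vec, unsigned int len)
+ *	{
+ *		printf("%s changed\n", vec[XS_WATCH_PATH]);
+ *	}
+ *
+ *	static char my_node[] = "backend/vbd/0/768/state";
+ *	static struct xs_watch my_watch = {
+ *		.node = my_node,
+ *		.callback = my_watch_cb,
+ *	};
+ *
+ *	xs_register_watch(&my_watch);
+ *	xs_printf(XST_NIL, "device/vbd/768", "state", "%d", 3);
+ *	...
+ *	xs_unregister_watch(&my_watch);
+ */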