Diffstat (limited to 'sys')
-rw-r--r--  sys/boot/common/gpt.c | 4
-rw-r--r--  sys/boot/i386/boot2/boot2.c | 4
-rw-r--r--  sys/conf/files | 19
-rw-r--r--  sys/dev/acpica/acpi_pci.c | 2
-rw-r--r--  sys/dev/bge/if_bge.c | 55
-rw-r--r--  sys/dev/bge/if_bgereg.h | 3
-rw-r--r--  sys/dev/iwi/if_iwi.c | 4
-rw-r--r--  sys/dev/mfi/mfireg.h | 4
-rw-r--r--  sys/dev/mvs/mvs.c | 125
-rw-r--r--  sys/dev/mvs/mvs_pci.c | 1
-rw-r--r--  sys/dev/mvs/mvs_soc.c | 1
-rw-r--r--  sys/dev/pci/pci.c | 62
-rw-r--r--  sys/dev/pci/pci_pci.c | 2
-rw-r--r--  sys/dev/pci/pci_private.h | 1
-rw-r--r--  sys/dev/pci/pcireg.h | 32
-rw-r--r--  sys/dev/pci/pcivar.h | 6
-rw-r--r--  sys/dev/sis/if_sis.c | 13
-rw-r--r--  sys/dev/xen/balloon/balloon.c | 10
-rw-r--r--  sys/dev/xen/blkback/blkback.c | 3663
-rw-r--r--  sys/dev/xen/blkfront/blkfront.c | 596
-rw-r--r--  sys/dev/xen/blkfront/block.h | 64
-rw-r--r--  sys/dev/xen/control/control.c | 493
-rw-r--r--  sys/dev/xen/netfront/netfront.c | 72
-rw-r--r--  sys/dev/xen/xenpci/evtchn.c | 43
-rw-r--r--  sys/dev/xen/xenpci/xenpci.c | 81
-rw-r--r--  sys/fs/nfsserver/nfs_nfsdport.c | 10
-rw-r--r--  sys/geom/eli/g_eli.c | 258
-rw-r--r--  sys/geom/eli/g_eli.h | 8
-rw-r--r--  sys/geom/eli/g_eli_ctl.c | 215
-rw-r--r--  sys/geom/eli/g_eli_integrity.c | 13
-rw-r--r--  sys/geom/eli/g_eli_privacy.c | 61
-rw-r--r--  sys/i386/xen/xen_machdep.c | 6
-rw-r--r--  sys/kern/kern_exec.c | 12
-rw-r--r--  sys/kern/kern_exit.c | 32
-rw-r--r--  sys/kern/kern_fork.c | 16
-rw-r--r--  sys/kern/kern_ktrace.c | 126
-rw-r--r--  sys/kern/kern_syscalls.c | 9
-rw-r--r--  sys/kern/kern_thr.c | 23
-rw-r--r--  sys/mips/rmi/board.c | 12
-rw-r--r--  sys/mips/rmi/dev/nlge/if_nlge.c | 2
-rw-r--r--  sys/mips/rmi/xlr_machdep.c | 10
-rw-r--r--  sys/net/if.c | 46
-rw-r--r--  sys/net/if.h | 10
-rw-r--r--  sys/sys/ktrace.h | 4
-rw-r--r--  sys/sys/sockio.h | 3
-rw-r--r--  sys/vm/swap_pager.c | 4
-rw-r--r--  sys/vm/vm_map.c | 6
-rw-r--r--  sys/vm/vm_map.h | 2
-rw-r--r--  sys/xen/blkif.h | 145
-rw-r--r--  sys/xen/evtchn/evtchn.c | 6
-rw-r--r--  sys/xen/gnttab.c | 80
-rw-r--r--  sys/xen/gnttab.h | 9
-rw-r--r--  sys/xen/interface/grant_table.h | 2
-rw-r--r--  sys/xen/interface/hvm/params.h | 26
-rw-r--r--  sys/xen/interface/io/blkif.h | 31
-rw-r--r--  sys/xen/interface/io/protocols.h | 3
-rw-r--r--  sys/xen/interface/io/ring.h | 18
-rw-r--r--  sys/xen/interface/io/xenbus.h | 9
-rw-r--r--  sys/xen/reboot.c | 266
-rw-r--r--  sys/xen/xen_intr.h | 2
-rw-r--r--  sys/xen/xenbus/init.txt | 14
-rw-r--r--  sys/xen/xenbus/xenbus.c (renamed from sys/xen/xenbus/xenbus_client.c) | 141
-rw-r--r--  sys/xen/xenbus/xenbus_comms.c | 226
-rw-r--r--  sys/xen/xenbus/xenbus_comms.h | 48
-rw-r--r--  sys/xen/xenbus/xenbus_if.m | 14
-rw-r--r--  sys/xen/xenbus/xenbus_probe.c | 602
-rw-r--r--  sys/xen/xenbus/xenbus_probe_backend.c | 308
-rw-r--r--  sys/xen/xenbus/xenbus_xs.c | 935
-rw-r--r--  sys/xen/xenbus/xenbusb.c | 878
-rw-r--r--  sys/xen/xenbus/xenbusb.h | 272
-rw-r--r--  sys/xen/xenbus/xenbusb_back.c | 295
-rw-r--r--  sys/xen/xenbus/xenbusb_front.c | 195
-rw-r--r--  sys/xen/xenbus/xenbusb_if.m | 78
-rw-r--r--  sys/xen/xenbus/xenbusvar.h | 325
-rw-r--r--  sys/xen/xenstore/xenstore.c | 1654
-rw-r--r--  sys/xen/xenstore/xenstore_dev.c (renamed from sys/xen/xenbus/xenbus_dev.c) | 91
-rw-r--r--  sys/xen/xenstore/xenstore_internal.h | 39
-rw-r--r--  sys/xen/xenstore/xenstorevar.h | 338
78 files changed, 8974 insertions, 4324 deletions
diff --git a/sys/boot/common/gpt.c b/sys/boot/common/gpt.c
index 62e86dd..9c00980 100644
--- a/sys/boot/common/gpt.c
+++ b/sys/boot/common/gpt.c
@@ -49,7 +49,7 @@ static int curent, bootonce;
/*
* Buffer below 64kB passed on gptread(), which can hold at least
- * one sector od data (512 bytes).
+ * one sector of data (512 bytes).
*/
static char *secbuf;
@@ -62,7 +62,7 @@ gptupdate(const char *which, struct dsk *dskp, struct gpt_hdr *hdr,
/*
* We need to update the following for both primary and backup GPT:
- * 1. Sector on disk that contains curent partition.
+ * 1. Sector on disk that contains current partition.
* 2. Partition table checksum.
* 3. Header checksum.
* 4. Header on disk.
diff --git a/sys/boot/i386/boot2/boot2.c b/sys/boot/i386/boot2/boot2.c
index f521fd7..307d4c5 100644
--- a/sys/boot/i386/boot2/boot2.c
+++ b/sys/boot/i386/boot2/boot2.c
@@ -348,7 +348,7 @@ load(void)
return;
p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
bootinfo.bi_symtab = VTOP(p);
- memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
+ *(uint32_t*)p = hdr.ex.a_syms;
p += sizeof(hdr.ex.a_syms);
if (hdr.ex.a_syms) {
if (xfsread(ino, p, hdr.ex.a_syms))
@@ -385,7 +385,7 @@ load(void)
if (xfsread(ino, &es, sizeof(es)))
return;
for (i = 0; i < 2; i++) {
- memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
+ *(Elf32_Word *)p = es[i].sh_size;
p += sizeof(es[i].sh_size);
fs_off = es[i].sh_offset;
if (xfsread(ino, p, es[i].sh_size))
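The boot2 hunks above swap memcpy() calls for direct stores, a size optimization for the tightly constrained boot block. A minimal sketch of the equivalence, assuming (safely, on the x86 machines boot2 targets) that unaligned 32-bit stores are permitted:

    /* Old form: copies byte-by-byte, alignment-agnostic. */
    memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));

    /* New form: a single 32-bit store; smaller code on i386. */
    *(uint32_t *)p = hdr.ex.a_syms;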
diff --git a/sys/conf/files b/sys/conf/files
index 75f16e5..74f25c1 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3008,19 +3008,20 @@ xen/gnttab.c optional xen | xenhvm
xen/features.c optional xen | xenhvm
xen/evtchn/evtchn.c optional xen
xen/evtchn/evtchn_dev.c optional xen | xenhvm
-xen/reboot.c optional xen
-xen/xenbus/xenbus_client.c optional xen | xenhvm
-xen/xenbus/xenbus_comms.c optional xen | xenhvm
-xen/xenbus/xenbus_dev.c optional xen | xenhvm
xen/xenbus/xenbus_if.m optional xen | xenhvm
-xen/xenbus/xenbus_probe.c optional xen | xenhvm
-#xen/xenbus/xenbus_probe_backend.c optional xen
-xen/xenbus/xenbus_xs.c optional xen | xenhvm
+xen/xenbus/xenbus.c optional xen | xenhvm
+xen/xenbus/xenbusb_if.m optional xen | xenhvm
+xen/xenbus/xenbusb.c optional xen | xenhvm
+xen/xenbus/xenbusb_front.c optional xen | xenhvm
+xen/xenbus/xenbusb_back.c optional xen | xenhvm
+xen/xenstore/xenstore.c optional xen | xenhvm
+xen/xenstore/xenstore_dev.c optional xen | xenhvm
dev/xen/balloon/balloon.c optional xen | xenhvm
+dev/xen/blkfront/blkfront.c optional xen | xenhvm
+dev/xen/blkback/blkback.c optional xen | xenhvm
dev/xen/console/console.c optional xen
dev/xen/console/xencons_ring.c optional xen
-dev/xen/blkfront/blkfront.c optional xen | xenhvm
+dev/xen/control/control.c optional xen | xenhvm
dev/xen/netfront/netfront.c optional xen | xenhvm
dev/xen/xenpci/xenpci.c optional xenpci
dev/xen/xenpci/evtchn.c optional xenpci
-dev/xen/xenpci/machine_reboot.c optional xenpci
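The conf/files entries above gate each new source file on the xen and/or xenhvm options. As a hedged illustration (option and device names taken from stock FreeBSD Xen configurations of this period, not from this diff), an HVM guest kernel with paravirtualized drivers would pull in most of this glue with:

    # Kernel configuration fragment (illustrative)
    options 	XENHVM		# Xen HVM kernel infrastructure
    device		xenpci		# Xen HVM Hypervisor services driver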
diff --git a/sys/dev/acpica/acpi_pci.c b/sys/dev/acpica/acpi_pci.c
index bf7cf2e..a14e0ba 100644
--- a/sys/dev/acpica/acpi_pci.c
+++ b/sys/dev/acpica/acpi_pci.c
@@ -179,7 +179,7 @@ acpi_pci_set_powerstate_method(device_t dev, device_t child, int state)
*/
ACPI_SERIAL_BEGIN(pci_powerstate);
old_state = pci_get_powerstate(child);
- if (old_state < state) {
+ if (old_state < state && pci_do_power_suspend) {
error = pci_set_powerstate_method(dev, child, state);
if (error)
goto out;
diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c
index 662713f..95de34c 100644
--- a/sys/dev/bge/if_bge.c
+++ b/sys/dev/bge/if_bge.c
@@ -374,6 +374,7 @@ static void bge_tick(void *);
static void bge_stats_clear_regs(struct bge_softc *);
static void bge_stats_update(struct bge_softc *);
static void bge_stats_update_regs(struct bge_softc *);
+static struct mbuf *bge_check_short_dma(struct mbuf *);
static struct mbuf *bge_setup_tso(struct bge_softc *, struct mbuf *,
uint16_t *);
static int bge_encap(struct bge_softc *, struct mbuf **, uint32_t *);
@@ -1692,6 +1693,11 @@ bge_blockinit(struct bge_softc *sc)
bge_writembx(sc, BGE_MBX_RX_MINI_PROD_LO, 0);
}
+ /* Choose de-pipeline mode for BCM5906 A1. */
+ if (sc->bge_asicrev == BGE_ASICREV_BCM5906 &&
+ sc->bge_chiprev == BGE_CHIPID_BCM5906_A1)
+ CSR_WRITE_4(sc, BGE_ISO_PKT_TX,
+ (CSR_READ_4(sc, BGE_ISO_PKT_TX) & ~3) | 2);
/*
* The BD ring replenish thresholds control how often the
* hardware fetches new BD's from the producer rings in host
@@ -2633,6 +2639,8 @@ bge_attach(device_t dev)
case BGE_ASICREV_BCM5752:
case BGE_ASICREV_BCM5906:
sc->bge_flags |= BGE_FLAG_575X_PLUS;
+ if (sc->bge_asicrev == BGE_ASICREV_BCM5906)
+ sc->bge_flags |= BGE_FLAG_SHORT_DMA_BUG;
/* FALLTHROUGH */
case BGE_ASICREV_BCM5705:
sc->bge_flags |= BGE_FLAG_5705_PLUS;
@@ -4060,6 +4068,39 @@ bge_cksum_pad(struct mbuf *m)
}
static struct mbuf *
+bge_check_short_dma(struct mbuf *m)
+{
+ struct mbuf *n;
+ int found;
+
+ /*
+ * If the device receives two back-to-back send BDs with less than
+ * or equal to 8 total bytes then the device may hang. The two
+ * back-to-back send BDs must be in the same frame for this failure
+ * to occur. Scan the mbuf chain and see whether two back-to-back
+ * send BDs are there. If this is the case, allocate a new mbuf
+ * and copy the frame to work around the silicon bug.
+ */
+ for (n = m, found = 0; n != NULL; n = n->m_next) {
+ if (n->m_len < 8) {
+ found++;
+ if (found > 1)
+ break;
+ continue;
+ }
+ found = 0;
+ }
+
+ if (found > 1) {
+ n = m_defrag(m, M_DONTWAIT);
+ if (n == NULL)
+ m_freem(m);
+ } else
+ n = m;
+ return (n);
+}
+
+static struct mbuf *
bge_setup_tso(struct bge_softc *sc, struct mbuf *m, uint16_t *mss)
{
struct ip *ip;
@@ -4132,6 +4173,13 @@ bge_encap(struct bge_softc *sc, struct mbuf **m_head, uint32_t *txidx)
csum_flags = 0;
mss = 0;
vlan_tag = 0;
+ if ((sc->bge_flags & BGE_FLAG_SHORT_DMA_BUG) != 0 &&
+ m->m_next != NULL) {
+ *m_head = bge_check_short_dma(m);
+ if (*m_head == NULL)
+ return (ENOBUFS);
+ m = *m_head;
+ }
if ((m->m_pkthdr.csum_flags & CSUM_TSO) != 0) {
*m_head = m = bge_setup_tso(sc, m, &mss);
if (*m_head == NULL)
@@ -4366,6 +4414,7 @@ bge_init_locked(struct bge_softc *sc)
{
struct ifnet *ifp;
uint16_t *m;
+ uint32_t mode;
BGE_LOCK_ASSERT(sc);
@@ -4471,8 +4520,12 @@ bge_init_locked(struct bge_softc *sc)
/* Init TX ring. */
bge_init_tx_ring(sc);
+ /* Enable TX MAC state machine lockup fix. */
+ mode = CSR_READ_4(sc, BGE_TX_MODE);
+ if (BGE_IS_5755_PLUS(sc) || sc->bge_asicrev == BGE_ASICREV_BCM5906)
+ mode |= BGE_TXMODE_MBUF_LOCKUP_FIX;
/* Turn on transmitter. */
- BGE_SETBIT(sc, BGE_TX_MODE, BGE_TXMODE_ENABLE);
+ CSR_WRITE_4(sc, BGE_TX_MODE, mode | BGE_TXMODE_ENABLE);
/* Turn on receiver. */
BGE_SETBIT(sc, BGE_RX_MODE, BGE_RXMODE_ENABLE);
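For reference, the scan in bge_check_short_dma() can be restated as a self-contained function; this sketch mirrors the diff's logic over an array of segment lengths rather than an mbuf chain:

    #include <stddef.h>

    /*
     * Returns nonzero when two consecutive segments are each shorter
     * than 8 bytes -- the condition that makes bge_encap() defragment
     * the frame before handing it to the BCM5906.
     */
    static int
    needs_defrag(const int *seglen, size_t nseg)
    {
            size_t i;
            int found = 0;

            for (i = 0; i < nseg; i++) {
                    if (seglen[i] < 8) {
                            if (++found > 1)
                                    return (1);
                    } else
                            found = 0;
            }
            return (0);
    }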
diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h
index 60a39f8..a4f3f83 100644
--- a/sys/dev/bge/if_bgereg.h
+++ b/sys/dev/bge/if_bgereg.h
@@ -765,6 +765,7 @@
#define BGE_TXMODE_FLOWCTL_ENABLE 0x00000010
#define BGE_TXMODE_BIGBACKOFF_ENABLE 0x00000020
#define BGE_TXMODE_LONGPAUSE_ENABLE 0x00000040
+#define BGE_TXMODE_MBUF_LOCKUP_FIX 0x00000100
/* Transmit MAC status register */
#define BGE_TXSTAT_RX_XOFFED 0x00000001
@@ -879,6 +880,7 @@
#define BGE_SDI_STATS_CTL 0x0C08
#define BGE_SDI_STATS_ENABLE_MASK 0x0C0C
#define BGE_SDI_STATS_INCREMENT_MASK 0x0C10
+#define BGE_ISO_PKT_TX 0x0C20
#define BGE_LOCSTATS_COS0 0x0C80
#define BGE_LOCSTATS_COS1 0x0C84
#define BGE_LOCSTATS_COS2 0x0C88
@@ -2727,6 +2729,7 @@ struct bge_softc {
#define BGE_FLAG_40BIT_BUG 0x01000000
#define BGE_FLAG_4G_BNDRY_BUG 0x02000000
#define BGE_FLAG_RX_ALIGNBUG 0x04000000
+#define BGE_FLAG_SHORT_DMA_BUG 0x08000000
uint32_t bge_phy_flags;
#define BGE_PHY_WIRESPEED 0x00000001
#define BGE_PHY_ADC_BUG 0x00000002
diff --git a/sys/dev/iwi/if_iwi.c b/sys/dev/iwi/if_iwi.c
index f5ba34f..62b53be 100644
--- a/sys/dev/iwi/if_iwi.c
+++ b/sys/dev/iwi/if_iwi.c
@@ -1356,7 +1356,7 @@ iwi_checkforqos(struct ieee80211vap *vap,
wme = NULL;
while (frm < efrm) {
- IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1], return);
+ IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1], break);
switch (*frm) {
case IEEE80211_ELEMID_VENDOR:
if (iswmeoui(frm))
@@ -1368,7 +1368,7 @@ iwi_checkforqos(struct ieee80211vap *vap,
ni = vap->iv_bss;
ni->ni_capinfo = capinfo;
- ni->ni_associd = associd;
+ ni->ni_associd = associd & 0x3fff;
if (wme != NULL)
ni->ni_flags |= IEEE80211_NODE_QOS;
else
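The iwi change masks the association ID before caching it: 802.11 transmits the AID with its two high bits set, so ANDing with 0x3fff keeps only the 14-bit identifier. A tiny illustration with a hypothetical value:

    #include <stdint.h>

    static uint16_t
    decode_aid(uint16_t associd)
    {
            /* An AID of 5 arrives on the air as 0xc005; the mask recovers 5. */
            return (associd & 0x3fff);
    }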
diff --git a/sys/dev/mfi/mfireg.h b/sys/dev/mfi/mfireg.h
index 17ab4b3..e08a16d 100644
--- a/sys/dev/mfi/mfireg.h
+++ b/sys/dev/mfi/mfireg.h
@@ -975,7 +975,9 @@ enum mfi_pd_state {
MFI_PD_STATE_OFFLINE = 0x10,
MFI_PD_STATE_FAILED = 0x11,
MFI_PD_STATE_REBUILD = 0x14,
- MFI_PD_STATE_ONLINE = 0x18
+ MFI_PD_STATE_ONLINE = 0x18,
+ MFI_PD_STATE_COPYBACK = 0x20,
+ MFI_PD_STATE_SYSTEM = 0x40
};
union mfi_ld_ref {
diff --git a/sys/dev/mvs/mvs.c b/sys/dev/mvs/mvs.c
index 11a8853..9968938 100644
--- a/sys/dev/mvs/mvs.c
+++ b/sys/dev/mvs/mvs.c
@@ -57,7 +57,8 @@ static int mvs_ch_deinit(device_t dev);
static int mvs_ch_suspend(device_t dev);
static int mvs_ch_resume(device_t dev);
static void mvs_dmainit(device_t dev);
-static void mvs_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
+static void mvs_dmasetupc_cb(void *xsc,
+ bus_dma_segment_t *segs, int nsegs, int error);
static void mvs_dmafini(device_t dev);
static void mvs_slotsalloc(device_t dev);
static void mvs_slotsfree(device_t dev);
@@ -79,7 +80,8 @@ static void mvs_crbq_intr(device_t dev);
static void mvs_begin_transaction(device_t dev, union ccb *ccb);
static void mvs_legacy_execute_transaction(struct mvs_slot *slot);
static void mvs_timeout(struct mvs_slot *slot);
-static void mvs_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error);
+static void mvs_dmasetprd(void *arg,
+ bus_dma_segment_t *segs, int nsegs, int error);
static void mvs_requeue_frozen(device_t dev);
static void mvs_execute_transaction(struct mvs_slot *slot);
static void mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et);
@@ -314,9 +316,11 @@ mvs_dmainit(device_t dev)
if (bus_dmamem_alloc(ch->dma.workrq_tag, (void **)&ch->dma.workrq, 0,
&ch->dma.workrq_map))
goto error;
- if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map, ch->dma.workrq,
- MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) || dcba.error) {
- bus_dmamem_free(ch->dma.workrq_tag, ch->dma.workrq, ch->dma.workrq_map);
+ if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map,
+ ch->dma.workrq, MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) ||
+ dcba.error) {
+ bus_dmamem_free(ch->dma.workrq_tag,
+ ch->dma.workrq, ch->dma.workrq_map);
goto error;
}
ch->dma.workrq_bus = dcba.maddr;
@@ -329,9 +333,11 @@ mvs_dmainit(device_t dev)
if (bus_dmamem_alloc(ch->dma.workrp_tag, (void **)&ch->dma.workrp, 0,
&ch->dma.workrp_map))
goto error;
- if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map, ch->dma.workrp,
- MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) || dcba.error) {
- bus_dmamem_free(ch->dma.workrp_tag, ch->dma.workrp, ch->dma.workrp_map);
+ if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map,
+ ch->dma.workrp, MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) ||
+ dcba.error) {
+ bus_dmamem_free(ch->dma.workrp_tag,
+ ch->dma.workrp, ch->dma.workrp_map);
goto error;
}
ch->dma.workrp_bus = dcba.maddr;
@@ -371,7 +377,8 @@ mvs_dmafini(device_t dev)
}
if (ch->dma.workrp_bus) {
bus_dmamap_unload(ch->dma.workrp_tag, ch->dma.workrp_map);
- bus_dmamem_free(ch->dma.workrp_tag, ch->dma.workrp, ch->dma.workrp_map);
+ bus_dmamem_free(ch->dma.workrp_tag,
+ ch->dma.workrp, ch->dma.workrp_map);
ch->dma.workrp_bus = 0;
ch->dma.workrp_map = NULL;
ch->dma.workrp = NULL;
@@ -382,7 +389,8 @@ mvs_dmafini(device_t dev)
}
if (ch->dma.workrq_bus) {
bus_dmamap_unload(ch->dma.workrq_tag, ch->dma.workrq_map);
- bus_dmamem_free(ch->dma.workrq_tag, ch->dma.workrq, ch->dma.workrq_map);
+ bus_dmamem_free(ch->dma.workrq_tag,
+ ch->dma.workrq, ch->dma.workrq_map);
ch->dma.workrq_bus = 0;
ch->dma.workrq_map = NULL;
ch->dma.workrq = NULL;
@@ -444,14 +452,16 @@ mvs_setup_edma_queues(device_t dev)
ATA_OUTL(ch->r_mem, EDMA_REQQBAH, work >> 32);
ATA_OUTL(ch->r_mem, EDMA_REQQIP, work & 0xffffffff);
ATA_OUTL(ch->r_mem, EDMA_REQQOP, work & 0xffffffff);
- bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map, BUS_DMASYNC_PREWRITE);
+ bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
+ BUS_DMASYNC_PREWRITE);
/* Responses queue. */
- bzero(ch->dma.workrp, 256);
+ memset(ch->dma.workrp, 0xff, MVS_WORKRP_SIZE);
work = ch->dma.workrp_bus;
ATA_OUTL(ch->r_mem, EDMA_RESQBAH, work >> 32);
ATA_OUTL(ch->r_mem, EDMA_RESQIP, work & 0xffffffff);
ATA_OUTL(ch->r_mem, EDMA_RESQOP, work & 0xffffffff);
- bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map, BUS_DMASYNC_PREREAD);
+ bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
+ BUS_DMASYNC_PREREAD);
ch->out_idx = 0;
ch->in_idx = 0;
}
@@ -678,20 +688,15 @@ mvs_ch_intr(void *data)
int i, ccs, port = -1, selfdis = 0;
int edma = (ch->numtslots != 0 || ch->numdslots != 0);
-//device_printf(dev, "irq cause %02x EDMA %d IEC %08x\n",
-// arg->cause, edma, ATA_INL(ch->r_mem, EDMA_IEC));
/* New item in response queue. */
if ((arg->cause & 2) && edma)
mvs_crbq_intr(dev);
/* Some error or special event. */
if (arg->cause & 1) {
iec = ATA_INL(ch->r_mem, EDMA_IEC);
-//device_printf(dev, "irq cause %02x EDMA %d IEC %08x\n",
-// arg->cause, edma, iec);
if (iec & EDMA_IE_SERRINT) {
serr = ATA_INL(ch->r_mem, SATA_SE);
ATA_OUTL(ch->r_mem, SATA_SE, serr);
-//device_printf(dev, "SERR %08x\n", serr);
}
/* EDMA self-disabled due to error. */
if (iec & EDMA_IE_ESELFDIS)
@@ -706,7 +711,6 @@ mvs_ch_intr(void *data)
fisic = SATA_FISC_FISWAIT4HOSTRDYEN_B1;
else /* For Gen-IIe - read FIS interrupt cause. */
fisic = ATA_INL(ch->r_mem, SATA_FISIC);
-//device_printf(dev, "FISIC %08x\n", fisic);
}
if (selfdis)
ch->curr_mode = MVS_EDMA_UNKNOWN;
@@ -745,7 +749,6 @@ mvs_ch_intr(void *data)
}
}
}
-//device_printf(dev, "err slot %d port %d\n", ccs, port);
mvs_requeue_frozen(dev);
for (i = 0; i < MVS_MAX_SLOTS; i++) {
/* XXX: requests in loading state. */
@@ -771,7 +774,8 @@ mvs_ch_intr(void *data)
ch->fatalerr = 1;
}
} else if (iec & 0xfc1e9000) {
- if (ch->numtslots == 0 && i != ccs && port != -2)
+ if (ch->numtslots == 0 &&
+ i != ccs && port != -2)
et = MVS_ERR_INNOCENT;
else
et = MVS_ERR_SATA;
@@ -823,8 +827,6 @@ mvs_legacy_intr(device_t dev)
/* Clear interrupt and get status. */
status = mvs_getstatus(dev, 1);
-// device_printf(dev, "Legacy intr status %02x\n",
-// status);
if (slot->state < MVS_SLOT_RUNNING)
return;
port = ccb->ccb_h.target_id & 0x0f;
@@ -867,7 +869,8 @@ mvs_legacy_intr(device_t dev)
/* If data write command - put them */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
- device_printf(dev, "timeout waiting for write DRQ\n");
+ device_printf(dev,
+ "timeout waiting for write DRQ\n");
et = MVS_ERR_TIMEOUT;
goto end_finished;
}
@@ -890,19 +893,18 @@ mvs_legacy_intr(device_t dev)
ATA_OUTL(ch->r_mem, DMA_C, 0);
goto end_finished;
} else { /* ATAPI PIO */
- length = ATA_INB(ch->r_mem,ATA_CYL_LSB) | (ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8);
+ length = ATA_INB(ch->r_mem,ATA_CYL_LSB) |
+ (ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8);
ireason = ATA_INB(ch->r_mem,ATA_IREASON);
-//device_printf(dev, "status %02x, ireason %02x, length %d\n", status, ireason, length);
switch ((ireason & (ATA_I_CMD | ATA_I_IN)) |
(status & ATA_S_DRQ)) {
case ATAPI_P_CMDOUT:
-device_printf(dev, "ATAPI CMDOUT\n");
+ device_printf(dev, "ATAPI CMDOUT\n");
/* Return wait for interrupt */
return;
case ATAPI_P_WRITE:
-//device_printf(dev, "ATAPI WRITE\n");
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
device_printf(dev, "trying to write on read buffer\n");
et = MVS_ERR_TFE;
@@ -920,7 +922,6 @@ device_printf(dev, "ATAPI CMDOUT\n");
return;
case ATAPI_P_READ:
-//device_printf(dev, "ATAPI READ\n");
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
device_printf(dev, "trying to read on write buffer\n");
et = MVS_ERR_TFE;
@@ -937,7 +938,6 @@ device_printf(dev, "ATAPI CMDOUT\n");
return;
case ATAPI_P_DONEDRQ:
-device_printf(dev, "ATAPI DONEDRQ\n");
device_printf(dev,
"WARNING - DONEDRQ non conformant device\n");
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
@@ -958,13 +958,13 @@ device_printf(dev, "ATAPI DONEDRQ\n");
case ATAPI_P_ABORT:
case ATAPI_P_DONE:
-//device_printf(dev, "ATAPI ABORT/DONE\n");
if (status & (ATA_S_ERROR | ATA_S_DWF))
et = MVS_ERR_TFE;
goto end_finished;
default:
- device_printf(dev, "unknown transfer phase (status %02x, ireason %02x)\n",
+ device_printf(dev, "unknown transfer phase"
+ " (status %02x, ireason %02x)\n",
status, ireason);
et = MVS_ERR_TFE;
}
@@ -980,38 +980,54 @@ mvs_crbq_intr(device_t dev)
struct mvs_channel *ch = device_get_softc(dev);
struct mvs_crpb *crpb;
union ccb *ccb;
- int in_idx, cin_idx, slot;
+ int in_idx, fin_idx, cin_idx, slot;
+ uint32_t val;
uint16_t flags;
- in_idx = (ATA_INL(ch->r_mem, EDMA_RESQIP) & EDMA_RESQP_ERPQP_MASK) >>
+ val = ATA_INL(ch->r_mem, EDMA_RESQIP);
+ if (val == 0)
+ val = ATA_INL(ch->r_mem, EDMA_RESQIP);
+ in_idx = (val & EDMA_RESQP_ERPQP_MASK) >>
EDMA_RESQP_ERPQP_SHIFT;
bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
BUS_DMASYNC_POSTREAD);
- cin_idx = ch->in_idx;
+ fin_idx = cin_idx = ch->in_idx;
ch->in_idx = in_idx;
while (in_idx != cin_idx) {
crpb = (struct mvs_crpb *)
- (ch->dma.workrp + MVS_CRPB_OFFSET + (MVS_CRPB_SIZE * cin_idx));
+ (ch->dma.workrp + MVS_CRPB_OFFSET +
+ (MVS_CRPB_SIZE * cin_idx));
slot = le16toh(crpb->id) & MVS_CRPB_TAG_MASK;
flags = le16toh(crpb->rspflg);
-//device_printf(dev, "CRPB %d %d %04x\n", cin_idx, slot, flags);
/*
* Handle only successful completions here.
* Errors will be handled by main intr handler.
*/
- if (ch->numtslots != 0 || (flags & EDMA_IE_EDEVERR) == 0) {
-if ((flags >> 8) & ATA_S_ERROR)
-device_printf(dev, "ERROR STATUS CRPB %d %d %04x\n", cin_idx, slot, flags);
+ if (crpb->id == 0xffff && crpb->rspflg == 0xffff) {
+ device_printf(dev, "Unfilled CRPB "
+ "%d (%d->%d) tag %d flags %04x rs %08x\n",
+ cin_idx, fin_idx, in_idx, slot, flags, ch->rslots);
+ } else if (ch->numtslots != 0 ||
+ (flags & EDMA_IE_EDEVERR) == 0) {
+ crpb->id = 0xffff;
+ crpb->rspflg = 0xffff;
if (ch->slot[slot].state >= MVS_SLOT_RUNNING) {
ccb = ch->slot[slot].ccb;
- ccb->ataio.res.status = (flags & MVS_CRPB_ATASTS_MASK) >>
+ ccb->ataio.res.status =
+ (flags & MVS_CRPB_ATASTS_MASK) >>
MVS_CRPB_ATASTS_SHIFT;
mvs_end_transaction(&ch->slot[slot], MVS_ERR_NONE);
- } else
-device_printf(dev, "EMPTY CRPB %d (->%d) %d %04x\n", cin_idx, in_idx, slot, flags);
- } else
-device_printf(dev, "ERROR FLAGS CRPB %d %d %04x\n", cin_idx, slot, flags);
-
+ } else {
+ device_printf(dev, "Unused tag in CRPB "
+ "%d (%d->%d) tag %d flags %04x rs %08x\n",
+ cin_idx, fin_idx, in_idx, slot, flags,
+ ch->rslots);
+ }
+ } else {
+ device_printf(dev,
+ "CRPB with error %d tag %d flags %04x\n",
+ cin_idx, slot, flags);
+ }
cin_idx = (cin_idx + 1) & (MVS_MAX_SLOTS - 1);
}
bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
@@ -1266,8 +1282,6 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
ch->rslots |= (1 << slot->slot);
ATA_OUTB(ch->r_mem, SATA_SATAICTL, port << SATA_SATAICTL_PMPTX_SHIFT);
if (ccb->ccb_h.func_code == XPT_ATA_IO) {
-// device_printf(dev, "%d Legacy command %02x size %d\n",
-// port, ccb->ataio.cmd.command, ccb->ataio.dxfer_len);
mvs_tfd_write(dev, ccb);
/* Device reset doesn't interrupt. */
if (ccb->ataio.cmd.command == ATA_DEVICE_RESET) {
@@ -1287,7 +1301,8 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
/* If data write command - output the data */
if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
- device_printf(dev, "timeout waiting for write DRQ\n");
+ device_printf(dev,
+ "timeout waiting for write DRQ\n");
mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
return;
}
@@ -1296,9 +1311,6 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
ch->transfersize / 2);
}
} else {
-// device_printf(dev, "%d ATAPI command %02x size %d dma %d\n",
-// port, ccb->csio.cdb_io.cdb_bytes[0], ccb->csio.dxfer_len,
-// ch->basic_dma);
ch->donecount = 0;
ch->transfersize = min(ccb->csio.dxfer_len,
ch->curr[port].bytecount);
@@ -1331,7 +1343,8 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
DELAY(20);
}
if (timeout <= 0) {
- device_printf(dev, "timeout waiting for ATAPI command ready\n");
+ device_printf(dev,
+ "timeout waiting for ATAPI command ready\n");
mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
return;
}
@@ -1371,8 +1384,6 @@ mvs_execute_transaction(struct mvs_slot *slot)
int port = ccb->ccb_h.target_id & 0x0f;
int i;
-// device_printf(dev, "%d EDMA command %02x size %d slot %d tag %d\n",
-// port, ccb->ataio.cmd.command, ccb->ataio.dxfer_len, slot->slot, slot->tag);
/* Get address of the prepared EPRD */
eprd = ch->dma.workrq_bus + MVS_EPRD_OFFSET + (MVS_EPRD_SIZE * slot->slot);
/* Prepare CRQB. Gen IIe uses different CRQB format. */
@@ -1554,7 +1565,6 @@ mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et)
union ccb *ccb = slot->ccb;
int lastto;
-//device_printf(dev, "cmd done status %d\n", et);
bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
BUS_DMASYNC_POSTWRITE);
/* Read result registers to the result struct
@@ -1792,7 +1802,8 @@ mvs_process_read_log(device_t dev, union ccb *ccb)
if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
device_printf(dev, "Error while READ LOG EXT\n");
else if ((data[0] & 0x80) == 0) {
- device_printf(dev, "Non-queued command error in READ LOG EXT\n");
+ device_printf(dev,
+ "Non-queued command error in READ LOG EXT\n");
}
for (i = 0; i < MVS_MAX_SLOTS; i++) {
if (!ch->hold[i])
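A recurring idiom in the CRPB changes above is poisoning: the response queue is pre-filled with 0xff bytes (note the memset that replaces the old bzero), and every consumed entry is re-poisoned, so a slot that still reads 0xffff/0xffff was never written by the controller. Restated compactly, using only the fields visible in this diff:

    #include <stdint.h>

    struct crpb { uint16_t id; uint16_t rspflg; };

    /* Returns nonzero for a completion the hardware actually wrote. */
    static int
    crpb_consume(struct crpb *crpb)
    {
            if (crpb->id == 0xffff && crpb->rspflg == 0xffff)
                    return (0);     /* unfilled slot: report and skip */
            crpb->id = 0xffff;      /* re-poison for the next lap */
            crpb->rspflg = 0xffff;
            return (1);
    }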
diff --git a/sys/dev/mvs/mvs_pci.c b/sys/dev/mvs/mvs_pci.c
index 6b18e2c..e2e37da 100644
--- a/sys/dev/mvs/mvs_pci.c
+++ b/sys/dev/mvs/mvs_pci.c
@@ -339,7 +339,6 @@ mvs_intr(void *data)
u_int32_t ic, aic;
ic = ATA_INL(ctlr->r_mem, CHIP_MIC);
-//device_printf(ctlr->dev, "irq MIC:%08x\n", ic);
if (ctlr->msi) {
/* We have to mask MSI during processing. */
mtx_lock(&ctlr->mtx);
diff --git a/sys/dev/mvs/mvs_soc.c b/sys/dev/mvs/mvs_soc.c
index 03029c2..ed861f2 100644
--- a/sys/dev/mvs/mvs_soc.c
+++ b/sys/dev/mvs/mvs_soc.c
@@ -295,7 +295,6 @@ mvs_intr(void *data)
u_int32_t ic, aic;
ic = ATA_INL(ctlr->r_mem, CHIP_SOC_MIC);
-//device_printf(ctlr->dev, "irq MIC:%08x\n", ic);
if ((ic & IC_HC0) == 0)
return;
/* Acknowledge interrupts of this HC. */
diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index d1b211a..ef80f81 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -182,6 +182,7 @@ struct pci_quirk {
int type;
#define PCI_QUIRK_MAP_REG 1 /* PCI map register in weird place */
#define PCI_QUIRK_DISABLE_MSI 2 /* MSI/MSI-X doesn't work */
+#define PCI_QUIRK_ENABLE_MSI_VM 3 /* Older chipset in VM where MSI works */
int arg1;
int arg2;
};
@@ -218,6 +219,12 @@ struct pci_quirk pci_quirks[] = {
*/
{ 0x74501022, PCI_QUIRK_DISABLE_MSI, 0, 0 },
+ /*
+ * Some virtualization environments emulate an older chipset
+ * but support MSI just fine. QEMU uses the Intel 82440.
+ */
+ { 0x12378086, PCI_QUIRK_ENABLE_MSI_VM, 0, 0 },
+
{ 0 }
};
@@ -257,6 +264,12 @@ SYSCTL_INT(_hw_pci, OID_AUTO, do_power_resume, CTLFLAG_RW,
&pci_do_power_resume, 1,
"Transition from D3 -> D0 on resume.");
+int pci_do_power_suspend = 1;
+TUNABLE_INT("hw.pci.do_power_suspend", &pci_do_power_suspend);
+SYSCTL_INT(_hw_pci, OID_AUTO, do_power_suspend, CTLFLAG_RW,
+ &pci_do_power_suspend, 1,
+ "Transition from D0 -> D3 on suspend.");
+
static int pci_do_msi = 1;
TUNABLE_INT("hw.pci.enable_msi", &pci_do_msi);
SYSCTL_INT(_hw_pci, OID_AUTO, enable_msi, CTLFLAG_RW, &pci_do_msi, 1,
@@ -594,7 +607,7 @@ pci_read_extcap(device_t pcib, pcicfgregs *cfg)
if (cfg->pp.pp_cap == 0) {
cfg->pp.pp_cap = REG(ptr + PCIR_POWER_CAP, 2);
cfg->pp.pp_status = ptr + PCIR_POWER_STATUS;
- cfg->pp.pp_pmcsr = ptr + PCIR_POWER_PMCSR;
+ cfg->pp.pp_bse = ptr + PCIR_POWER_BSE;
if ((nextptr - ptr) > PCIR_POWER_DATA)
cfg->pp.pp_data = ptr + PCIR_POWER_DATA;
}
@@ -1828,6 +1841,23 @@ pci_msi_device_blacklisted(device_t dev)
}
/*
+ * Returns true if a specified chipset supports MSI when it is
+ * emulated hardware in a virtual machine.
+ */
+static int
+pci_msi_vm_chipset(device_t dev)
+{
+ struct pci_quirk *q;
+
+ for (q = &pci_quirks[0]; q->devid; q++) {
+ if (q->devid == pci_get_devid(dev) &&
+ q->type == PCI_QUIRK_ENABLE_MSI_VM)
+ return (1);
+ }
+ return (0);
+}
+
+/*
* Determine if MSI is blacklisted globally on this system. Currently,
* we just check for blacklisted chipsets as represented by the
* host-PCI bridge at device 0:0:0. In the future, it may become
@@ -1843,8 +1873,14 @@ pci_msi_blacklisted(void)
return (0);
/* Blacklist all non-PCI-express and non-PCI-X chipsets. */
- if (!(pcie_chipset || pcix_chipset))
+ if (!(pcie_chipset || pcix_chipset)) {
+ if (vm_guest != VM_GUEST_NO) {
+ dev = pci_find_bsf(0, 0, 0);
+ if (dev != NULL)
+ return (pci_msi_vm_chipset(dev) == 0);
+ }
return (1);
+ }
dev = pci_find_bsf(0, 0, 0);
if (dev != NULL)
@@ -2954,7 +2990,9 @@ pci_suspend(device_t dev)
free(devlist, M_TEMP);
return (error);
}
- pci_set_power_children(dev, devlist, numdevs, PCI_POWERSTATE_D3);
+ if (pci_do_power_suspend)
+ pci_set_power_children(dev, devlist, numdevs,
+ PCI_POWERSTATE_D3);
free(devlist, M_TEMP);
return (0);
}
@@ -3656,9 +3694,15 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid,
res = NULL;
pci_read_bar(child, *rid, &map, &testval);
- /* Ignore a BAR with a base of 0. */
- if ((*rid == PCIR_BIOS && pci_rombase(testval) == 0) ||
- pci_mapbase(testval) == 0)
+ /*
+ * Determine the size of the BAR and ignore BARs with a size
+ * of 0. Device ROM BARs use a different mask value.
+ */
+ if (*rid == PCIR_BIOS)
+ mapsize = pci_romsize(testval);
+ else
+ mapsize = pci_mapsize(testval);
+ if (mapsize == 0)
goto out;
if (PCI_BAR_MEM(testval) || *rid == PCIR_BIOS) {
@@ -3687,13 +3731,7 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid,
* actually uses and we would otherwise have a
* situation where we might allocate the excess to
* another driver, which won't work.
- *
- * Device ROM BARs use a different mask value.
*/
- if (*rid == PCIR_BIOS)
- mapsize = pci_romsize(testval);
- else
- mapsize = pci_mapsize(testval);
count = 1UL << mapsize;
if (RF_ALIGNMENT(flags) < mapsize)
flags = (flags & ~RF_ALIGNMENT_MASK) | RF_ALIGNMENT_LOG2(mapsize);
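The pci_do_power_suspend knob introduced above is both a loader tunable and a read/write sysctl, so the new D0 -> D3 suspend transitions can be disabled without rebuilding the kernel:

    # At boot, in /boot/loader.conf:
    hw.pci.do_power_suspend="0"

    # Or at run time:
    sysctl hw.pci.do_power_suspend=0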
diff --git a/sys/dev/pci/pci_pci.c b/sys/dev/pci/pci_pci.c
index 9992b81..7915818 100644
--- a/sys/dev/pci/pci_pci.c
+++ b/sys/dev/pci/pci_pci.c
@@ -447,7 +447,7 @@ pcib_suspend(device_t dev)
pcib_cfg_save(device_get_softc(dev));
error = bus_generic_suspend(dev);
- if (error == 0 && pci_do_power_resume) {
+ if (error == 0 && pci_do_power_suspend) {
dstate = PCI_POWERSTATE_D3;
pcib = device_get_parent(device_get_parent(dev));
if (PCIB_POWER_FOR_SLEEP(pcib, dev, &dstate) == 0)
diff --git a/sys/dev/pci/pci_private.h b/sys/dev/pci/pci_private.h
index 70d887b..90866ef 100644
--- a/sys/dev/pci/pci_private.h
+++ b/sys/dev/pci/pci_private.h
@@ -39,6 +39,7 @@
DECLARE_CLASS(pci_driver);
extern int pci_do_power_resume;
+extern int pci_do_power_suspend;
void pci_add_children(device_t dev, int domain, int busno,
size_t dinfo_size);
diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h
index a0d12db..02fa7ea 100644
--- a/sys/dev/pci/pcireg.h
+++ b/sys/dev/pci/pcireg.h
@@ -427,12 +427,16 @@
#define PCIR_POWER_CAP 0x2
#define PCIM_PCAP_SPEC 0x0007
#define PCIM_PCAP_PMEREQCLK 0x0008
-#define PCIM_PCAP_PMEREQPWR 0x0010
#define PCIM_PCAP_DEVSPECINIT 0x0020
-#define PCIM_PCAP_DYNCLOCK 0x0040
-#define PCIM_PCAP_SECCLOCK 0x00c0
-#define PCIM_PCAP_CLOCKMASK 0x00c0
-#define PCIM_PCAP_REQFULLCLOCK 0x0100
+#define PCIM_PCAP_AUXPWR_0 0x0000
+#define PCIM_PCAP_AUXPWR_55 0x0040
+#define PCIM_PCAP_AUXPWR_100 0x0080
+#define PCIM_PCAP_AUXPWR_160 0x00c0
+#define PCIM_PCAP_AUXPWR_220 0x0100
+#define PCIM_PCAP_AUXPWR_270 0x0140
+#define PCIM_PCAP_AUXPWR_320 0x0180
+#define PCIM_PCAP_AUXPWR_375 0x01c0
+#define PCIM_PCAP_AUXPWRMASK 0x01c0
#define PCIM_PCAP_D1SUPP 0x0200
#define PCIM_PCAP_D2SUPP 0x0400
#define PCIM_PCAP_D0PME 0x0800
@@ -447,16 +451,17 @@
#define PCIM_PSTAT_D2 0x0002
#define PCIM_PSTAT_D3 0x0003
#define PCIM_PSTAT_DMASK 0x0003
-#define PCIM_PSTAT_REPENABLE 0x0010
+#define PCIM_PSTAT_NOSOFTRESET 0x0008
#define PCIM_PSTAT_PMEENABLE 0x0100
#define PCIM_PSTAT_D0POWER 0x0000
#define PCIM_PSTAT_D1POWER 0x0200
#define PCIM_PSTAT_D2POWER 0x0400
#define PCIM_PSTAT_D3POWER 0x0600
#define PCIM_PSTAT_D0HEAT 0x0800
-#define PCIM_PSTAT_D1HEAT 0x1000
-#define PCIM_PSTAT_D2HEAT 0x1200
-#define PCIM_PSTAT_D3HEAT 0x1400
+#define PCIM_PSTAT_D1HEAT 0x0a00
+#define PCIM_PSTAT_D2HEAT 0x0c00
+#define PCIM_PSTAT_D3HEAT 0x0e00
+#define PCIM_PSTAT_DATASELMASK 0x1e00
#define PCIM_PSTAT_DATAUNKN 0x0000
#define PCIM_PSTAT_DATADIV10 0x2000
#define PCIM_PSTAT_DATADIV100 0x4000
@@ -464,11 +469,10 @@
#define PCIM_PSTAT_DATADIVMASK 0x6000
#define PCIM_PSTAT_PME 0x8000
-#define PCIR_POWER_PMCSR 0x6
-#define PCIM_PMCSR_DCLOCK 0x10
-#define PCIM_PMCSR_B2SUPP 0x20
-#define PCIM_BMCSR_B3SUPP 0x40
-#define PCIM_BMCSR_BPCE 0x80
+#define PCIR_POWER_BSE 0x6
+#define PCIM_PMCSR_BSE_D3B3 0x00
+#define PCIM_PMCSR_BSE_D3B2 0x40
+#define PCIM_PMCSR_BSE_BPCCE 0x80
#define PCIR_POWER_DATA 0x7
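The rewritten power-management macros above correct the register layouts: the aux-current field occupies bits 6-8 of the capability word, and the data-select codes in the status register are spaced 0x0200 apart. A hedged decoding sketch, where dev and the capability offset ptr are assumed to come from the usual capability walk:

    uint16_t pcap;
    int aux_ma;

    pcap = pci_read_config(dev, ptr + PCIR_POWER_CAP, 2);
    switch (pcap & PCIM_PCAP_AUXPWRMASK) {
    case PCIM_PCAP_AUXPWR_0:    aux_ma = 0;   break;
    case PCIM_PCAP_AUXPWR_55:   aux_ma = 55;  break;
    case PCIM_PCAP_AUXPWR_100:  aux_ma = 100; break;
    case PCIM_PCAP_AUXPWR_160:  aux_ma = 160; break;
    case PCIM_PCAP_AUXPWR_220:  aux_ma = 220; break;
    case PCIM_PCAP_AUXPWR_270:  aux_ma = 270; break;
    case PCIM_PCAP_AUXPWR_320:  aux_ma = 320; break;
    default:                    aux_ma = 375; break; /* AUXPWR_375 */
    }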
diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h
index d6a2a0e..aee967a 100644
--- a/sys/dev/pci/pcivar.h
+++ b/sys/dev/pci/pcivar.h
@@ -42,9 +42,9 @@ typedef uint64_t pci_addr_t;
/* Interesting values for PCI power management */
struct pcicfg_pp {
uint16_t pp_cap; /* PCI power management capabilities */
- uint8_t pp_status; /* config space address of PCI power status reg */
- uint8_t pp_pmcsr; /* config space address of PMCSR reg */
- uint8_t pp_data; /* config space address of PCI power data reg */
+ uint8_t pp_status; /* conf. space addr. of PM control/status reg */
+ uint8_t pp_bse; /* conf. space addr. of PM BSE reg */
+ uint8_t pp_data; /* conf. space addr. of PM data reg */
};
struct vpd_readonly {
diff --git a/sys/dev/sis/if_sis.c b/sys/dev/sis/if_sis.c
index 93246d3..0957419 100644
--- a/sys/dev/sis/if_sis.c
+++ b/sys/dev/sis/if_sis.c
@@ -1795,12 +1795,15 @@ sis_intr(void *arg)
if ((status & SIS_INTRS) == 0) {
/* Not ours. */
SIS_UNLOCK(sc);
+ return;
}
/* Disable interrupts. */
CSR_WRITE_4(sc, SIS_IER, 0);
for (;(status & SIS_INTRS) != 0;) {
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ break;
if (status &
(SIS_ISR_TX_DESC_OK | SIS_ISR_TX_ERR |
SIS_ISR_TX_OK | SIS_ISR_TX_IDLE) )
@@ -1825,11 +1828,13 @@ sis_intr(void *arg)
status = CSR_READ_4(sc, SIS_ISR);
}
- /* Re-enable interrupts. */
- CSR_WRITE_4(sc, SIS_IER, 1);
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ /* Re-enable interrupts. */
+ CSR_WRITE_4(sc, SIS_IER, 1);
- if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
- sis_startl(ifp);
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ sis_startl(ifp);
+ }
SIS_UNLOCK(sc);
}
diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c
index 6948173..eb55dfc 100644
--- a/sys/dev/xen/balloon/balloon.c
+++ b/sys/dev/xen/balloon/balloon.c
@@ -44,7 +44,7 @@ __FBSDID("$FreeBSD$");
#include <machine/xen/xenfunc.h>
#include <machine/xen/xenvar.h>
#include <xen/hypervisor.h>
-#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenstore/xenstorevar.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
@@ -406,20 +406,20 @@ set_new_target(unsigned long target)
wakeup(balloon_process);
}
-static struct xenbus_watch target_watch =
+static struct xs_watch target_watch =
{
.node = "memory/target"
};
/* React to a change in the target key */
static void
-watch_target(struct xenbus_watch *watch,
+watch_target(struct xs_watch *watch,
const char **vec, unsigned int len)
{
unsigned long long new_target;
int err;
- err = xenbus_scanf(XBT_NIL, "memory", "target", NULL,
+ err = xs_scanf(XST_NIL, "memory", "target", NULL,
"%llu", &new_target);
if (err) {
/* This is ok (for domain0 at least) - so just return */
@@ -438,7 +438,7 @@ balloon_init_watcher(void *arg)
{
int err;
- err = register_xenbus_watch(&target_watch);
+ err = xs_register_watch(&target_watch);
if (err)
printf("Failed to set balloon watcher\n");
diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index 259f2f6..72087f5 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -1,1055 +1,1919 @@
-/*
- * Copyright (c) 2006, Cisco Systems, Inc.
+/*-
+ * Copyright (c) 2009-2010 Spectra Logic Corporation
* All rights reserved.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
* are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * substantially similar to the "NO WARRANTY" disclaimer below
+ * ("Disclaimer") and any redistribution must be conditioned upon
+ * including a substantially similar Disclaimer requirement for further
+ * binary redistribution.
*
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
*
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * Authors: Justin T. Gibbs (Spectra Logic Corporation)
+ * Ken Merry (Spectra Logic Corporation)
*/
-
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+/**
+ * \file blkback.c
+ *
+ * \brief Device driver supporting the vending of block storage from
+ * a FreeBSD domain to other domains.
+ */
+
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/mbuf.h>
-#include <sys/malloc.h>
#include <sys/kernel.h>
-#include <sys/socket.h>
-#include <sys/queue.h>
-#include <sys/taskqueue.h>
+#include <sys/malloc.h>
+
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/devicestat.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/kdb.h>
+#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
-#include <sys/filedesc.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/types.h>
#include <sys/vnode.h>
-#include <sys/fcntl.h>
-#include <sys/disk.h>
-#include <sys/bio.h>
-
-#include <sys/module.h>
-#include <sys/bus.h>
-#include <sys/sysctl.h>
+#include <sys/mount.h>
#include <geom/geom.h>
+#include <machine/_inttypes.h>
+#include <machine/xen/xen-os.h>
+
+#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
-#include <machine/xen-os.h>
-#include <machine/hypervisor.h>
-#include <machine/hypervisor-ifs.h>
-#include <machine/xen_intr.h>
-#include <machine/evtchn.h>
-#include <machine/xenbus.h>
-#include <machine/gnttab.h>
-#include <machine/xen-public/memory.h>
-#include <dev/xen/xenbus/xenbus_comms.h>
+#include <xen/blkif.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/grant_table.h>
-#if XEN_BLKBACK_DEBUG
-#define DPRINTF(fmt, args...) \
- printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
-#else
-#define DPRINTF(fmt, args...) ((void)0)
-#endif
-
-#define WPRINTF(fmt, args...) \
- printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+#include <xen/xenbus/xenbusvar.h>
-#define BLKBACK_INVALID_HANDLE (~0)
+/*--------------------------- Compile-time Tunables --------------------------*/
+/**
+ * The maximum number of outstanding request blocks (request headers plus
+ * additional segment blocks) we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBB_MAX_REQUESTS 256
-struct ring_ref {
- vm_offset_t va;
- grant_handle_t handle;
- uint64_t bus_addr;
-};
+/**
+ * \brief Define to force all I/O to be performed on memory owned by the
+ * backend device, with a copy-in/out to the remote domain's memory.
+ *
+ * \note This option is currently required when this driver's domain is
+ * operating in HVM mode on a system using an IOMMU.
+ *
+ * This driver uses Xen's grant table API to gain access to the memory of
+ * the remote domains it serves. When our domain is operating in PV mode,
+ * the grant table mechanism directly updates our domain's page table entries
+ * to point to the physical pages of the remote domain. This scheme guarantees
+ * that blkback and the backing devices it uses can safely perform DMA
+ * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to
+ * ensure that our domain cannot DMA to pages owned by another domain. As
+ * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
+ * table API. For this reason, in HVM mode, we must bounce all requests into
+ * memory that is mapped into our domain at domain startup and thus has
+ * valid IOMMU mappings.
+ */
+#define XBB_USE_BOUNCE_BUFFERS
-typedef struct blkback_info {
+/**
+ * \brief Define to enable rudimentary request logging to the console.
+ */
+#undef XBB_DEBUG
- /* Schedule lists */
- STAILQ_ENTRY(blkback_info) next_req;
- int on_req_sched_list;
+/*---------------------------------- Macros ----------------------------------*/
+/**
+ * Custom malloc type for all driver allocations.
+ */
+MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
- struct xenbus_device *xdev;
- XenbusState frontend_state;
+#ifdef XBB_DEBUG
+#define DPRINTF(fmt, args...) \
+ printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+#else
+#define DPRINTF(fmt, args...) do {} while(0)
+#endif
- domid_t domid;
+/**
+ * The maximum mapped region size per request we will allow in a negotiated
+ * block-front/back communication channel.
+ */
+#define XBB_MAX_REQUEST_SIZE \
+ MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
- int state;
- int ring_connected;
- struct ring_ref rr;
- blkif_back_ring_t ring;
- evtchn_port_t evtchn;
- int irq;
- void *irq_cookie;
+/**
+ * The maximum number of segments (within a request header and accompanying
+ * segment blocks) per request we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBB_MAX_SEGMENTS_PER_REQUEST \
+ (MIN(UIO_MAXIOV, \
+ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
+ (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
+
+/**
+ * The maximum number of shared memory ring pages we will allow in a
+ * negotiated block-front/back communication channel. Allow enough
+ * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
+ */
+#define XBB_MAX_RING_PAGES \
+ BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
+ * XBB_MAX_REQUESTS)
- int ref_cnt;
+/*--------------------------- Forward Declarations ---------------------------*/
+struct xbb_softc;
- int handle;
- char *mode;
- char *type;
- char *dev_name;
+static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
+ ...) __attribute__((format(printf, 3, 4)));
+static int xbb_shutdown(struct xbb_softc *xbb);
+static int xbb_detach(device_t dev);
- struct vnode *vn;
- struct cdev *cdev;
- struct cdevsw *csw;
- u_int sector_size;
- int sector_size_shift;
- off_t media_size;
- u_int media_num_sectors;
- int major;
- int minor;
- int read_only;
-
- struct mtx blk_ring_lock;
-
- device_t ndev;
-
- /* Stats */
- int st_rd_req;
- int st_wr_req;
- int st_oo_req;
- int st_err_req;
-} blkif_t;
-
-/*
- * These are rather arbitrary. They are fairly large because adjacent requests
- * pulled from a communication ring are quite likely to end up being part of
- * the same scatter/gather request at the disc.
- *
- * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
- *
- * This will increase the chances of being able to write whole tracks.
- * 64 should be enough to keep us competitive with Linux.
+/*------------------------------ Data Structures -----------------------------*/
+/**
+ * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
*/
-static int blkif_reqs = 64;
-TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs);
+struct xbb_xen_req {
+ /**
+ * Linked list links used to aggregate idle request in the
+ * request free pool (xbb->request_free_slist).
+ */
+ SLIST_ENTRY(xbb_xen_req) links;
+
+ /**
+ * Back reference to the parent block back instance for this
+ * request. Used during bio_done handling.
+ */
+ struct xbb_softc *xbb;
+
+ /**
+ * The remote domain's identifier for this I/O request.
+ */
+ uint64_t id;
+
+ /**
+ * Kernel virtual address space reserved for this request
+ * structure and used to map the remote domain's pages for
+ * this I/O, into our domain's address space.
+ */
+ uint8_t *kva;
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /**
+ * Pre-allocated domain local memory used to proxy remote
+ * domain memory during I/O operations.
+ */
+ uint8_t *bounce;
+#endif
-static int mmap_pages;
+ /**
+ * Base pseudo-physical address corresponding to the start
+ * of this request's kva region.
+ */
+ uint64_t gnt_base;
+
+ /**
+ * The number of pages currently mapped for this request.
+ */
+ int nr_pages;
+
+ /**
+ * The number of 512 byte sectors comprising this request.
+ */
+ int nr_512b_sectors;
+
+ /**
+ * The number of struct bio requests still outstanding for this
+ * request on the backend device. This field is only used for
+ * device (rather than file) backed I/O.
+ */
+ int pendcnt;
+
+ /**
+ * BLKIF_OP code for this request.
+ */
+ int operation;
+
+ /**
+ * BLKIF_RSP status code for this request.
+ *
+ * This field allows an error status to be recorded even if the
+ * delivery of this status must be deferred. Deferred reporting
+ * is necessary, for example, when an error is detected during
+ * completion processing of one bio when other bios for this
+ * request are still outstanding.
+ */
+ int status;
+
+ /**
+ * Device statistics request ordering type (ordered or simple).
+ */
+ devstat_tag_type ds_tag_type;
+
+ /**
+ * Device statistics request type (read, write, no_data).
+ */
+ devstat_trans_flags ds_trans_type;
+
+ /**
+ * The start time for this request.
+ */
+ struct bintime ds_t0;
+
+ /**
+ * Array of grant handles (one per page) used to map this request.
+ */
+ grant_handle_t *gnt_handles;
+};
+SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
-/*
- * Each outstanding request that we've passed to the lower device layers has a
- * 'pending_req' allocated to it. Each buffer_head that completes decrements
- * the pendcnt towards zero. When it hits zero, the specified domain has a
- * response queued for it, with the saved 'id' passed back.
+/**
+ * \brief Configuration data for the shared memory request ring
+ * used to communicate with the front-end client of
+ * this driver.
*/
-typedef struct pending_req {
- blkif_t *blkif;
- uint64_t id;
- int nr_pages;
- int pendcnt;
- unsigned short operation;
- int status;
- STAILQ_ENTRY(pending_req) free_list;
-} pending_req_t;
-
-static pending_req_t *pending_reqs;
-static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free =
- STAILQ_HEAD_INITIALIZER(pending_free);
-static struct mtx pending_free_lock;
-
-static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list =
- STAILQ_HEAD_INITIALIZER(req_sched_list);
-static struct mtx req_sched_list_lock;
-
-static unsigned long mmap_vstart;
-static unsigned long *pending_vaddrs;
-static grant_handle_t *pending_grant_handles;
-
-static struct task blk_req_task;
-
-/* Protos */
-static void disconnect_ring(blkif_t *blkif);
-static int vbd_add_dev(struct xenbus_device *xdev);
-
-static inline int vaddr_pagenr(pending_req_t *req, int seg)
-{
- return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-}
-
-static inline unsigned long vaddr(pending_req_t *req, int seg)
-{
- return pending_vaddrs[vaddr_pagenr(req, seg)];
-}
-
-#define pending_handle(_req, _seg) \
- (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+struct xbb_ring_config {
+ /** KVA address where ring memory is mapped. */
+ vm_offset_t va;
+
+ /** The pseudo-physical address where ring memory is mapped.*/
+ uint64_t gnt_addr;
+
+ /**
+ * Grant table handles, one per-ring page, returned by the
+ * hypervisor upon mapping of the ring and required to
+ * unmap it when a connection is torn down.
+ */
+ grant_handle_t handle[XBB_MAX_RING_PAGES];
+
+ /**
+ * The device bus address returned by the hypervisor when
+ * mapping the ring and required to unmap it when a connection
+ * is torn down.
+ */
+ uint64_t bus_addr[XBB_MAX_RING_PAGES];
+
+ /** The number of ring pages mapped for the current connection. */
+ u_int ring_pages;
+
+ /**
+ * The grant references, one per-ring page, supplied by the
+ * front-end, allowing us to reference the ring pages in the
+ * front-end's domain and to map these pages into our own domain.
+ */
+ grant_ref_t ring_ref[XBB_MAX_RING_PAGES];
+
+ /** The interrupt-driven event channel used to signal ring events. */
+ evtchn_port_t evtchn;
+};
-static unsigned long
-alloc_empty_page_range(unsigned long nr_pages)
+/**
+ * Per-instance connection state flags.
+ */
+typedef enum
{
- void *pages;
- int i = 0, j = 0;
- multicall_entry_t mcl[17];
- unsigned long mfn_list[16];
- struct xen_memory_reservation reservation = {
- .extent_start = mfn_list,
- .nr_extents = 0,
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
-
- pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT);
- if (pages == NULL)
- return 0;
+ /**
+ * The front-end requested a read-only mount of the
+ * back-end device/file.
+ */
+ XBBF_READ_ONLY = 0x01,
+
+ /** Communication with the front-end has been established. */
+ XBBF_RING_CONNECTED = 0x02,
+
+ /**
+ * Front-end requests exist in the ring and are waiting for
+ * xbb_xen_req objects to free up.
+ */
+ XBBF_RESOURCE_SHORTAGE = 0x04,
+
+ /** Connection teardown in progress. */
+ XBBF_SHUTDOWN = 0x08
+} xbb_flag_t;
+
+/** Backend device type. */
+typedef enum {
+ /** Backend type unknown. */
+ XBB_TYPE_NONE = 0x00,
+
+ /**
+ * Backend type disk (access via cdev switch
+ * strategy routine).
+ */
+ XBB_TYPE_DISK = 0x01,
+
+ /** Backend type file (access via vnode operations). */
+ XBB_TYPE_FILE = 0x02
+} xbb_type;
+
+/**
+ * \brief Structure used to memoize information about a per-request
+ * scatter-gather list.
+ *
+ * The chief benefit of using this data structure is it avoids having
+ * to reparse the possibly discontiguous S/G list in the original
+ * request. Due to the way that the mapping of the memory backing an
+ * I/O transaction is handled by Xen, a second pass is unavoidable.
+ * At least this way the second walk is a simple array traversal.
+ *
+ * \note A single Scatter/Gather element in the block interface covers
+ * at most 1 machine page. In this context a sector (blkif
+ * nomenclature, not what I'd choose) is a 512b aligned unit
+ * of mapping within the machine page referenced by an S/G
+ * element.
+ */
+struct xbb_sg {
+ /** The number of 512b data chunks mapped in this S/G element. */
+ int16_t nsect;
+
+ /**
+ * The index (0 based) of the first 512b data chunk mapped
+ * in this S/G element.
+ */
+ uint8_t first_sect;
+
+ /**
+ * The index (0 based) of the last 512b data chunk mapped
+ * in this S/G element.
+ */
+ uint8_t last_sect;
+};
- memset(mcl, 0, sizeof(mcl));
+/**
+ * Character device backend specific configuration data.
+ */
+struct xbb_dev_data {
+ /** Cdev used for device backend access. */
+ struct cdev *cdev;
- while (i < nr_pages) {
- unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE);
+ /** Cdev switch used for device backend access. */
+ struct cdevsw *csw;
- mcl[j].op = __HYPERVISOR_update_va_mapping;
- mcl[j].args[0] = va;
+ /** Used to hold a reference on opened cdev backend devices. */
+ int dev_ref;
+};
- mfn_list[j++] = vtomach(va) >> PAGE_SHIFT;
+/**
+ * File backend specific configuration data.
+ */
+struct xbb_file_data {
+ /** Credentials to use for vnode backed (file based) I/O. */
+ struct ucred *cred;
+
+ /**
+ * \brief Array of io vectors used to process file based I/O.
+ *
+ * Only a single file based request is outstanding per-xbb instance,
+ * so we only need one of these.
+ */
+ struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+#ifdef XBB_USE_BOUNCE_BUFFERS
+
+ /**
+ * \brief Array of io vectors used to handle bouncing of file reads.
+ *
+ * Vnode operations are free to modify uio data during their
+ * execution. In the case of a read with bounce buffering active,
+ * we need some of the data from the original uio in order to
+ * bounce-out the read data. This array serves as the temporary
+ * storage for this saved data.
+ */
+ struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+
+ /**
+ * \brief Array of memoized bounce buffer kva offsets used
+ * in the file based backend.
+ *
+ * Due to the way that the mapping of the memory backing an
+ * I/O transaction is handled by Xen, a second pass through
+ * the request sg elements is unavoidable. We memoize the computed
+ * bounce address here to reduce the cost of the second walk.
+ */
+ void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST];
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+};
- xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY;
+/**
+ * Collection of backend type specific data.
+ */
+union xbb_backend_data {
+ struct xbb_dev_data dev;
+ struct xbb_file_data file;
+};
- if (j == 16 || i == nr_pages) {
- mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL;
+/**
+ * Function signature of backend specific I/O handlers.
+ */
+typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg,
+ int operation, int flags);
- reservation.nr_extents = j;
+/**
+ * Per-instance configuration data.
+ */
+struct xbb_softc {
+
+ /**
+ * Task-queue used to process I/O requests.
+ */
+ struct taskqueue *io_taskqueue;
+
+ /**
+ * Single "run the request queue" task enqueued
+ * on io_taskqueue.
+ */
+ struct task io_task;
+
+ /** Device type for this instance. */
+ xbb_type device_type;
+
+ /** NewBus device corresponding to this instance. */
+ device_t dev;
+
+ /** Backend specific dispatch routine for this instance. */
+ xbb_dispatch_t dispatch_io;
+
+ /** The number of requests outstanding on the backend device/file. */
+ u_int active_request_count;
+
+ /** Free pool of request tracking structures. */
+ struct xbb_xen_req_slist request_free_slist;
+
+ /** Array, sized at connection time, of request tracking structures. */
+ struct xbb_xen_req *requests;
+
+ /**
+ * Global pool of kva used for mapping remote domain ring
+ * and I/O transaction data.
+ */
+ vm_offset_t kva;
+
+ /** Pseudo-physical address corresponding to kva. */
+ uint64_t gnt_base_addr;
+
+ /** The size of the global kva pool. */
+ int kva_size;
+
+ /**
+ * \brief Cached value of the front-end's domain id.
+ *
+ * This value is used once for each mapped page in
+ * a transaction. We cache it to avoid incurring the
+ * cost of an ivar access every time this is needed.
+ */
+ domid_t otherend_id;
+
+ /**
+ * \brief The blkif protocol abi in effect.
+ *
+ * There are situations where the back and front ends can
+ * have a different, native abi (e.g. intel x86_64 and
+ * 32bit x86 domains on the same machine). The back-end
+ * always accommodates the front-end's native abi. That
+ * value is pulled from the XenStore and recorded here.
+ */
+ int abi;
+
+ /**
+ * \brief The maximum number of requests allowed to be in
+ * flight at a time.
+ *
+ * This value is negotiated via the XenStore.
+ */
+ uint32_t max_requests;
+
+ /**
+ * \brief The maximum number of segments (1 page per segment)
+ * that can be mapped by a request.
+ *
+ * This value is negotiated via the XenStore.
+ */
+ uint32_t max_request_segments;
+
+ /**
+ * The maximum size of any request to this back-end
+ * device.
+ *
+ * This value is negotiated via the XenStore.
+ */
+ uint32_t max_request_size;
+
+ /** Various configuration and state bit flags. */
+ xbb_flag_t flags;
+
+ /** Ring mapping and interrupt configuration data. */
+ struct xbb_ring_config ring_config;
+
+ /** Runtime, cross-abi safe, structures for ring access. */
+ blkif_back_rings_t rings;
+
+ /** IRQ mapping for the communication ring event channel. */
+ int irq;
+
+ /**
+ * \brief Backend access mode flags (e.g. write, or read-only).
+ *
+ * This value is passed to us by the front-end via the XenStore.
+ */
+ char *dev_mode;
+
+ /**
+ * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
+ *
+ * This value is passed to us by the front-end via the XenStore.
+ * Currently unused.
+ */
+ char *dev_type;
+
+ /**
+ * \brief Backend device/file identifier.
+ *
+ * This value is passed to us by the front-end via the XenStore.
+ * We expect this to be a POSIX path indicating the file or
+ * device to open.
+ */
+ char *dev_name;
+
+ /**
+ * Vnode corresponding to the backend device node or file
+ * we are accessing.
+ */
+ struct vnode *vn;
+
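+ /** Backend specific data. */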
+ union xbb_backend_data backend;
+
+ /** The native sector size of the backend. */
+ u_int sector_size;
+
+ /** log2 of sector_size. */
+ u_int sector_size_shift;
+
+ /** Size in bytes of the backend device or file. */
+ off_t media_size;
+
+ /**
+ * \brief media_size expressed in terms of the backend native
+ * sector size.
+ *
+ * (e.g. xbb->media_size >> xbb->sector_size_shift).
+ */
+ uint64_t media_num_sectors;
+
+ /**
+ * \brief Array of memoized scatter gather data computed during the
+ * conversion of blkif ring requests to internal xbb_xen_req
+ * structures.
+ *
+ * Ring processing is serialized so we only need one of these.
+ */
+ struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST];
+
+ /** Mutex protecting per-instance data. */
+ struct mtx lock;
+
+#ifdef XENHVM
+ /**
+ * Resource representing allocated physical address space
+ * associated with our per-instance kva region.
+ */
+ struct resource *pseudo_phys_res;
+
+ /** Resource id for allocated physical address space. */
+ int pseudo_phys_res_id;
+#endif
- mcl[j].op = __HYPERVISOR_memory_op;
- mcl[j].args[0] = XENMEM_decrease_reservation;
- mcl[j].args[1] = (unsigned long)&reservation;
-
- (void)HYPERVISOR_multicall(mcl, j+1);
+ /** I/O statistics. */
+ struct devstat *xbb_stats;
+};
- mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0;
- j = 0;
+/*---------------------------- Request Processing ----------------------------*/
+/**
+ * Allocate an internal transaction tracking structure from the free pool.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return On success, a pointer to the allocated xbb_xen_req structure.
+ * Otherwise NULL.
+ */
+static inline struct xbb_xen_req *
+xbb_get_req(struct xbb_softc *xbb)
+{
+ struct xbb_xen_req *req;
+
+ req = NULL;
+ mtx_lock(&xbb->lock);
+
+ /*
+ * Do not allow new requests to be allocated while we
+ * are shutting down.
+ */
+ if ((xbb->flags & XBBF_SHUTDOWN) == 0) {
+ if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) {
+ SLIST_REMOVE_HEAD(&xbb->request_free_slist, links);
+ xbb->active_request_count++;
+ } else {
+ xbb->flags |= XBBF_RESOURCE_SHORTAGE;
}
}
-
- return (unsigned long)pages;
+ mtx_unlock(&xbb->lock);
+ return (req);
}
-static pending_req_t *
-alloc_req(void)
-{
- pending_req_t *req;
- mtx_lock(&pending_free_lock);
- if ((req = STAILQ_FIRST(&pending_free))) {
- STAILQ_REMOVE(&pending_free, req, pending_req, free_list);
- STAILQ_NEXT(req, free_list) = NULL;
- }
- mtx_unlock(&pending_free_lock);
- return req;
-}
-
-static void
-free_req(pending_req_t *req)
+/**
+ * Return an allocated transaction tracking structure to the free pool.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param req The request structure to free.
+ */
+static inline void
+xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
{
- int was_empty;
-
- mtx_lock(&pending_free_lock);
- was_empty = STAILQ_EMPTY(&pending_free);
- STAILQ_INSERT_TAIL(&pending_free, req, free_list);
- mtx_unlock(&pending_free_lock);
- if (was_empty)
- taskqueue_enqueue(taskqueue_swi, &blk_req_task);
-}
+ int wake_thread;
-static void
-fast_flush_area(pending_req_t *req)
-{
- struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- unsigned int i, invcount = 0;
- grant_handle_t handle;
- int ret;
+ mtx_lock(&xbb->lock);
+ wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE;
+ xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
+ SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+ xbb->active_request_count--;
- for (i = 0; i < req->nr_pages; i++) {
- handle = pending_handle(req, i);
- if (handle == BLKBACK_INVALID_HANDLE)
- continue;
- unmap[invcount].host_addr = vaddr(req, i);
- unmap[invcount].dev_bus_addr = 0;
- unmap[invcount].handle = handle;
- pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
- invcount++;
+ if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+ /*
+ * Shutdown is in progress. See if we can
+ * progress further now that one more request
+ * has completed and been returned to the
+ * free pool.
+ */
+ xbb_shutdown(xbb);
}
+ mtx_unlock(&xbb->lock);
- ret = HYPERVISOR_grant_table_op(
- GNTTABOP_unmap_grant_ref, unmap, invcount);
- PANIC_IF(ret);
+ if (wake_thread != 0)
+ taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
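+
+/*
+ * Usage sketch (illustrative only, not part of this change): callers
+ * pair the two helpers above around each transaction, typically from
+ * the single-threaded ring processing task:
+ *
+ *	req = xbb_get_req(xbb);
+ *	if (req == NULL)
+ *		return;	(shortage recorded; we are re-run on next release)
+ *	... issue I/O; the completion path calls
+ *	xbb_release_req(xbb, req) once the last bio finishes.
+ */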
-static void
-blkif_get(blkif_t *blkif)
+/**
+ * Given a page index and 512b sector offset within that page,
+ * calculate an offset into a request's kva region.
+ *
+ * \param req The request structure whose kva region will be accessed.
+ * \param pagenr The page index used to compute the kva offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * kva offset.
+ *
+ * \return The computed global KVA offset.
+ */
+static inline uint8_t *
+xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
- atomic_add_int(&blkif->ref_cnt, 1);
+ return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}
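+
+/*
+ * Worked example (illustrative): on a 4KiB-page system, pagenr = 2 and
+ * sector = 3 yield req->kva + (2 * 4096) + (3 * 512) = req->kva + 9728.
+ * The bounce and grant-address variants below use the same arithmetic
+ * with a different base address.
+ */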
-static void
-blkif_put(blkif_t *blkif)
+#ifdef XBB_USE_BOUNCE_BUFFERS
+/**
+ * Given a page index and 512b sector offset within that page,
+ * calculate an offset into a request's local bounce memory region.
+ *
+ * \param req The request structure whose bounce region will be accessed.
+ * \param pagenr The page index used to compute the bounce offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * bounce offset.
+ *
+ * \return The computed global bounce buffer address.
+ */
+static inline uint8_t *
+xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector)
{
- if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) {
- DPRINTF("Removing %x\n", (unsigned int)blkif);
- disconnect_ring(blkif);
- if (blkif->mode)
- free(blkif->mode, M_DEVBUF);
- if (blkif->type)
- free(blkif->type, M_DEVBUF);
- if (blkif->dev_name)
- free(blkif->dev_name, M_DEVBUF);
- free(blkif, M_DEVBUF);
- }
+ return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
+#endif
-static int
-blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params)
+/**
+ * Given a page number and 512b sector offset within that page,
+ * calculate an offset into the request's memory region that the
+ * underlying backend device/file should use for I/O.
+ *
+ * \param req The request structure whose I/O region will be accessed.
+ * \param pagenr The page index used to compute the I/O offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * I/O offset.
+ *
+ * \return The computed global I/O address.
+ *
+ * Depending on configuration, this will either be a local bounce buffer
+ * or a pointer to the memory mapped in from the front-end domain for
+ * this request.
+ */
+static inline uint8_t *
+xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
- blkif_t *blkif;
-
- blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!blkif)
- return ENOMEM;
-
- DPRINTF("Created %x\n", (unsigned int)blkif);
-
- blkif->ref_cnt = 1;
- blkif->domid = xdev->otherend_id;
- blkif->handle = handle;
- blkif->mode = mode;
- blkif->type = type;
- blkif->dev_name = params;
- blkif->xdev = xdev;
- xdev->data = blkif;
-
- mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF);
-
- if (strcmp(mode, "w"))
- blkif->read_only = 1;
-
- return 0;
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ return (xbb_req_bounce_addr(req, pagenr, sector));
+#else
+ return (xbb_req_vaddr(req, pagenr, sector));
+#endif
}
-static void
-add_to_req_schedule_list_tail(blkif_t *blkif)
+/**
+ * Given a page index and 512b sector offset within that page, calculate
+ * an offset into the local pseudo-physical address space used to map a
+ * front-end's request data into a request.
+ *
+ * \param req The request structure whose pseudo-physical region
+ * will be accessed.
+ * \param pagenr The page index used to compute the pseudo-physical offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * pseudo-physical offset.
+ *
+ * \return The computed global pseudo-physical address.
+ */
+static inline uintptr_t
+xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
- if (!blkif->on_req_sched_list) {
- mtx_lock(&req_sched_list_lock);
- if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) {
- blkif_get(blkif);
- STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
- blkif->on_req_sched_list = 1;
- taskqueue_enqueue(taskqueue_swi, &blk_req_task);
- }
- mtx_unlock(&req_sched_list_lock);
- }
+ return ((uintptr_t)(req->gnt_base
+ + (PAGE_SIZE * pagenr) + (sector << 9)));
}
-/* This routine does not call blkif_get(), does not schedule the blk_req_task to run,
- and assumes that the state is connected */
+/**
+ * Unmap the front-end pages associated with this I/O request.
+ *
+ * \param req The request structure to unmap.
+ */
static void
-add_to_req_schedule_list_tail2(blkif_t *blkif)
+xbb_unmap_req(struct xbb_xen_req *req)
{
- mtx_lock(&req_sched_list_lock);
- if (!blkif->on_req_sched_list) {
- STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
- blkif->on_req_sched_list = 1;
- }
- mtx_unlock(&req_sched_list_lock);
-}
+ struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST];
+ u_int i;
+ u_int invcount;
+ int error;
-/* Removes blkif from front of list and does not call blkif_put() (caller must) */
-static blkif_t *
-remove_from_req_schedule_list(void)
-{
- blkif_t *blkif;
+ invcount = 0;
+ for (i = 0; i < req->nr_pages; i++) {
- mtx_lock(&req_sched_list_lock);
+ if (req->gnt_handles[i] == GRANT_REF_INVALID)
+ continue;
- if ((blkif = STAILQ_FIRST(&req_sched_list))) {
- STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req);
- STAILQ_NEXT(blkif, next_req) = NULL;
- blkif->on_req_sched_list = 0;
+ unmap[invcount].host_addr = xbb_req_gntaddr(req, i, 0);
+ unmap[invcount].dev_bus_addr = 0;
+ unmap[invcount].handle = req->gnt_handles[i];
+ req->gnt_handles[i] = GRANT_REF_INVALID;
+ invcount++;
}
- mtx_unlock(&req_sched_list_lock);
-
- return blkif;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmap, invcount);
+ KASSERT(error == 0, ("Grant table operation failed"));
}
+/**
+ * Create and transmit a response to a blkif request.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param req The request structure to which to respond.
+ * \param status The status code to report. See BLKIF_RSP_*
+ * in sys/xen/interface/io/blkif.h.
+ */
static void
-make_response(blkif_t *blkif, uint64_t id,
- unsigned short op, int st)
+xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
blkif_response_t *resp;
- blkif_back_ring_t *blk_ring = &blkif->ring;
- int more_to_do = 0;
- int notify;
+ int more_to_do;
+ int notify;
+
+ more_to_do = 0;
+
+ /*
+ * Place on the response ring for the relevant domain.
+ * For now, only the spacing between entries is different
+ * in the different ABIs, not the response entry layout.
+ */
+ mtx_lock(&xbb->lock);
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ resp = RING_GET_RESPONSE(&xbb->rings.native,
+ xbb->rings.native.rsp_prod_pvt);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ resp = (blkif_response_t *)
+ RING_GET_RESPONSE(&xbb->rings.x86_32,
+ xbb->rings.x86_32.rsp_prod_pvt);
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ resp = (blkif_response_t *)
+ RING_GET_RESPONSE(&xbb->rings.x86_64,
+ xbb->rings.x86_64.rsp_prod_pvt);
+ break;
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ }
- mtx_lock(&blkif->blk_ring_lock);
+ resp->id = req->id;
+ resp->operation = req->operation;
+ resp->status = status;
+ xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
- /* Place on the response ring for the relevant domain. */
- resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
- resp->id = id;
- resp->operation = op;
- resp->status = st;
- blk_ring->rsp_prod_pvt++;
- RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
+ if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
- if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
/*
* Tail check for pending requests. Allows frontend to avoid
* notifications if requests are already in flight (lower
* overheads and promotes batching).
*/
- RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
+ RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
- } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring))
more_to_do = 1;
+ }
- mtx_unlock(&blkif->blk_ring_lock);
+ mtx_unlock(&xbb->lock);
if (more_to_do)
- add_to_req_schedule_list_tail(blkif);
+ taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
if (notify)
- notify_remote_via_irq(blkif->irq);
+ notify_remote_via_irq(xbb->irq);
}
+/**
+ * Completion handler for buffer I/O requests issued by the device
+ * backend driver.
+ *
+ * \param bio The buffer I/O request on which to perform completion
+ * processing.
+ */
static void
-end_block_io_op(struct bio *bio)
+xbb_bio_done(struct bio *bio)
{
- pending_req_t *pending_req = bio->bio_caller2;
+ struct xbb_softc *xbb;
+ struct xbb_xen_req *req;
+
+ req = bio->bio_caller1;
+ xbb = req->xbb;
+ /* Only include transferred I/O in stats. */
+ req->nr_512b_sectors -= bio->bio_resid >> 9;
if (bio->bio_error) {
DPRINTF("BIO returned error %d for operation on device %s\n",
- bio->bio_error, pending_req->blkif->dev_name);
- pending_req->status = BLKIF_RSP_ERROR;
- pending_req->blkif->st_err_req++;
+ bio->bio_error, xbb->dev_name);
+ req->status = BLKIF_RSP_ERROR;
+
+ if (bio->bio_error == ENXIO
+ && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
+
+ /*
+ * Backend device has disappeared. Signal the
+ * front-end that we (the device proxy) want to
+ * go away.
+ */
+ xenbus_set_state(xbb->dev, XenbusStateClosing);
+ }
}
-#if 0
- printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n",
- (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags);
-#endif
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ if (bio->bio_cmd == BIO_READ) {
+ vm_offset_t kva_offset;
- if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) {
- fast_flush_area(pending_req);
- make_response(pending_req->blkif, pending_req->id,
- pending_req->operation, pending_req->status);
- blkif_put(pending_req->blkif);
- free_req(pending_req);
+ kva_offset = (vm_offset_t)bio->bio_data
+ - (vm_offset_t)req->bounce;
+ memcpy((uint8_t *)req->kva + kva_offset,
+ bio->bio_data, bio->bio_bcount);
+ }
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+ if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) {
+ xbb_unmap_req(req);
+ xbb_send_response(xbb, req, req->status);
+ devstat_end_transaction(xbb->xbb_stats,
+ /*bytes*/req->nr_512b_sectors << 9,
+ req->ds_tag_type,
+ req->ds_trans_type,
+ /*now*/NULL,
+ /*then*/&req->ds_t0);
+ xbb_release_req(xbb, req);
}
g_destroy_bio(bio);
}
+/**
+ * Parse a blkif request into an internal request structure and send
+ * it to the backend for processing.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param req_ring_idx The location of ring_req within the shared
+ * communication ring.
+ *
+ * This routine performs the backend common aspects of request parsing
+ * including compiling an internal request structure, parsing the S/G
+ * list and any secondary ring requests in which they may reside, and
+ * the mapping of front-end I/O pages into our domain.
+ */
static void
-dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req)
+xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, RING_IDX req_ring_idx)
{
- struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct {
- unsigned long buf; unsigned int nsec;
- } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- unsigned int nseg = req->nr_segments, nr_sects = 0;
- struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int operation, ret, i, nbio = 0;
+ struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQUEST];
+ struct xbb_sg *xbb_sg;
+ struct gnttab_map_grant_ref *map;
+ struct blkif_request_segment *sg;
+ struct blkif_request_segment *last_block_sg;
+ u_int nseg;
+ u_int seg_idx;
+ u_int block_segs;
+ int nr_sects;
+ int operation;
+ uint8_t bio_flags;
+ int error;
+
+ nseg = ring_req->nr_segments;
+ nr_sects = 0;
+ req->xbb = xbb;
+ req->id = ring_req->id;
+ req->operation = ring_req->operation;
+ req->status = BLKIF_RSP_OKAY;
+ req->ds_tag_type = DEVSTAT_TAG_SIMPLE;
+ req->nr_pages = nseg;
+ req->nr_512b_sectors = 0;
+ bio_flags = 0;
+ sg = NULL;
+
+ binuptime(&req->ds_t0);
+ devstat_start_transaction(xbb->xbb_stats, &req->ds_t0);
+
+ switch (req->operation) {
+ case BLKIF_OP_WRITE_BARRIER:
+ bio_flags |= BIO_ORDERED;
+ req->ds_tag_type = DEVSTAT_TAG_ORDERED;
+ /* FALLTHROUGH */
+ case BLKIF_OP_WRITE:
+ operation = BIO_WRITE;
+ req->ds_trans_type = DEVSTAT_WRITE;
+ if ((xbb->flags & XBBF_READ_ONLY) != 0) {
+ DPRINTF("Attempt to write to read only device %s\n",
+ xbb->dev_name);
+ goto fail_send_response;
+ }
+ break;
+ case BLKIF_OP_READ:
+ operation = BIO_READ;
+ req->ds_trans_type = DEVSTAT_READ;
+ break;
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ operation = BIO_FLUSH;
+ req->ds_tag_type = DEVSTAT_TAG_ORDERED;
+ req->ds_trans_type = DEVSTAT_NO_DATA;
+ goto do_dispatch;
+ /*NOTREACHED*/
+ default:
+ DPRINTF("error: unknown block io operation [%d]\n",
+ req->operation);
+ goto fail_send_response;
+ }
/* Check that number of segments is sane. */
- if (unlikely(nseg == 0) ||
- unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ if (unlikely(nseg == 0)
+ || unlikely(nseg > xbb->max_request_segments)) {
DPRINTF("Bad number of segments in request (%d)\n", nseg);
- goto fail_response;
+ goto fail_send_response;
}
- if (req->operation == BLKIF_OP_WRITE) {
- if (blkif->read_only) {
- DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name);
- goto fail_response;
- }
- operation = BIO_WRITE;
- } else
- operation = BIO_READ;
-
- pending_req->blkif = blkif;
- pending_req->id = req->id;
- pending_req->operation = req->operation;
- pending_req->status = BLKIF_RSP_OKAY;
- pending_req->nr_pages = nseg;
+ map = maps;
+ xbb_sg = xbb->xbb_sgs;
+ block_segs = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
+ sg = ring_req->seg;
+ last_block_sg = sg + block_segs;
+ seg_idx = 0;
+ while (1) {
- for (i = 0; i < nseg; i++) {
- seg[i].nsec = req->seg[i].last_sect -
- req->seg[i].first_sect + 1;
+ while (sg < last_block_sg) {
+
+ xbb_sg->first_sect = sg->first_sect;
+ xbb_sg->last_sect = sg->last_sect;
+ xbb_sg->nsect =
+ (int8_t)(sg->last_sect - sg->first_sect + 1);
+
+ if ((sg->last_sect >= (PAGE_SIZE >> 9))
+ || (xbb_sg->nsect <= 0))
+ goto fail_send_response;
+
+ nr_sects += xbb_sg->nsect;
+ map->host_addr = xbb_req_gntaddr(req, seg_idx,
+ /*sector*/0);
+ map->flags = GNTMAP_host_map;
+ map->ref = sg->gref;
+ map->dom = xbb->otherend_id;
+ if (operation == BIO_WRITE)
+ map->flags |= GNTMAP_readonly;
+ sg++;
+ map++;
+ xbb_sg++;
+ seg_idx++;
+ }
- if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
- (seg[i].nsec <= 0))
- goto fail_response;
- nr_sects += seg[i].nsec;
+ block_segs = MIN(nseg - seg_idx,
+ BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
+ if (block_segs == 0)
+ break;
- map[i].host_addr = vaddr(pending_req, i);
- map[i].dom = blkif->domid;
- map[i].ref = req->seg[i].gref;
- map[i].flags = GNTMAP_host_map;
- if (operation == BIO_WRITE)
- map[i].flags |= GNTMAP_readonly;
+ /*
+ * Fetch the next request block full of SG elements.
+ * For now, only the spacing between entries is different
+ * in the different ABIs, not the sg entry layout.
+ */
+ req_ring_idx++;
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
+ req_ring_idx);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
+ req_ring_idx);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
+ req_ring_idx);
+ break;
+ }
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ /* NOTREACHED */
+ }
+ last_block_sg = sg + block_segs;
}
/* Convert to the disk's sector size */
- nr_sects = (nr_sects << 9) >> blkif->sector_size_shift;
-
- ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
- PANIC_IF(ret);
+ req->nr_512b_sectors = nr_sects;
+ nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
+
+ if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) {
+ device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple "
+ "of the backing store sector size (%d)\n",
+ __func__, req->nr_512b_sectors << 9,
+ xbb->sector_size);
+ goto fail_send_response;
+ }
- for (i = 0; i < nseg; i++) {
- if (unlikely(map[i].status != 0)) {
- DPRINTF("invalid buffer -- could not remap it\n");
- goto fail_flush;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ maps, req->nr_pages);
+ if (error != 0)
+ panic("Grant table operation failed (%d)", error);
+
+ for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) {
+
+ if (unlikely(map->status != 0)) {
+ DPRINTF("invalid buffer -- could not remap it (%d)\n",
+ map->status);
+ DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x "
+ "ref 0x%x, dom %d\n", seg_idx,
+ map->host_addr, map->flags, map->ref,
+ map->dom);
+ goto fail_unmap_req;
}
- pending_handle(pending_req, i) = map[i].handle;
-#if 0
- /* Can't do this in FreeBSD since vtophys() returns the pfn */
- /* of the remote domain who loaned us the machine page - DPT */
- xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] =
- map[i]dev_bus_addr >> PAGE_SHIFT;
-#endif
- seg[i].buf = map[i].dev_bus_addr |
- (req->seg[i].first_sect << 9);
+ req->gnt_handles[seg_idx] = map->handle;
}
+ if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) {
- if (req->sector_number + nr_sects > blkif->media_num_sectors) {
- DPRINTF("%s of [%llu,%llu] extends past end of device %s\n",
+ DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
+ "extends past end of device %s\n",
operation == BIO_READ ? "read" : "write",
- req->sector_number,
- req->sector_number + nr_sects, blkif->dev_name);
- goto fail_flush;
+ ring_req->sector_number,
+ ring_req->sector_number + nr_sects, xbb->dev_name);
+ goto fail_unmap_req;
}
- for (i = 0; i < nseg; i++) {
- struct bio *bio;
-
- if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) {
- DPRINTF("Misaligned I/O request from domain %d", blkif->domid);
- goto fail_put_bio;
- }
-
- bio = biolist[nbio++] = g_new_bio();
- if (unlikely(bio == NULL))
- goto fail_put_bio;
+do_dispatch:
- bio->bio_cmd = operation;
- bio->bio_offset = req->sector_number << blkif->sector_size_shift;
- bio->bio_length = seg[i].nsec << 9;
- bio->bio_bcount = bio->bio_length;
- bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK));
- bio->bio_done = end_block_io_op;
- bio->bio_caller2 = pending_req;
- bio->bio_dev = blkif->cdev;
+ error = xbb->dispatch_io(xbb,
+ ring_req,
+ req,
+ nseg,
+ operation,
+ bio_flags);
- req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift;
-#if 0
- printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n",
- (unsigned int)bio, req->operation, req->sector_number, seg[i].nsec,
- blkif->cdev->si_iosize_max, seg[i].buf);
-#endif
+ if (error != 0) {
+ if (operation == BIO_FLUSH)
+ goto fail_send_response;
+ else
+ goto fail_unmap_req;
}
- pending_req->pendcnt = nbio;
- blkif_get(blkif);
+ return;
- for (i = 0; i < nbio; i++)
- (*blkif->csw->d_strategy)(biolist[i]);
- return;
+fail_unmap_req:
+ xbb_unmap_req(req);
+ /* FALLTHROUGH */
- fail_put_bio:
- for (i = 0; i < (nbio-1); i++)
- g_destroy_bio(biolist[i]);
- fail_flush:
- fast_flush_area(pending_req);
- fail_response:
- make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
- free_req(pending_req);
+fail_send_response:
+ xbb_send_response(xbb, req, BLKIF_RSP_ERROR);
+ xbb_release_req(xbb, req);
+ devstat_end_transaction(xbb->xbb_stats,
+ /*bytes*/0,
+ req->ds_tag_type,
+ req->ds_trans_type,
+ /*now*/NULL,
+ /*then*/&req->ds_t0);
}
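+
+/*
+ * Note (editorial): a request whose scatter/gather list does not fit in
+ * its header ring slot (BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK entries)
+ * spills the remainder into follow-on slots holding
+ * BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK entries each. This is why both
+ * the response producer and request consumer indices advance by
+ * BLKIF_SEGS_TO_BLOCKS(nr_segments) rather than by one.
+ */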
+/**
+ * Process incoming requests from the shared communication ring in response
+ * to a signal on the ring's event channel.
+ *
+ * \param context Callback argument registered during task initialization -
+ * the xbb_softc for this instance.
+ * \param pending The number of taskqueue_enqueue events that have
+ * occurred since this handler was last run.
+ */
static void
-blk_req_action(void *context, int pending)
+xbb_run_queue(void *context, int pending)
{
- blkif_t *blkif;
-
- DPRINTF("\n");
-
- while (!STAILQ_EMPTY(&req_sched_list)) {
- blkif_back_ring_t *blk_ring;
- RING_IDX rc, rp;
-
- blkif = remove_from_req_schedule_list();
-
- blk_ring = &blkif->ring;
- rc = blk_ring->req_cons;
- rp = blk_ring->sring->req_prod;
- rmb(); /* Ensure we see queued requests up to 'rp'. */
-
- while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
- blkif_request_t *req;
- pending_req_t *pending_req;
-
- pending_req = alloc_req();
- if (pending_req == NULL)
- goto out_of_preqs;
-
- req = RING_GET_REQUEST(blk_ring, rc);
- blk_ring->req_cons = ++rc; /* before make_response() */
-
- switch (req->operation) {
- case BLKIF_OP_READ:
- blkif->st_rd_req++;
- dispatch_rw_block_io(blkif, req, pending_req);
- break;
- case BLKIF_OP_WRITE:
- blkif->st_wr_req++;
- dispatch_rw_block_io(blkif, req, pending_req);
- break;
- default:
- blkif->st_err_req++;
- DPRINTF("error: unknown block io operation [%d]\n",
- req->operation);
- make_response(blkif, req->id, req->operation,
- BLKIF_RSP_ERROR);
- free_req(pending_req);
- break;
- }
+ struct xbb_softc *xbb;
+ blkif_back_rings_t *rings;
+ RING_IDX rp;
+
+
+ xbb = (struct xbb_softc *)context;
+ rings = &xbb->rings;
+
+ /*
+ * Cache req_prod to avoid accessing a cache line shared
+ * with the frontend.
+ */
+ rp = rings->common.sring->req_prod;
+
+ /* Ensure we see queued requests up to 'rp'. */
+ rmb();
+
+ /**
+ * Run so long as there is work to consume and the generation
+ * of a response will not overflow the ring.
+ *
+ * @note There's a 1 to 1 relationship between requests and responses,
+ * so an overflow should never occur. This test is to protect
+ * our domain from digesting bogus data. Shouldn't we log this?
+ */
+ while (rings->common.req_cons != rp
+ && RING_REQUEST_CONS_OVERFLOW(&rings->common,
+ rings->common.req_cons) == 0) {
+ blkif_request_t ring_req_storage;
+ blkif_request_t *ring_req;
+ struct xbb_xen_req *req;
+ RING_IDX req_ring_idx;
+
+ req = xbb_get_req(xbb);
+ if (req == NULL) {
+ /*
+ * Resource shortage has been recorded.
+ * We'll be scheduled to run once a request
+ * object frees up due to a completion.
+ */
+ break;
}
- blkif_put(blkif);
- }
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ ring_req = RING_GET_REQUEST(&xbb->rings.native,
+ rings->common.req_cons);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ struct blkif_x86_32_request *ring_req32;
+
+ ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32,
+ rings->common.req_cons);
+ blkif_get_x86_32_req(&ring_req_storage, ring_req32);
+ ring_req = &ring_req_storage;
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ struct blkif_x86_64_request *ring_req64;
+
+ ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
+ rings->common.req_cons);
+ blkif_get_x86_64_req(&ring_req_storage, ring_req64);
+ ring_req = &ring_req_storage;
+ break;
+ }
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ /* NOTREACHED */
+ }
- return;
+ /*
+ * Signify that we can overwrite this request with a
+ * response by incrementing our consumer index. The
+ * response won't be generated until after we've already
+ * consumed all necessary data out of the version of the
+ * request in the ring buffer (for native mode). We
+ * must update the consumer index before issuing back-end
+ * I/O so there is no possibility that it will complete
+ * and a response be generated before we make room in
+ * the queue for that response.
+ */
+ req_ring_idx = xbb->rings.common.req_cons;
+ xbb->rings.common.req_cons +=
+ BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
- out_of_preqs:
- /* We ran out of pending req structs */
- /* Just requeue interface and wait to be rescheduled to run when one is freed */
- add_to_req_schedule_list_tail2(blkif);
- blkif->st_oo_req++;
+ xbb_dispatch_io(xbb, ring_req, req, req_ring_idx);
+ }
}
-/* Handle interrupt from a frontend */
+/**
+ * Interrupt handler bound to the shared ring's event channel.
+ *
+ * \param arg Callback argument registered during event channel
+ * binding - the xbb_softc for this instance.
+ */
static void
-blkback_intr(void *arg)
+xbb_intr(void *arg)
{
- blkif_t *blkif = arg;
- DPRINTF("%x\n", (unsigned int)blkif);
- add_to_req_schedule_list_tail(blkif);
+ struct xbb_softc *xbb;
+
+ /* Defer to kernel thread. */
+ xbb = (struct xbb_softc *)arg;
+ taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
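+
+/*
+ * Setup sketch (illustrative; the actual attach code appears later in
+ * this change and may differ): the task and queue consumed by
+ * xbb_intr() would be created roughly as
+ *
+ *	TASK_INIT(&xbb->io_task, 0, xbb_run_queue, xbb);
+ *	xbb->io_taskqueue = taskqueue_create("xbb", M_NOWAIT,
+ *	    taskqueue_thread_enqueue, &xbb->io_taskqueue);
+ *	taskqueue_start_threads(&xbb->io_taskqueue, 1, PWAIT, "xbb_tq");
+ *
+ * so ring processing runs in thread context rather than in the
+ * interrupt handler itself.
+ */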
-/* Map grant ref for ring */
+/*----------------------------- Backend Handlers -----------------------------*/
+/**
+ * Backend handler for character device access.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param nseg The number of valid segments for this request in
+ * xbb->xbb_sgs.
+ * \param operation BIO_* I/O operation code.
+ * \param bio_flags Additional bio_flag data to pass to any generated
+ * bios (e.g. BIO_ORDERED).
+ *
+ * \return 0 for success, errno codes for failure.
+ */
static int
-map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring)
+xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg, int operation,
+ int bio_flags)
{
- struct gnttab_map_grant_ref op;
+ struct xbb_dev_data *dev_data;
+ struct bio *bios[XBB_MAX_SEGMENTS_PER_REQUEST];
+ off_t bio_offset;
+ struct bio *bio;
+ struct xbb_sg *xbb_sg;
+ u_int nbio;
+ u_int bio_idx;
+ u_int seg_idx;
+ int error;
+
+ dev_data = &xbb->backend.dev;
+ bio_offset = (off_t)ring_req->sector_number
+ << xbb->sector_size_shift;
+ error = 0;
+ nbio = 0;
+ bio_idx = 0;
+
+ if (operation == BIO_FLUSH) {
+ bio = g_new_bio();
+ if (unlikely(bio == NULL)) {
+ DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
+ error = ENOMEM;
+ return (error);
+ }
+
+ bio->bio_cmd = BIO_FLUSH;
+ bio->bio_flags |= BIO_ORDERED;
+ bio->bio_dev = dev_data->cdev;
+ bio->bio_offset = 0;
+ bio->bio_data = NULL;
+ bio->bio_done = xbb_bio_done;
+ bio->bio_caller1 = req;
+ bio->bio_pblkno = 0;
- ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
- if (ring->va == 0)
- return ENOMEM;
+ req->pendcnt = 1;
- op.host_addr = ring->va;
- op.flags = GNTMAP_host_map;
- op.ref = ref;
- op.dom = dom;
- HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
- if (op.status) {
- WPRINTF("grant table op err=%d\n", op.status);
- kmem_free(kernel_map, ring->va, PAGE_SIZE);
- ring->va = 0;
- return EACCES;
+ (*dev_data->csw->d_strategy)(bio);
+
+ return (0);
}
- ring->handle = op.handle;
- ring->bus_addr = op.dev_bus_addr;
+ for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs;
+ seg_idx < nseg;
+ seg_idx++, xbb_sg++) {
- return 0;
-}
+ /*
+ * KVA will not be contiguous, so any additional
+ * I/O will need to be represented in a new bio.
+ */
+ if ((bio != NULL)
+ && (xbb_sg->first_sect != 0)) {
+ if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
+ printf("%s: Discontiguous I/O request from "
+ "domain %d ends on non-sector "
+ "boundary\n", __func__,
+ xbb->otherend_id);
+ error = EINVAL;
+ goto fail_free_bios;
+ }
+ bio = NULL;
+ }
-/* Unmap grant ref for ring */
-static void
-unmap_ring(struct ring_ref *ring)
-{
- struct gnttab_unmap_grant_ref op;
+ if (bio == NULL) {
+ /*
+ * Make sure that the start of this bio is aligned
+ * to a device sector.
+ */
+ if ((bio_offset & (xbb->sector_size - 1)) != 0) {
+ printf("%s: Misaligned I/O request from "
+ "domain %d\n", __func__,
+ xbb->otherend_id);
+ error = EINVAL;
+ goto fail_free_bios;
+ }
- op.host_addr = ring->va;
- op.dev_bus_addr = ring->bus_addr;
- op.handle = ring->handle;
- HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
- if (op.status)
- WPRINTF("grant table op err=%d\n", op.status);
+ bio = bios[nbio++] = g_new_bio();
+ if (unlikely(bio == NULL)) {
+ error = ENOMEM;
+ goto fail_free_bios;
+ }
+ bio->bio_cmd = operation;
+ bio->bio_flags |= bio_flags;
+ bio->bio_dev = dev_data->cdev;
+ bio->bio_offset = bio_offset;
+ bio->bio_data = xbb_req_ioaddr(req, seg_idx,
+ xbb_sg->first_sect);
+ bio->bio_done = xbb_bio_done;
+ bio->bio_caller1 = req;
+ bio->bio_pblkno = bio_offset
+ >> xbb->sector_size_shift;
+ }
- kmem_free(kernel_map, ring->va, PAGE_SIZE);
- ring->va = 0;
-}
+ bio->bio_length += xbb_sg->nsect << 9;
+ bio->bio_bcount = bio->bio_length;
+ bio_offset += xbb_sg->nsect << 9;
-static int
-connect_ring(blkif_t *blkif)
-{
- struct xenbus_device *xdev = blkif->xdev;
- blkif_sring_t *ring;
- unsigned long ring_ref;
- evtchn_port_t evtchn;
- evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
- int err;
-
- if (blkif->ring_connected)
- return 0;
-
- // Grab FE data and map his memory
- err = xenbus_gather(NULL, xdev->otherend,
- "ring-ref", "%lu", &ring_ref,
- "event-channel", "%u", &evtchn, NULL);
- if (err) {
- xenbus_dev_fatal(xdev, err,
- "reading %s/ring-ref and event-channel",
- xdev->otherend);
- return err;
- }
-
- err = map_ring(ring_ref, blkif->domid, &blkif->rr);
- if (err) {
- xenbus_dev_fatal(xdev, err, "mapping ring");
- return err;
- }
- ring = (blkif_sring_t *)blkif->rr.va;
- BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE);
-
- op.u.bind_interdomain.remote_dom = blkif->domid;
- op.u.bind_interdomain.remote_port = evtchn;
- err = HYPERVISOR_event_channel_op(&op);
- if (err) {
- unmap_ring(&blkif->rr);
- xenbus_dev_fatal(xdev, err, "binding event channel");
- return err;
- }
- blkif->evtchn = op.u.bind_interdomain.local_port;
-
- /* bind evtchn to irq handler */
- blkif->irq =
- bind_evtchn_to_irqhandler(blkif->evtchn, "blkback",
- blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie);
-
- blkif->ring_connected = 1;
-
- DPRINTF("%x rings connected! evtchn=%d irq=%d\n",
- (unsigned int)blkif, blkif->evtchn, blkif->irq);
+ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
- return 0;
-}
+ if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
+ printf("%s: Discontiguous I/O request from "
+ "domain %d ends on non-sector "
+ "boundary\n", __func__,
+ xbb->otherend_id);
+ error = EINVAL;
+ goto fail_free_bios;
+ }
+ /*
+ * KVA will not be contiguous, so any additional
+ * I/O will need to be represented in a new bio.
+ */
+ bio = NULL;
+ }
+ }
-static void
-disconnect_ring(blkif_t *blkif)
-{
- DPRINTF("\n");
+ req->pendcnt = nbio;
+
+ for (bio_idx = 0; bio_idx < nbio; bio_idx++)
+ {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ vm_offset_t kva_offset;
- if (blkif->ring_connected) {
- unbind_from_irqhandler(blkif->irq, blkif->irq_cookie);
- blkif->irq = 0;
- unmap_ring(&blkif->rr);
- blkif->ring_connected = 0;
+ kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
+ - (vm_offset_t)req->bounce;
+ if (operation == BIO_WRITE) {
+ memcpy(bios[bio_idx]->bio_data,
+ (uint8_t *)req->kva + kva_offset,
+ bios[bio_idx]->bio_bcount);
+ }
+#endif
+ (*dev_data->csw->d_strategy)(bios[bio_idx]);
}
+
+ return (error);
+
+fail_free_bios:
+ for (bio_idx = 0; bio_idx < nbio; bio_idx++) {
+ /* The most recent entry may be NULL if g_new_bio() failed. */
+ if (bios[bio_idx] != NULL)
+ g_destroy_bio(bios[bio_idx]);
+ }
+
+ return (error);
}
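+
+/*
+ * Example (illustrative): consider a two-segment request whose first
+ * segment ends before the last 512b sector of its page (last_sect !=
+ * (PAGE_SIZE - 512) >> 9). The valid data of the two segments is not
+ * contiguous in our mapped KVA, so the loop above closes the first bio
+ * and opens a second one; req->pendcnt then ends up at 2 and
+ * xbb_bio_done() sends the response when the second bio completes.
+ */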
-static void
-connect(blkif_t *blkif)
+/**
+ * Backend handler for file access.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param nseg The number of valid segments for this request in
+ * xbb->xbb_sgs.
+ * \param operation BIO_* I/O operation code.
+ * \param flags Additional bio_flag data to pass to any generated bios
+ * (e.g. BIO_ORDERED).
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg, int operation,
+ int flags)
{
- struct xenbus_transaction *xbt;
- struct xenbus_device *xdev = blkif->xdev;
- int err;
+ struct xbb_file_data *file_data;
+ u_int seg_idx;
+ struct uio xuio;
+ struct xbb_sg *xbb_sg;
+ struct iovec *xiovec;
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ void **p_vaddr;
+ int saved_uio_iovcnt;
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ int vfs_is_locked;
+ int error;
+
+ file_data = &xbb->backend.file;
+ error = 0;
+ bzero(&xuio, sizeof(xuio));
+
+ req->pendcnt = 0;
+
+ switch (operation) {
+ case BIO_READ:
+ xuio.uio_rw = UIO_READ;
+ break;
+ case BIO_WRITE:
+ xuio.uio_rw = UIO_WRITE;
+ break;
+ case BIO_FLUSH: {
+ struct mount *mountpoint;
- if (!blkif->ring_connected ||
- blkif->vn == NULL ||
- blkif->state == XenbusStateConnected)
- return;
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
- DPRINTF("%s\n", xdev->otherend);
+ (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
- /* Supply the information about the device the frontend needs */
-again:
- xbt = xenbus_transaction_start();
- if (IS_ERR(xbt)) {
- xenbus_dev_fatal(xdev, PTR_ERR(xbt),
- "Error writing configuration for backend "
- "(start transaction)");
- return;
- }
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
+ VOP_UNLOCK(xbb->vn, 0);
- err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u",
- blkif->media_num_sectors);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/sectors",
- xdev->nodename);
- goto abort;
- }
+ vn_finished_write(mountpoint);
+
+ VFS_UNLOCK_GIANT(vfs_is_locked);
- err = xenbus_printf(xbt, xdev->nodename, "info", "%u",
- blkif->read_only ? VDISK_READONLY : 0);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/info",
- xdev->nodename);
- goto abort;
+ goto bailout_send_response;
+ /* NOTREACHED */
}
- err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u",
- blkif->sector_size);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/sector-size",
- xdev->nodename);
- goto abort;
+ default:
+ panic("invalid operation %d", operation);
+ /* NOTREACHED */
}
+ xuio.uio_offset = (off_t)ring_req->sector_number
+ << xbb->sector_size_shift;
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN)
- goto again;
- if (err)
- xenbus_dev_fatal(xdev, err, "ending transaction");
+ xuio.uio_segflg = UIO_SYSSPACE;
+ xuio.uio_iov = file_data->xiovecs;
+ xuio.uio_iovcnt = 0;
- err = xenbus_switch_state(xdev, NULL, XenbusStateConnected);
- if (err)
- xenbus_dev_fatal(xdev, err, "switching to Connected state",
- xdev->nodename);
+ for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs;
+ seg_idx < nseg; seg_idx++, xbb_sg++) {
- blkif->state = XenbusStateConnected;
+ /*
+ * If the first sector is not 0, the KVA will not be
+ * contiguous and we'll need to go on to another segment.
+ */
+ if (xbb_sg->first_sect != 0)
+ xiovec = NULL;
+
+ if (xiovec == NULL) {
+ xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
+ xiovec->iov_base = xbb_req_ioaddr(req, seg_idx,
+ xbb_sg->first_sect);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /*
+ * Store the address of the incoming buffer at this
+ * particular offset as well, so we can do the copy
+ * later without having to do more work to
+ * recalculate this address.
+ */
+ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
+ *p_vaddr = xbb_req_vaddr(req, seg_idx,
+ xbb_sg->first_sect);
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ xiovec->iov_len = 0;
+ xuio.uio_iovcnt++;
+ }
- return;
+ xiovec->iov_len += xbb_sg->nsect << 9;
- abort:
- xenbus_transaction_end(xbt, 1);
-}
+ xuio.uio_resid += xbb_sg->nsect << 9;
-static int
-blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
-{
- int err;
- char *p, *mode = NULL, *type = NULL, *params = NULL;
- long handle;
+ /*
+ * If the last sector is not the full page size count,
+ * the next segment will not be contiguous in KVA and we
+ * need a new iovec.
+ */
+ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
+ xiovec = NULL;
+ }
- DPRINTF("node=%s\n", xdev->nodename);
+ xuio.uio_td = curthread;
- p = strrchr(xdev->otherend, '/') + 1;
- handle = strtoul(p, NULL, 0);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ saved_uio_iovcnt = xuio.uio_iovcnt;
- mode = xenbus_read(NULL, xdev->nodename, "mode", NULL);
- if (IS_ERR(mode)) {
- xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode");
- err = PTR_ERR(mode);
- goto error;
- }
-
- type = xenbus_read(NULL, xdev->nodename, "type", NULL);
- if (IS_ERR(type)) {
- xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type");
- err = PTR_ERR(type);
- goto error;
- }
-
- params = xenbus_read(NULL, xdev->nodename, "params", NULL);
- if (IS_ERR(type)) {
- xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params");
- err = PTR_ERR(params);
- goto error;
- }
-
- err = blkif_create(xdev, handle, mode, type, params);
- if (err) {
- xenbus_dev_fatal(xdev, err, "creating blkif");
- goto error;
- }
+ if (operation == BIO_WRITE) {
+ /* Copy the write data to the local buffer. */
+ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+ xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
+ seg_idx++, xiovec++, p_vaddr++) {
- err = vbd_add_dev(xdev);
- if (err) {
- blkif_put((blkif_t *)xdev->data);
- xenbus_dev_fatal(xdev, err, "adding vbd device");
+ memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
+ }
+ } else {
+ /*
+ * We only need to save off the iovecs in the case of a
+ * read, because the copy for the read happens after the
+ * VOP_READ(). (The uio will get modified in that call
+ * sequence.)
+ */
+ memcpy(file_data->saved_xiovecs, xuio.uio_iov,
+ xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
- return err;
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+ switch (operation) {
+ case BIO_READ:
- error:
- if (mode)
- free(mode, M_DEVBUF);
- if (type)
- free(type, M_DEVBUF);
- if (params)
- free(params, M_DEVBUF);
- return err;
-}
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
-static int
-blkback_remove(struct xenbus_device *xdev)
-{
- blkif_t *blkif = xdev->data;
- device_t ndev;
+ /*
+ * UFS pays attention to IO_DIRECT for reads. If the
+ * DIRECTIO option is configured into the kernel, it calls
+ * ffs_rawread(). But that only works for single-segment
+ * uios with user space addresses. In our case, with a
+ * kernel uio, it still reads into the buffer cache, but it
+ * will just try to release the buffer from the cache later
+ * on in ffs_read().
+ *
+ * ZFS does not pay attention to IO_DIRECT for reads.
+ *
+ * UFS does not pay attention to IO_SYNC for reads.
+ *
+ * ZFS pays attention to IO_SYNC (which translates into the
+ * Solaris define FRSYNC for zfs_read()) for reads. It
+ * attempts to sync the file before reading.
+ *
+ * So, to attempt to provide some barrier semantics in the
+ * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
+ */
+ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+ (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
- DPRINTF("node=%s\n", xdev->nodename);
+ VOP_UNLOCK(xbb->vn, 0);
+ break;
+ case BIO_WRITE: {
+ struct mount *mountpoint;
- blkif->state = XenbusStateClosing;
+ (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
- if ((ndev = blkif->ndev)) {
- blkif->ndev = NULL;
- mtx_lock(&Giant);
- device_detach(ndev);
- mtx_unlock(&Giant);
- }
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
- xdev->data = NULL;
- blkif->xdev = NULL;
- blkif_put(blkif);
+ /*
+ * UFS pays attention to IO_DIRECT for writes. The write
+ * is done asynchronously. (Normally the write would just
+ * get put into the cache.)
+ *
+ * UFS pays attention to IO_SYNC for writes. It will
+ * attempt to write the buffer out synchronously if that
+ * flag is set.
+ *
+ * ZFS does not pay attention to IO_DIRECT for writes.
+ *
+ * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
+ * for writes. It will flush the transaction from the
+ * cache before returning.
+ *
+ * So if we've got the BIO_ORDERED flag set, we want
+ * IO_SYNC in either the UFS or ZFS case.
+ */
+ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+ IO_SYNC : 0, file_data->cred);
+ VOP_UNLOCK(xbb->vn, 0);
- return 0;
-}
+ vn_finished_write(mountpoint);
-static int
-blkback_resume(struct xenbus_device *xdev)
-{
- DPRINTF("node=%s\n", xdev->nodename);
- return 0;
+ break;
+ }
+ default:
+ panic("invalid operation %d", operation);
+ /* NOTREACHED */
+ }
+ VFS_UNLOCK_GIANT(vfs_is_locked);
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /* We only need to copy here for read operations */
+ if (operation == BIO_READ) {
+
+ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+ xiovec = file_data->saved_xiovecs;
+ seg_idx < saved_uio_iovcnt; seg_idx++,
+ xiovec++, p_vaddr++) {
+
+ /*
+ * Note that we have to use the copy of the
+ * io vector we made above. uiomove() modifies
+ * the uio and its referenced vector as uiomove
+ * performs the copy, so we can't rely on any
+ * state from the original uio.
+ */
+ memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
+ }
+ }
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+bailout_send_response:
+
+ /*
+ * All I/O is already done, send the response. A lock is not
+ * necessary here because we're single threaded, and therefore the
+ * only context accessing this request right now. If that changes,
+ * we may need some locking here.
+ */
+ xbb_unmap_req(req);
+ xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY :
+ BLKIF_RSP_ERROR);
+ devstat_end_transaction(xbb->xbb_stats,
+ /*bytes*/error == 0 ? req->nr_512b_sectors << 9
+ : 0,
+ req->ds_tag_type,
+ req->ds_trans_type,
+ /*now*/NULL,
+ /*then*/&req->ds_t0);
+ xbb_release_req(xbb, req);
+
+ return (0);
}
+/*--------------------------- Backend Configuration --------------------------*/
+/**
+ * Close and cleanup any backend device/file specific state for this
+ * block back instance.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
static void
-frontend_changed(struct xenbus_device *xdev,
- XenbusState frontend_state)
+xbb_close_backend(struct xbb_softc *xbb)
{
- blkif_t *blkif = xdev->data;
+ DROP_GIANT();
+ DPRINTF("closing dev=%s\n", xbb->dev_name);
+ if (xbb->vn) {
+ int flags = FREAD;
+ int vfs_is_locked = 0;
- DPRINTF("state=%d\n", frontend_state);
+ if ((xbb->flags & XBBF_READ_ONLY) == 0)
+ flags |= FWRITE;
- blkif->frontend_state = frontend_state;
+ switch (xbb->device_type) {
+ case XBB_TYPE_DISK:
+ if (xbb->backend.dev.csw) {
+ dev_relthread(xbb->backend.dev.cdev,
+ xbb->backend.dev.dev_ref);
+ xbb->backend.dev.csw = NULL;
+ xbb->backend.dev.cdev = NULL;
+ }
+ break;
+ case XBB_TYPE_FILE:
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+ break;
+ case XBB_TYPE_NONE:
+ default:
+ panic("Unexpected backend type.");
+ break;
+ }
- switch (frontend_state) {
- case XenbusStateInitialising:
- break;
- case XenbusStateInitialised:
- case XenbusStateConnected:
- connect_ring(blkif);
- connect(blkif);
- break;
- case XenbusStateClosing:
- xenbus_switch_state(xdev, NULL, XenbusStateClosing);
- break;
- case XenbusStateClosed:
- xenbus_remove_device(xdev);
- break;
- case XenbusStateUnknown:
- case XenbusStateInitWait:
- xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend",
- frontend_state);
- break;
+ (void)vn_close(xbb->vn, flags, NOCRED, curthread);
+ xbb->vn = NULL;
+
+ switch (xbb->device_type) {
+ case XBB_TYPE_DISK:
+ break;
+ case XBB_TYPE_FILE:
+ VFS_UNLOCK_GIANT(vfs_is_locked);
+ if (xbb->backend.file.cred != NULL) {
+ crfree(xbb->backend.file.cred);
+ xbb->backend.file.cred = NULL;
+ }
+ break;
+ case XBB_TYPE_NONE:
+ default:
+ panic("Unexpected backend type.");
+ break;
+ }
}
+ PICKUP_GIANT();
}
-/* ** Driver registration ** */
-
-static struct xenbus_device_id blkback_ids[] = {
- { "vbd" },
- { "" }
-};
+/**
+ * Open a character device to be used for backend I/O.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_open_dev(struct xbb_softc *xbb)
+{
+ struct vattr vattr;
+ struct cdev *dev;
+ struct cdevsw *devsw;
+ int error;
+
+ xbb->device_type = XBB_TYPE_DISK;
+ xbb->dispatch_io = xbb_dispatch_dev;
+ xbb->backend.dev.cdev = xbb->vn->v_rdev;
+ xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
+ &xbb->backend.dev.dev_ref);
+ if (xbb->backend.dev.csw == NULL)
+ panic("Unable to retrieve device switch");
+
+ error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
+ if (error) {
+ xenbus_dev_fatal(xbb->dev, error, "error getting "
+ "vnode attributes for device %s",
+ xbb->dev_name);
+ return (error);
+ }
-static struct xenbus_driver blkback = {
- .name = "blkback",
- .ids = blkback_ids,
- .probe = blkback_probe,
- .remove = blkback_remove,
- .resume = blkback_resume,
- .otherend_changed = frontend_changed,
-};
-static void
-blkback_init(void *unused)
-{
- int i;
-
- TASK_INIT(&blk_req_task, 0, blk_req_action, NULL);
- mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF);
-
- mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF);
-
- mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
- pending_reqs = malloc(sizeof(pending_reqs[0]) *
- blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT);
- pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) *
- mmap_pages, M_DEVBUF, M_NOWAIT);
- pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) *
- mmap_pages, M_DEVBUF, M_NOWAIT);
- mmap_vstart = alloc_empty_page_range(mmap_pages);
- if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) {
- if (pending_reqs)
- free(pending_reqs, M_DEVBUF);
- if (pending_grant_handles)
- free(pending_grant_handles, M_DEVBUF);
- if (pending_vaddrs)
- free(pending_vaddrs, M_DEVBUF);
- WPRINTF("out of memory\n");
- return;
+ dev = xbb->vn->v_rdev;
+ devsw = dev->si_devsw;
+ if (!devsw->d_ioctl) {
+ xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
+ "device %s!", xbb->dev_name);
+ return (ENODEV);
}
- for (i = 0; i < mmap_pages; i++) {
- pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
- pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+ error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
+ (caddr_t)&xbb->sector_size, FREAD,
+ curthread);
+ if (error) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling ioctl DIOCGSECTORSIZE "
+ "for device %s", xbb->dev_name);
+ return (error);
}
- for (i = 0; i < blkif_reqs; i++) {
- STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list);
+ error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
+ (caddr_t)&xbb->media_size, FREAD,
+ curthread);
+ if (error) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling ioctl DIOCGMEDIASIZE "
+ "for device %s", xbb->dev_name);
+ return (error);
}
- DPRINTF("registering %s\n", blkback.name);
- xenbus_register_backend(&blkback);
+ return (0);
}
-SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL)
-
-static void
-close_device(blkif_t *blkif)
+/**
+ * Open a file to be used for backend I/O.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_open_file(struct xbb_softc *xbb)
{
- DPRINTF("closing dev=%s\n", blkif->dev_name);
- if (blkif->vn) {
- int flags = FREAD;
-
- if (!blkif->read_only)
- flags |= FWRITE;
+ struct xbb_file_data *file_data;
+ struct vattr vattr;
+ int error;
+
+ file_data = &xbb->backend.file;
+ xbb->device_type = XBB_TYPE_FILE;
+ xbb->dispatch_io = xbb_dispatch_file;
+ error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling VOP_GETATTR()"
+ "for file %s", xbb->dev_name);
+ return (error);
+ }
- if (blkif->csw) {
- dev_relthread(blkif->cdev);
- blkif->csw = NULL;
+ /*
+ * Verify that we have the ability to upgrade to exclusive
+ * access on this file so we can trap errors at open instead
+ * of reporting them during first access.
+ */
+ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
+ vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
+ if (xbb->vn->v_iflag & VI_DOOMED) {
+ error = EBADF;
+ xenbus_dev_fatal(xbb->dev, error,
+ "error locking file %s",
+ xbb->dev_name);
+
+ return (error);
}
+ }
- (void)vn_close(blkif->vn, flags, NOCRED, curthread);
- blkif->vn = NULL;
+ file_data->cred = crhold(curthread->td_ucred);
+ xbb->media_size = vattr.va_size;
+
+ /*
+ * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
+ * With ZFS, it is 131072 bytes. Block sizes that large don't work
+ * with disklabel and UFS on FreeBSD at least. Large block sizes
+ * may not work with other OSes either. So just export a sector
+ * size of 512 bytes, which should work with any OS or
+ * application. Since our backing is a file, any block size will
+ * work fine for the backing store.
+ */
+#if 0
+ xbb->sector_size = vattr.va_blocksize;
+#endif
+ xbb->sector_size = 512;
+
+ /*
+ * Sanity check. The media size has to be at least one
+ * sector long.
+ */
+ if (xbb->media_size < xbb->sector_size) {
+ error = EINVAL;
+ xenbus_dev_fatal(xbb->dev, error,
+ "file %s size %ju < block size %u",
+ xbb->dev_name,
+ (uintmax_t)xbb->media_size,
+ xbb->sector_size);
}
+ return (error);
}
+/**
+ * Open the backend provider for this connection.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
static int
-open_device(blkif_t *blkif)
+xbb_open_backend(struct xbb_softc *xbb)
{
struct nameidata nd;
- struct vattr vattr;
- struct cdev *dev;
- struct cdevsw *devsw;
- int flags = FREAD, err = 0;
+ int flags;
+ int error;
+ int vfs_is_locked;
- DPRINTF("opening dev=%s\n", blkif->dev_name);
+ flags = FREAD;
+ error = 0;
- if (!blkif->read_only)
+ DPRINTF("opening dev=%s\n", xbb->dev_name);
+
+ if ((xbb->flags & XBBF_READ_ONLY) == 0)
flags |= FWRITE;
if (!curthread->td_proc->p_fd->fd_cdir) {
@@ -1066,284 +1930,1045 @@ open_device(blkif_t *blkif)
}
again:
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread);
- err = vn_open(&nd, &flags, 0, -1);
- if (err) {
- if (blkif->dev_name[0] != '/') {
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error) {
+ /*
+ * This is the only reasonable guess we can make as to the
+ * path if the user doesn't give us a fully qualified one.
+ * If they want to specify a file, they need to specify the
+ * full path.
+ */
+ if (xbb->dev_name[0] != '/') {
char *dev_path = "/dev/";
char *dev_name;
/* Try adding device path at beginning of name */
- dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT);
+ dev_name = malloc(strlen(xbb->dev_name)
+ + strlen(dev_path) + 1,
+ M_XENBLOCKBACK, M_NOWAIT);
if (dev_name) {
- sprintf(dev_name, "%s%s", dev_path, blkif->dev_name);
- free(blkif->dev_name, M_DEVBUF);
- blkif->dev_name = dev_name;
+ sprintf(dev_name, "%s%s", dev_path,
+ xbb->dev_name);
+ free(xbb->dev_name, M_XENBLOCKBACK);
+ xbb->dev_name = dev_name;
goto again;
}
}
- xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name);
- return err;
+ xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
+ xbb->dev_name);
+ return (error);
}
+
+ vfs_is_locked = NDHASGIANT(&nd);
+
NDFREE(&nd, NDF_ONLY_PNBUF);
- blkif->vn = nd.ni_vp;
+ xbb->vn = nd.ni_vp;
+
+ /* We only support disks and files. */
+ if (vn_isdisk(xbb->vn, &error)) {
+ error = xbb_open_dev(xbb);
+ } else if (xbb->vn->v_type == VREG) {
+ error = xbb_open_file(xbb);
+ } else {
+ error = EINVAL;
+ xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
+ "or file", xbb->dev_name);
+ }
+ VOP_UNLOCK(xbb->vn, 0);
+ VFS_UNLOCK_GIANT(vfs_is_locked);
- /* We only support disks for now */
- if (!vn_isdisk(blkif->vn, &err)) {
- xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name);
- VOP_UNLOCK(blkif->vn, 0, curthread);
- goto error;
+ if (error != 0) {
+ xbb_close_backend(xbb);
+ return (error);
}
- blkif->cdev = blkif->vn->v_rdev;
- blkif->csw = dev_refthread(blkif->cdev);
- PANIC_IF(blkif->csw == NULL);
+ xbb->sector_size_shift = fls(xbb->sector_size) - 1;
+ xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
+
+ DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
+ (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
+ xbb->dev_name, xbb->sector_size, xbb->media_size);
+
+ return (0);
+}
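
The sector arithmetic at the end of xbb_open_backend() relies on sector sizes being powers of two: fls(sector_size) - 1 yields log2 of the size, so converting a media size to a sector count becomes a shift. A minimal userland sketch of the same math (values are illustrative; fls() here is the FreeBSD <strings.h> routine):

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>            /* fls(3) on FreeBSD */

    int
    main(void)
    {
            uint32_t sector_size = 512;
            uint64_t media_size = 10ULL << 30;    /* 10 GiB, illustrative */
            int shift = fls(sector_size) - 1;     /* 9 for 512-byte sectors */

            /* Same as media_size / sector_size for power-of-two sizes. */
            printf("shift=%d sectors=%ju\n", shift,
                (uintmax_t)(media_size >> shift));
            return (0);
    }
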
- err = VOP_GETATTR(blkif->vn, &vattr, NOCRED);
- if (err) {
- xenbus_dev_fatal(blkif->xdev, err,
- "error getting vnode attributes for device %s", blkif->dev_name);
- VOP_UNLOCK(blkif->vn, 0, curthread);
- goto error;
+/*------------------------ Inter-Domain Communication ------------------------*/
+/**
+ * Cleanup all inter-domain communication mechanisms.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_disconnect(struct xbb_softc *xbb)
+{
+ struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
+ struct gnttab_unmap_grant_ref *op;
+ u_int ring_idx;
+ int error;
+
+ DPRINTF("\n");
+
+ if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
+ return;
+
+ if (xbb->irq != 0) {
+ unbind_from_irqhandler(xbb->irq);
+ xbb->irq = 0;
}
- VOP_UNLOCK(blkif->vn, 0, curthread);
+ for (ring_idx = 0, op = ops;
+ ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++, op++) {
- dev = blkif->vn->v_rdev;
- devsw = dev->si_devsw;
- if (!devsw->d_ioctl) {
- err = ENODEV;
- xenbus_dev_fatal(blkif->xdev, err,
- "no d_ioctl for device %s!", blkif->dev_name);
- goto error;
+ op->host_addr = xbb->ring_config.gnt_addr
+ + (ring_idx * PAGE_SIZE);
+ op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
+ op->handle = xbb->ring_config.handle[ring_idx];
}
- err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread);
- if (err) {
- xenbus_dev_fatal(blkif->xdev, err,
- "error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name);
- goto error;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
+ xbb->ring_config.ring_pages);
+ if (error != 0)
+ panic("Grant table op failed (%d)", error);
+
+ xbb->flags &= ~XBBF_RING_CONNECTED;
+}
+
+/**
+ * Map shared memory ring into domain local address space, initialize
+ * ring control structures, and bind an interrupt to the event channel
+ * used to notify us of ring changes.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_connect_ring(struct xbb_softc *xbb)
+{
+ struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
+ struct gnttab_map_grant_ref *gnt;
+ u_int ring_idx;
+ int error;
+
+ if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
+ return (0);
+
+ /*
+ * Kva for our ring is at the tail of the region of kva allocated
+ * by xbb_alloc_communication_mem().
+ */
+ xbb->ring_config.va = xbb->kva
+ + (xbb->kva_size
+ - (xbb->ring_config.ring_pages * PAGE_SIZE));
+ xbb->ring_config.gnt_addr = xbb->gnt_base_addr
+ + (xbb->kva_size
+ - (xbb->ring_config.ring_pages * PAGE_SIZE));
+
+ for (ring_idx = 0, gnt = gnts;
+ ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++, gnt++) {
+
+ gnt->host_addr = xbb->ring_config.gnt_addr
+ + (ring_idx * PAGE_SIZE);
+ gnt->flags = GNTMAP_host_map;
+ gnt->ref = xbb->ring_config.ring_ref[ring_idx];
+ gnt->dom = xbb->otherend_id;
}
- blkif->sector_size_shift = fls(blkif->sector_size) - 1;
- err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread);
- if (err) {
- xenbus_dev_fatal(blkif->xdev, err,
- "error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name);
- goto error;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
+ xbb->ring_config.ring_pages);
+ if (error)
+ panic("blkback: Ring page grant table op failed (%d)", error);
+
+ for (ring_idx = 0, gnt = gnts;
+ ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++, gnt++) {
+ if (gnt->status != 0) {
+ xbb->ring_config.va = 0;
+ xenbus_dev_fatal(xbb->dev, EACCES,
+ "Ring shared page mapping failed. "
+ "Status %d.", gnt->status);
+ return (EACCES);
+ }
+ xbb->ring_config.handle[ring_idx] = gnt->handle;
+ xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
}
- blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift;
- blkif->major = major(vattr.va_rdev);
- blkif->minor = minor(vattr.va_rdev);
+ /* Initialize the ring based on ABI. */
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ blkif_sring_t *sring;
+ sring = (blkif_sring_t *)xbb->ring_config.va;
+ BACK_RING_INIT(&xbb->rings.native, sring,
+ xbb->ring_config.ring_pages * PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ blkif_x86_32_sring_t *sring_x86_32;
+ sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
+ BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
+ xbb->ring_config.ring_pages * PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ blkif_x86_64_sring_t *sring_x86_64;
+ sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
+ BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
+ xbb->ring_config.ring_pages * PAGE_SIZE);
+ break;
+ }
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ }
- DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n",
- blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size);
+ xbb->flags |= XBBF_RING_CONNECTED;
+
+ error =
+ bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
+ xbb->ring_config.evtchn,
+ device_get_nameunit(xbb->dev),
+ xbb_intr, /*arg*/xbb,
+ INTR_TYPE_BIO | INTR_MPSAFE,
+ &xbb->irq);
+ if (error) {
+ xbb_disconnect(xbb);
+ xenbus_dev_fatal(xbb->dev, error, "binding event channel");
+ return (error);
+ }
- return 0;
+ DPRINTF("rings connected!\n");
- error:
- close_device(blkif);
- return err;
+ return (0);
}
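
For context on the BACK_RING_INIT() sizing above: the ring macros carve the shared pages into a power-of-two count of fixed-size request/response slots that follow a small producer/consumer header. A rough stand-alone model of that computation (the 64-byte header and 112-byte blkif entry are assumptions based on the classic 4 KiB-page layout, not values taken from this patch):

    #include <stdio.h>

    /* Hypothetical stand-ins for the xen/interface/io/ring.h math. */
    #define RING_HDR_BYTES   64     /* assumed sring header size */
    #define BLKIF_ENTRY_SIZE 112    /* assumed request/response union size */

    static unsigned
    ring_entries(unsigned ring_bytes)
    {
            unsigned n = (ring_bytes - RING_HDR_BYTES) / BLKIF_ENTRY_SIZE;

            /* __RING_SIZE rounds down to a power of two. */
            while (n & (n - 1))
                    n &= n - 1;
            return (n);
    }

    int
    main(void)
    {
            printf("%u\n", ring_entries(4096));     /* 32 with these numbers */
            printf("%u\n", ring_entries(4 * 4096)); /* 128 for a 4-page ring */
            return (0);
    }
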
+/**
+ * Size KVA and pseudo-physical address allocations based on negotiated
+ * values for the size and number of I/O requests, and the size of our
+ * communication ring.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * These address spaces are used to dynamically map pages in the
+ * front-end's domain into our own.
+ */
static int
-vbd_add_dev(struct xenbus_device *xdev)
+xbb_alloc_communication_mem(struct xbb_softc *xbb)
{
- blkif_t *blkif = xdev->data;
- device_t nexus, ndev;
- devclass_t dc;
- int err = 0;
+ xbb->kva_size = (xbb->ring_config.ring_pages
+ + (xbb->max_requests * xbb->max_request_segments))
+ * PAGE_SIZE;
+#ifndef XENHVM
+ xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
+ if (xbb->kva == 0)
+ return (ENOMEM);
+ xbb->gnt_base_addr = xbb->kva;
+#else /* XENHVM */
+ /*
+ * Reserve a range of pseudo physical memory that we can map
+ * into kva. These pages will only be backed by machine
+ * pages ("real memory") during the lifetime of front-end requests
+ * via grant table operations.
+ */
+ xbb->pseudo_phys_res_id = 0;
+ xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
+ &xbb->pseudo_phys_res_id,
+ 0, ~0, xbb->kva_size,
+ RF_ACTIVE);
+ if (xbb->pseudo_phys_res == NULL) {
+ xbb->kva = 0;
+ return (ENOMEM);
+ }
+ xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
+ xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
+#endif /* XENHVM */
+ return (0);
+}
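
The kva_size formula above reserves one page per potential request segment plus the ring pages themselves. Plugging in the attach-time defaults as a worked example (all numbers illustrative, assuming 4 KiB pages, a single-page 32-entry ring, and 11 segments per request; the negotiated values may differ):

    #include <stdio.h>

    int
    main(void)
    {
            unsigned page_size            = 4096;
            unsigned ring_pages           = 1;
            unsigned max_requests         = 32;  /* BLKIF_MAX_RING_REQUESTS(4096) */
            unsigned max_request_segments = 11;  /* classic per-request limit */

            unsigned long kva_size = (ring_pages
                + (unsigned long)max_requests * max_request_segments) * page_size;

            printf("kva_size = %lu bytes (~%.2f MiB)\n",
                kva_size, kva_size / (1024.0 * 1024.0));
            return (0);
    }
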
- mtx_lock(&Giant);
+/**
+ * Free dynamically allocated KVA or pseudo-physical address allocations.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_free_communication_mem(struct xbb_softc *xbb)
+{
+ if (xbb->kva != 0) {
+#ifndef XENHVM
+ kmem_free(kernel_map, xbb->kva, xbb->kva_size);
+#else
+ if (xbb->pseudo_phys_res != NULL) {
+ bus_release_resource(xbb->dev, SYS_RES_MEMORY,
+ xbb->pseudo_phys_res_id,
+ xbb->pseudo_phys_res);
+ xbb->pseudo_phys_res = NULL;
+ }
+#endif
+ }
+ xbb->kva = 0;
+ xbb->gnt_base_addr = 0;
+}
- /* We will add a vbd device as a child of nexus0 (for now) */
- if (!(dc = devclass_find("nexus")) ||
- !(nexus = devclass_get_device(dc, 0))) {
- WPRINTF("could not find nexus0!\n");
- err = ENOENT;
- goto done;
+/**
+ * Collect front-end information from the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_collect_frontend_info(struct xbb_softc *xbb)
+{
+ char protocol_abi[64];
+ const char *otherend_path;
+ int error;
+ u_int ring_idx;
+
+ otherend_path = xenbus_get_otherend_path(xbb->dev);
+
+ /*
+ * Mandatory data (used in all versions of the protocol) first.
+ */
+ error = xs_gather(XST_NIL, otherend_path,
+ "ring-ref", "%" PRIu32,
+ &xbb->ring_config.ring_ref[0],
+ "event-channel", "%" PRIu32,
+ &xbb->ring_config.evtchn,
+ NULL);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to retrieve ring information from "
+ "frontend %s. Unable to connect.",
+ xenbus_get_otherend_path(xbb->dev));
+ return (error);
}
+ /*
+ * These fields are initialized to legacy protocol defaults
+ * so we only need to fail if reading the updated value succeeds
+ * and the new value is outside of its allowed range.
+ *
+ * \note xs_gather() returns on the first encountered error, so
+ * we must use independent calls in order to guarantee
+ * we don't miss information in a sparsely populated front-end
+ * tree.
+ */
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "ring-pages", NULL, "%" PRIu32,
+ &xbb->ring_config.ring_pages);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-requests", NULL, "%" PRIu32,
+ &xbb->max_requests);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-segments", NULL, "%" PRIu32,
+ &xbb->max_request_segments);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-size", NULL, "%" PRIu32,
+ &xbb->max_request_size);
+
+ if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specificed ring-pages of %u "
+ "exceeds backend limit of %zu. "
+ "Unable to connect.",
+ xbb->ring_config.ring_pages,
+ XBB_MAX_RING_PAGES);
+ return (EINVAL);
+ } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specificed max_requests of %u "
+ "exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_requests,
+ XBB_MAX_REQUESTS);
+ return (EINVAL);
+ } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specificed max_requests_segments "
+ "of %u exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_request_segments,
+ XBB_MAX_SEGMENTS_PER_REQUEST);
+ return (EINVAL);
+ } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specificed max_request_size "
+ "of %u exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_request_size,
+ XBB_MAX_REQUEST_SIZE);
+ return (EINVAL);
+ }
- /* Create a newbus device representing the vbd */
- ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle);
- if (!ndev) {
- WPRINTF("could not create newbus device vbd%d!\n", blkif->handle);
- err = EFAULT;
- goto done;
+ /* If using a multi-page ring, pull in the remaining references. */
+ for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
+ char ring_ref_name[] = "ring_refXX";
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name),
+ "ring-ref%u", ring_idx);
+ error = xs_scanf(XST_NIL, otherend_path,
+ ring_ref_name, NULL, "%" PRIu32,
+ &xbb->ring_config.ring_ref[ring_idx]);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Failed to retriev grant reference "
+ "for page %u of shared ring. Unable "
+ "to connect.", ring_idx);
+ return (error);
+ }
}
-
- blkif_get(blkif);
- device_set_ivars(ndev, blkif);
- blkif->ndev = ndev;
- device_probe_and_attach(ndev);
+ error = xs_gather(XST_NIL, otherend_path,
+ "protocol", "%63s", protocol_abi,
+ NULL);
+ if (error != 0
+ || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
+ /*
+ * Assume native if the frontend has not
+ * published ABI data or it has published and
+ * matches our own ABI.
+ */
+ xbb->abi = BLKIF_PROTOCOL_NATIVE;
+ } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
- done:
+ xbb->abi = BLKIF_PROTOCOL_X86_32;
+ } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
- mtx_unlock(&Giant);
+ xbb->abi = BLKIF_PROTOCOL_X86_64;
+ } else {
- return err;
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Unknown protocol ABI (%s) published by "
+ "frontend. Unable to connect.", protocol_abi);
+ return (EINVAL);
+ }
+ return (0);
}
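
For reference, xbb_collect_frontend_info() reads a frontend XenStore tree shaped roughly as follows; only ring-ref and event-channel are mandatory, and everything else falls back to the legacy defaults (paths and values below are illustrative):

    /local/domain/<front-domid>/device/vbd/<handle>/
        ring-ref             = "8"          (mandatory)
        event-channel        = "15"         (mandatory)
        ring-pages           = "4"          (optional; default 1)
        ring-ref1..ring-ref3                (required when ring-pages > 1)
        max-requests         = "128"        (optional)
        max-request-segments = "11"         (optional)
        max-request-size     = "40960"      (optional)
        protocol             = "x86_64-abi" (optional; native assumed)
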
-enum {
- VBD_SYSCTL_DOMID,
- VBD_SYSCTL_ST_RD_REQ,
- VBD_SYSCTL_ST_WR_REQ,
- VBD_SYSCTL_ST_OO_REQ,
- VBD_SYSCTL_ST_ERR_REQ,
- VBD_SYSCTL_RING,
-};
-
-static char *
-vbd_sysctl_ring_info(blkif_t *blkif, int cmd)
+/**
+ * Allocate per-request data structures given request size and number
+ * information negotiated with the front-end.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_alloc_requests(struct xbb_softc *xbb)
{
- char *buf = malloc(256, M_DEVBUF, M_WAITOK);
- if (buf) {
- if (!blkif->ring_connected)
- sprintf(buf, "ring not connected\n");
- else {
- blkif_back_ring_t *ring = &blkif->ring;
- sprintf(buf, "nr_ents=%x req_cons=%x"
- " req_prod=%x req_event=%x"
- " rsp_prod=%x rsp_event=%x",
- ring->nr_ents, ring->req_cons,
- ring->sring->req_prod, ring->sring->req_event,
- ring->sring->rsp_prod, ring->sring->rsp_event);
+ struct xbb_xen_req *req;
+ struct xbb_xen_req *last_req;
+ uint8_t *req_kva;
+ u_long gnt_base;
+
+ /*
+ * Allocate request bookkeeping data structures.
+ */
+ xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
+ M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
+ if (xbb->requests == NULL) {
+ xenbus_dev_fatal(xbb->dev, ENOMEM,
+ "Unable to allocate request structures");
+ return (ENOMEM);
+ }
+
+ req_kva = (uint8_t *)xbb->kva;
+ gnt_base = xbb->gnt_base_addr;
+ req = xbb->requests;
+ last_req = &xbb->requests[xbb->max_requests - 1];
+ while (req <= last_req) {
+ int seg;
+
+ req->xbb = xbb;
+ req->kva = req_kva;
+ req->gnt_handles = malloc(xbb->max_request_segments
+ * sizeof(*req->gnt_handles),
+ M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
+ if (req->gnt_handles == NULL) {
+ xenbus_dev_fatal(xbb->dev, ENOMEM,
+ "Unable to allocate request "
+ "grant references");
+ return (ENOMEM);
+ }
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ req->bounce = malloc(xbb->max_request_size,
+ M_XENBLOCKBACK, M_NOWAIT);
+ if (req->bounce == NULL) {
+ xenbus_dev_fatal(xbb->dev, ENOMEM,
+ "Unable to allocate request "
+ "bounce buffers");
+ return (ENOMEM);
}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ req->gnt_base = gnt_base;
+ req_kva += xbb->max_request_segments * PAGE_SIZE;
+ gnt_base += xbb->max_request_segments * PAGE_SIZE;
+ SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+
+ for (seg = 0; seg < xbb->max_request_segments; seg++)
+ req->gnt_handles[seg] = GRANT_REF_INVALID;
+
+ req++;
}
- return buf;
+ return (0);
}
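
The loop above hands each request a contiguous window of max_request_segments pages out of the KVA and grant-address ranges reserved by xbb_alloc_communication_mem(), so a request's i-th segment always maps at a fixed, computable offset. A small sketch of the address carving (base address and sizes are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uintptr_t kva_base     = 0xC0000000;  /* illustrative */
            unsigned  page_size    = 4096;
            unsigned  max_segments = 11;
            unsigned  i;

            /* Each request owns a max_segments-page slice of the region. */
            for (i = 0; i < 3; i++)
                    printf("req[%u].kva = %#jx\n", i,
                        (uintmax_t)(kva_base
                        + (uintptr_t)i * max_segments * page_size));
            return (0);
    }
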
+/**
+ * Supply information about the physical device to the frontend
+ * via XenBus.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
static int
-vbd_sysctl_handler(SYSCTL_HANDLER_ARGS)
+xbb_publish_backend_info(struct xbb_softc *xbb)
{
- device_t dev = (device_t)arg1;
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
- const char *value;
- char *buf = NULL;
- int err;
-
- switch (arg2) {
- case VBD_SYSCTL_DOMID:
- return sysctl_handle_int(oidp, NULL, blkif->domid, req);
- case VBD_SYSCTL_ST_RD_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req);
- case VBD_SYSCTL_ST_WR_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req);
- case VBD_SYSCTL_ST_OO_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req);
- case VBD_SYSCTL_ST_ERR_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req);
- case VBD_SYSCTL_RING:
- value = buf = vbd_sysctl_ring_info(blkif, arg2);
- break;
- default:
- return (EINVAL);
+ struct xs_transaction xst;
+ const char *our_path;
+ const char *leaf;
+ int error;
+
+ our_path = xenbus_get_node(xbb->dev);
+ while (1) {
+ error = xs_transaction_start(&xst);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Error publishing backend info "
+ "(start transaction)");
+ return (error);
+ }
+
+ leaf = "sectors";
+ error = xs_printf(xst, our_path, leaf,
+ "%"PRIu64, xbb->media_num_sectors);
+ if (error != 0)
+ break;
+
+ /* XXX Support all VBD attributes here. */
+ leaf = "info";
+ error = xs_printf(xst, our_path, leaf, "%u",
+ xbb->flags & XBBF_READ_ONLY
+ ? VDISK_READONLY : 0);
+ if (error != 0)
+ break;
+
+ leaf = "sector-size";
+ error = xs_printf(xst, our_path, leaf, "%u",
+ xbb->sector_size);
+ if (error != 0)
+ break;
+
+ error = xs_transaction_end(xst, 0);
+ if (error == 0) {
+ return (0);
+ } else if (error != EAGAIN) {
+ xenbus_dev_fatal(xbb->dev, error, "ending transaction");
+ return (error);
+ }
}
- err = SYSCTL_OUT(req, value, strlen(value));
- if (buf != NULL)
- free(buf, M_DEVBUF);
+ xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
+ our_path, leaf);
+ xs_transaction_end(xst, 1);
+ return (error);
+}
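
xbb_publish_backend_info() is an instance of the standard XenStore transaction idiom: start a transaction, write the leaves, commit, and redo the whole body if the commit fails with EAGAIN (a concurrent writer raced us); a write failure mid-body instead aborts the transaction. Reduced to a skeleton (hypothetical helper name; xs_* calls as used above; the abort path is elided):

    static int
    publish_backend_info(void)      /* hypothetical distillation */
    {
            struct xs_transaction xst;
            int error;

            for (;;) {
                    error = xs_transaction_start(&xst);
                    if (error != 0)
                            return (error);

                    /* ... one xs_printf(xst, path, leaf, ...) per value ... */

                    error = xs_transaction_end(xst, /*abort*/0);
                    if (error != EAGAIN)
                            return (error); /* 0 on commit, errno otherwise */
                    /* EAGAIN: another writer invalidated the transaction;
                       restart it and redo every write. */
            }
    }
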
+
+/**
+ * Connect to our blkfront peer now that it has completed publishing
+ * its configuration into the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_connect(struct xbb_softc *xbb)
+{
+ int error;
+
+ if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
+ return;
+
+ if (xbb_collect_frontend_info(xbb) != 0)
+ return;
- return err;
+ /* Allocate resources whose size depends on front-end configuration. */
+ error = xbb_alloc_communication_mem(xbb);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to allocate communication memory");
+ return;
+ }
+
+ error = xbb_alloc_requests(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_alloc_requests(). */
+ return;
+ }
+
+ /*
+ * Connect communication channel.
+ */
+ error = xbb_connect_ring(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_connect_ring(). */
+ return;
+ }
+
+ if (xbb_publish_backend_info(xbb) != 0) {
+ /*
+ * If we can't publish our data, we cannot participate
+ * in this connection, and waiting for a front-end state
+ * change will not help the situation.
+ */
+ xbb_disconnect(xbb);
+ return;
+ }
+
+ /* Ready for I/O. */
+ xenbus_set_state(xbb->dev, XenbusStateConnected);
}
-/* Newbus vbd device driver probe */
+/*-------------------------- Device Teardown Support -------------------------*/
+/**
+ * Perform device shutdown functions.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * Mark this instance as shutting down, wait for any active I/O on the
+ * backend device/file to drain, disconnect from the front-end, and notify
+ * any waiters (e.g. a thread invoking our detach method) that detach can
+ * now proceed.
+ */
static int
-vbd_probe(device_t dev)
+xbb_shutdown(struct xbb_softc *xbb)
{
- DPRINTF("vbd%d\n", device_get_unit(dev));
- return 0;
+ static int in_shutdown;
+
+ DPRINTF("\n");
+
+ /*
+ * Due to the need to drop our mutex during some
+ * xenbus operations, it is possible for two threads
+ * to attempt to close out shutdown processing at
+ * the same time. Tell any caller that hits this
+ * race to try again later.
+ */
+ if (in_shutdown != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Indicate shutdown is in progress. */
+ xbb->flags |= XBBF_SHUTDOWN;
+
+ /* Wait for requests to complete. */
+ if (xbb->active_request_count != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Disconnect from the front-end. */
+ xbb_disconnect(xbb);
+
+ in_shutdown = 1;
+ mtx_unlock(&xbb->lock);
+ xenbus_set_state(xbb->dev, XenbusStateClosed);
+ mtx_lock(&xbb->lock);
+ in_shutdown = 0;
+
+ /* Indicate to xbb_detach() that it is safe to proceed. */
+ wakeup(xbb);
+
+ return (0);
+}
+
+/**
+ * Report an attach time error to the console and Xen, and cleanup
+ * this instance by forcing immediate detach processing.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param err Errno describing the error.
+ * \param fmt Printf style format and arguments
+ */
+static void
+xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
+{
+ va_list ap;
+ va_list ap_hotplug;
+
+ va_start(ap, fmt);
+ va_copy(ap_hotplug, ap);
+ xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-error", fmt, ap_hotplug);
+ va_end(ap_hotplug);
+ xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "error");
+
+ xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
+ va_end(ap);
+
+ xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "online", "0");
+ xbb_detach(xbb->dev);
}
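
Note the va_copy() in xbb_attach_failed(): the argument list is consumed twice, once by xs_vprintf() for the hotplug-error node and once by xenbus_dev_vfatal(), and a va_list may not be traversed a second time without cloning it first. A self-contained illustration of the pattern (names are hypothetical):

    #include <stdarg.h>
    #include <stdio.h>

    static void
    report_twice(const char *fmt, ...)
    {
            va_list ap, ap2;

            va_start(ap, fmt);
            va_copy(ap2, ap);           /* clone before the first traversal */
            vfprintf(stdout, fmt, ap2);
            va_end(ap2);
            vfprintf(stderr, fmt, ap);  /* safe: 'ap' itself is still fresh */
            va_end(ap);
    }

    int
    main(void)
    {
            report_twice("unable to open %s: error %d\n", "/dev/ada0", 5);
            return (0);
    }
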
-/* Newbus vbd device driver attach */
+/*---------------------------- NewBus Entrypoints ----------------------------*/
+/**
+ * Inspect a XenBus device and claim it if is of the appropriate type.
+ *
+ * \param dev NewBus device object representing a candidate XenBus device.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
static int
-vbd_attach(device_t dev)
+xbb_probe(device_t dev)
{
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
-
- DPRINTF("%s\n", blkif->dev_name);
-
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I",
- "domid of frontend");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I",
- "number of read reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I",
- "number of write reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I",
- "number of deferred reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I",
- "number of reqs that returned error");
-#if XEN_BLKBACK_DEBUG
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "ring", CTLFLAG_RD,
- dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A",
- "req ring info");
-#endif
+
+ if (!strcmp(xenbus_get_type(dev), "vbd")) {
+ device_set_desc(dev, "Backend Virtual Block Device");
+ device_quiet(dev);
+ return (0);
+ }
+
+ return (ENXIO);
+}
- if (!open_device(blkif))
- connect(blkif);
+/**
+ * Attach to a XenBus device that has been claimed by our probe routine.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_attach(device_t dev)
+{
+ struct xbb_softc *xbb;
+ int error;
+
+ DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
+
+ /*
+ * Basic initialization.
+ * After this block it is safe to call xbb_detach()
+ * to clean up any allocated data for this instance.
+ */
+ xbb = device_get_softc(dev);
+ xbb->dev = dev;
+ xbb->otherend_id = xenbus_get_otherend_id(dev);
+ TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
+ mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
+ SLIST_INIT(&xbb->request_free_slist);
+
+ /*
+ * Protocol defaults valid even if all negotiation fails.
+ */
+ xbb->ring_config.ring_pages = 1;
+ xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
+ xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+ xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
+
+ /*
+ * Publish protocol capabilities for consumption by the
+ * front-end.
+ */
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "feature-barrier", "1");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "feature-flush-cache", "1");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-requests", "%u", XBB_MAX_REQUESTS);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-requests",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-request-segments", "%u",
+ XBB_MAX_SEGMENTS_PER_REQUEST);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-request-size", "%u",
+ XBB_MAX_REQUEST_SIZE);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-request-size",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ /* Collect physical device information. */
+ error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
+ "device-type", NULL, &xbb->dev_type,
+ NULL);
+ if (error != 0)
+ xbb->dev_type = NULL;
+
+ error = xs_gather(XST_NIL, xenbus_get_node(dev),
+ "mode", NULL, &xbb->dev_mode,
+ "params", NULL, &xbb->dev_name,
+ NULL);
+ if (error != 0) {
+ xbb_attach_failed(xbb, error, "reading backend fields at %s",
+ xenbus_get_node(dev));
+ return (ENXIO);
+ }
+
+ /* Parse fopen style mode flags. */
+ if (strchr(xbb->dev_mode, 'w') == NULL)
+ xbb->flags |= XBBF_READ_ONLY;
+
+ /*
+ * Verify the physical device is present and can support
+ * the desired I/O mode.
+ */
+ DROP_GIANT();
+ error = xbb_open_backend(xbb);
+ PICKUP_GIANT();
+ if (error != 0) {
+ xbb_attach_failed(xbb, error, "Unable to open %s",
+ xbb->dev_name);
+ return (ENXIO);
+ }
- return bus_generic_attach(dev);
+ /* Use devstat(9) for recording statistics. */
+ xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
+ xbb->sector_size,
+ DEVSTAT_ALL_SUPPORTED,
+ DEVSTAT_TYPE_DIRECT
+ | DEVSTAT_TYPE_IF_OTHER,
+ DEVSTAT_PRIORITY_OTHER);
+ /*
+ * Create a taskqueue for doing work that must occur from a
+ * thread context.
+ */
+ xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
+ taskqueue_thread_enqueue,
+ /*context*/&xbb->io_taskqueue);
+ if (xbb->io_taskqueue == NULL) {
+ xbb_attach_failed(xbb, error, "Unable to create taskqueue");
+ return (ENOMEM);
+ }
+
+ taskqueue_start_threads(&xbb->io_taskqueue,
+ /*num threads*/1,
+ /*priority*/PWAIT,
+ /*thread name*/
+ "%s taskq", device_get_nameunit(dev));
+
+ /* Update hot-plug status to satisfy xend. */
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "connected");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ /* Tell the front end that we are ready to connect. */
+ xenbus_set_state(dev, XenbusStateInitWait);
+
+ return (0);
}
-/* Newbus vbd device driver detach */
+/**
+ * Detach from a block back device instance.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ *
+ * \note A block back device may be detached at any time in its life-cycle,
+ * including part way through the attach process. For this reason,
+ * initialization order and the initialization state checks in this
+ * routine must be carefully coupled so that attach time failures
+ * are gracefully handled.
+ */
static int
-vbd_detach(device_t dev)
+xbb_detach(device_t dev)
{
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
+ struct xbb_softc *xbb;
- DPRINTF("%s\n", blkif->dev_name);
+ DPRINTF("\n");
- close_device(blkif);
+ xbb = device_get_softc(dev);
+ mtx_lock(&xbb->lock);
+ while (xbb_shutdown(xbb) == EAGAIN) {
+ msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
+ "xbb_shutdown", 0);
+ }
+ mtx_unlock(&xbb->lock);
+ mtx_destroy(&xbb->lock);
- bus_generic_detach(dev);
+ DPRINTF("\n");
- blkif_put(blkif);
+ taskqueue_free(xbb->io_taskqueue);
+ devstat_remove_entry(xbb->xbb_stats);
- return 0;
+ xbb_close_backend(xbb);
+ xbb_free_communication_mem(xbb);
+
+ if (xbb->dev_mode != NULL) {
+ free(xbb->dev_mode, M_XENBUS);
+ xbb->dev_mode = NULL;
+ }
+
+ if (xbb->dev_type != NULL) {
+ free(xbb->dev_type, M_XENBUS);
+ xbb->dev_type = NULL;
+ }
+
+ if (xbb->dev_name != NULL) {
+ free(xbb->dev_name, M_XENBUS);
+ xbb->dev_name = NULL;
+ }
+
+ if (xbb->requests != NULL) {
+ struct xbb_xen_req *req;
+ struct xbb_xen_req *last_req;
+
+ req = xbb->requests;
+ last_req = &xbb->requests[xbb->max_requests - 1];
+ while (req <= last_req) {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ if (req->bounce != NULL) {
+ free(req->bounce, M_XENBLOCKBACK);
+ req->bounce = NULL;
+ }
+#endif
+ if (req->gnt_handles != NULL) {
+ free (req->gnt_handles, M_XENBLOCKBACK);
+ req->gnt_handles = NULL;
+ }
+ req++;
+ }
+ free(xbb->requests, M_XENBLOCKBACK);
+ xbb->requests = NULL;
+ }
+
+ return (0);
}
-static device_method_t vbd_methods[] = {
+/**
+ * Prepare this block back device for suspension of this VM.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_suspend(device_t dev)
+{
+#ifdef NOT_YET
+ struct xbb_softc *sc = device_get_softc(dev);
+
+ /* Prevent new requests being issued until we fix things up. */
+ mtx_lock(&sc->xb_io_lock);
+ sc->connected = BLKIF_STATE_SUSPENDED;
+ mtx_unlock(&sc->xb_io_lock);
+#endif
+
+ return (0);
+}
+
+/**
+ * Perform any processing required to recover from a suspended state.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_resume(device_t dev)
+{
+ return (0);
+}
+
+/**
+ * Handle state changes expressed via the XenStore by our front-end peer.
+ *
+ * \param dev NewBus device object representing this Xen
+ * Block Back instance.
+ * \param frontend_state The new state of the front-end.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_frontend_changed(device_t dev, XenbusState frontend_state)
+{
+ struct xbb_softc *xbb = device_get_softc(dev);
+
+ DPRINTF("state=%s\n", xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateClosing:
+ break;
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ xbb_connect(xbb);
+ break;
+ case XenbusStateClosed:
+ case XenbusStateInitWait:
+
+ mtx_lock(&xbb->lock);
+ xbb_shutdown(xbb);
+ mtx_unlock(&xbb->lock);
+ break;
+ default:
+ xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+ return (0);
+}
+
+/*---------------------------- NewBus Registration ---------------------------*/
+static device_method_t xbb_methods[] = {
/* Device interface */
- DEVMETHOD(device_probe, vbd_probe),
- DEVMETHOD(device_attach, vbd_attach),
- DEVMETHOD(device_detach, vbd_detach),
+ DEVMETHOD(device_probe, xbb_probe),
+ DEVMETHOD(device_attach, xbb_attach),
+ DEVMETHOD(device_detach, xbb_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
- DEVMETHOD(device_suspend, bus_generic_suspend),
- DEVMETHOD(device_resume, bus_generic_resume),
- {0, 0}
-};
+ DEVMETHOD(device_suspend, xbb_suspend),
+ DEVMETHOD(device_resume, xbb_resume),
-static devclass_t vbd_devclass;
+ /* Xenbus interface */
+ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
-static driver_t vbd_driver = {
- "vbd",
- vbd_methods,
- 0,
+ { 0, 0 }
};
-DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0);
+static driver_t xbb_driver = {
+ "xbbd",
+ xbb_methods,
+ sizeof(struct xbb_softc),
+};
+devclass_t xbb_devclass;
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: t
- * End:
- */
+DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);
diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c
index 6c222ea..8ff8757 100644
--- a/sys/dev/xen/blkfront/blkfront.c
+++ b/sys/dev/xen/blkfront/blkfront.c
@@ -49,8 +49,10 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <sys/bus_dma.h>
+#include <machine/_inttypes.h>
#include <machine/xen/xen-os.h>
#include <machine/xen/xenfunc.h>
+
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn.h>
@@ -68,17 +70,21 @@ __FBSDID("$FreeBSD$");
/* prototypes */
static void xb_free_command(struct xb_command *cm);
static void xb_startio(struct xb_softc *sc);
-static void connect(struct xb_softc *);
+static void blkfront_connect(struct xb_softc *);
static void blkfront_closing(device_t);
static int blkfront_detach(device_t);
-static int talk_to_backend(struct xb_softc *);
static int setup_blkring(struct xb_softc *);
static void blkif_int(void *);
+static void blkfront_initialize(struct xb_softc *);
+#if 0
static void blkif_recover(struct xb_softc *);
-static void blkif_completion(struct xb_command *);
+#endif
+static int blkif_completion(struct xb_command *);
static void blkif_free(struct xb_softc *, int);
static void blkif_queue_cb(void *, bus_dma_segment_t *, int, int);
+MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");
+
#define GRANT_INVALID_REF 0
/* Control whether runtime update of vbds is enabled. */
@@ -113,11 +119,6 @@ static char * blkif_status_name[] = {
#define DPRINTK(fmt, args...)
#endif
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
-
-#define BLKIF_MAXIO (32 * 1024)
-
static int blkif_open(struct disk *dp);
static int blkif_close(struct disk *dp);
static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
@@ -202,8 +203,8 @@ blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
}
int
-xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
- int vdevice, uint16_t vdisk_info, uint16_t sector_size)
+xlvbd_add(struct xb_softc *sc, blkif_sector_t sectors,
+ int vdevice, uint16_t vdisk_info, unsigned long sector_size)
{
int unit, error = 0;
const char *name;
@@ -215,7 +216,6 @@ xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
if (strcmp(name, "xbd"))
device_printf(sc->xb_dev, "attaching as %s%d\n", name, unit);
- memset(&sc->xb_disk, 0, sizeof(sc->xb_disk));
sc->xb_disk = disk_alloc();
sc->xb_disk->d_unit = sc->xb_unit;
sc->xb_disk->d_open = blkif_open;
@@ -227,20 +227,14 @@ xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
sc->xb_disk->d_drv1 = sc;
sc->xb_disk->d_sectorsize = sector_size;
- sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
- sc->xb_disk->d_maxsize = BLKIF_MAXIO;
+ sc->xb_disk->d_mediasize = sectors * sector_size;
+ sc->xb_disk->d_maxsize = sc->max_request_size;
sc->xb_disk->d_flags = 0;
disk_create(sc->xb_disk, DISK_VERSION_00);
return error;
}
-void
-xlvbd_del(struct xb_softc *sc)
-{
-
- disk_destroy(sc->xb_disk);
-}
/************************ end VBD support *****************/
/*
@@ -357,15 +351,16 @@ xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
return (EBUSY);
}
- if (gnttab_alloc_grant_references(
- BLKIF_MAX_SEGMENTS_PER_REQUEST, &cm->gref_head) < 0) {
+ if (gnttab_alloc_grant_references(sc->max_request_segments,
+ &cm->gref_head) != 0) {
xb_free_command(cm);
mtx_unlock(&sc->xb_io_lock);
device_printf(sc->xb_dev, "no more grant allocs?\n");
return (EBUSY);
}
- chunk = length > BLKIF_MAXIO ? BLKIF_MAXIO : length;
+ chunk = length > sc->max_request_size
+ ? sc->max_request_size : length;
cm->data = virtual;
cm->datalen = chunk;
cm->operation = BLKIF_OP_WRITE;
@@ -423,16 +418,18 @@ static int
blkfront_attach(device_t dev)
{
struct xb_softc *sc;
- struct xb_command *cm;
const char *name;
- int error, vdevice, i, unit;
+ int error;
+ int vdevice;
+ int i;
+ int unit;
/* FIXME: Use dynamic device id if this is not set. */
- error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
+ error = xs_scanf(XST_NIL, xenbus_get_node(dev),
"virtual-device", NULL, "%i", &vdevice);
if (error) {
xenbus_dev_fatal(dev, error, "reading virtual-device");
- printf("couldn't find virtual device");
+ device_printf(dev, "Couldn't determine virtual device.\n");
return (error);
}
@@ -447,51 +444,18 @@ blkfront_attach(device_t dev)
xb_initq_ready(sc);
xb_initq_complete(sc);
xb_initq_bio(sc);
-
- /* Allocate parent DMA tag */
- if (bus_dma_tag_create( NULL, /* parent */
- 512, 4096, /* algnmnt, boundary */
- BUS_SPACE_MAXADDR, /* lowaddr */
- BUS_SPACE_MAXADDR, /* highaddr */
- NULL, NULL, /* filter, filterarg */
- BLKIF_MAXIO, /* maxsize */
- BLKIF_MAX_SEGMENTS_PER_REQUEST, /* nsegments */
- PAGE_SIZE, /* maxsegsize */
- BUS_DMA_ALLOCNOW, /* flags */
- busdma_lock_mutex, /* lockfunc */
- &sc->xb_io_lock, /* lockarg */
- &sc->xb_io_dmat)) {
- device_printf(dev, "Cannot allocate parent DMA tag\n");
- return (ENOMEM);
- }
-#ifdef notyet
- if (bus_dma_tag_set(sc->xb_io_dmat, BUS_DMA_SET_MINSEGSZ,
- XBD_SECTOR_SIZE)) {
- device_printf(dev, "Cannot set sector size\n");
- return (EINVAL);
- }
-#endif
+ for (i = 0; i < XBF_MAX_RING_PAGES; i++)
+ sc->ring_ref[i] = GRANT_INVALID_REF;
sc->xb_dev = dev;
sc->vdevice = vdevice;
sc->connected = BLKIF_STATE_DISCONNECTED;
- /* work queue needed ? */
- for (i = 0; i < BLK_RING_SIZE; i++) {
- cm = &sc->shadow[i];
- cm->req.id = i;
- cm->cm_sc = sc;
- if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
- break;
- xb_free_command(cm);
- }
-
/* Front end dir is a number, which is used as the id. */
sc->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
- error = talk_to_backend(sc);
- if (error)
- return (error);
+ /* Wait for backend device to publish its protocol capabilities. */
+ xenbus_set_state(dev, XenbusStateInitialising);
return (0);
}
@@ -512,121 +476,265 @@ blkfront_suspend(device_t dev)
static int
blkfront_resume(device_t dev)
{
+#if 0
struct xb_softc *sc = device_get_softc(dev);
- int err;
DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
+/* XXX This can't work!!! */
blkif_free(sc, 1);
- err = talk_to_backend(sc);
- if (sc->connected == BLKIF_STATE_SUSPENDED && !err)
+ blkfront_initialize(sc);
+ if (sc->connected == BLKIF_STATE_SUSPENDED)
blkif_recover(sc);
-
- return (err);
+#endif
+ return (0);
}
-/* Common code used when first setting up, and when resuming. */
-static int
-talk_to_backend(struct xb_softc *sc)
+static void
+blkfront_initialize(struct xb_softc *sc)
{
- device_t dev;
- struct xenbus_transaction xbt;
- const char *message = NULL;
- int err;
+ const char *otherend_path;
+ const char *node_path;
+ int error;
+ int i;
- /* Create shared ring, alloc event channel. */
- dev = sc->xb_dev;
- err = setup_blkring(sc);
- if (err)
- goto out;
+ if (xenbus_get_state(sc->xb_dev) != XenbusStateInitialising)
+ return;
- again:
- err = xenbus_transaction_start(&xbt);
- if (err) {
- xenbus_dev_fatal(dev, err, "starting transaction");
- goto destroy_blkring;
+ /*
+ * Protocol defaults valid even if negotiation for a
+ * setting fails.
+ */
+ sc->ring_pages = 1;
+ sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
+ sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+ sc->max_request_size = sc->max_request_segments * PAGE_SIZE;
+ sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
+
+ /*
+ * Protocol negotiation.
+ *
+ * \note xs_gather() returns on the first encountered error, so
+ * we must use independent calls in order to guarantee
+ * we don't miss information in a sparsely populated back-end
+ * tree.
+ */
+ otherend_path = xenbus_get_otherend_path(sc->xb_dev);
+ node_path = xenbus_get_node(sc->xb_dev);
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-ring-pages", NULL, "%" PRIu32,
+ &sc->ring_pages);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-requests", NULL, "%" PRIu32,
+ &sc->max_requests);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-segments", NULL, "%" PRIu32,
+ &sc->max_request_segments);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-size", NULL, "%" PRIu32,
+ &sc->max_request_size);
+
+ if (sc->ring_pages > XBF_MAX_RING_PAGES) {
+ device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+ "%u limited to front-end limit of %zu.\n",
+ sc->ring_pages, XBF_MAX_RING_PAGES);
+ sc->ring_pages = XBF_MAX_RING_PAGES;
}
- err = xenbus_printf(xbt, xenbus_get_node(dev),
- "ring-ref","%u", sc->ring_ref);
- if (err) {
- message = "writing ring-ref";
- goto abort_transaction;
+ if (sc->max_requests > XBF_MAX_REQUESTS) {
+ device_printf(sc->xb_dev, "Back-end specified max_requests of "
+ "%u limited to front-end limit of %u.\n",
+ sc->max_requests, XBF_MAX_REQUESTS);
+ sc->max_requests = XBF_MAX_REQUESTS;
}
- err = xenbus_printf(xbt, xenbus_get_node(dev),
- "event-channel", "%u", irq_to_evtchn_port(sc->irq));
- if (err) {
- message = "writing event-channel";
- goto abort_transaction;
+
+ if (sc->max_request_segments > XBF_MAX_SEGMENTS_PER_REQUEST) {
+ device_printf(sc->xb_dev, "Back-end specificed "
+ "max_requests_segments of %u limited to "
+ "front-end limit of %u.\n",
+ sc->max_request_segments,
+ XBF_MAX_SEGMENTS_PER_REQUEST);
+ sc->max_request_segments = XBF_MAX_SEGMENTS_PER_REQUEST;
}
- err = xenbus_printf(xbt, xenbus_get_node(dev),
- "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
- if (err) {
- message = "writing protocol";
- goto abort_transaction;
+
+ if (sc->max_request_size > XBF_MAX_REQUEST_SIZE) {
+ device_printf(sc->xb_dev, "Back-end specificed "
+ "max_request_size of %u limited to front-end "
+ "limit of %u.\n", sc->max_request_size,
+ XBF_MAX_REQUEST_SIZE);
+ sc->max_request_size = XBF_MAX_REQUEST_SIZE;
+ }
+ sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
+
+ /* Allocate datastructures based on negotiated values. */
+ error = bus_dma_tag_create(NULL, /* parent */
+ 512, PAGE_SIZE, /* algnmnt, boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ sc->max_request_size,
+ sc->max_request_segments,
+ PAGE_SIZE, /* maxsegsize */
+ BUS_DMA_ALLOCNOW, /* flags */
+ busdma_lock_mutex, /* lockfunc */
+ &sc->xb_io_lock, /* lockarg */
+ &sc->xb_io_dmat);
+ if (error != 0) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "Cannot allocate parent DMA tag\n");
+ return;
}
- err = xenbus_transaction_end(xbt, 0);
- if (err) {
- if (err == EAGAIN)
- goto again;
- xenbus_dev_fatal(dev, err, "completing transaction");
- goto destroy_blkring;
+ /* Per-transaction data allocation. */
+ sc->shadow = malloc(sizeof(*sc->shadow) * sc->max_requests,
+ M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
+ if (sc->shadow == NULL) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "Cannot allocate request structures\n");
}
- xenbus_set_state(dev, XenbusStateInitialised);
-
- return 0;
- abort_transaction:
- xenbus_transaction_end(xbt, 1);
- if (message)
- xenbus_dev_fatal(dev, err, "%s", message);
- destroy_blkring:
- blkif_free(sc, 0);
- out:
- return err;
+ for (i = 0; i < sc->max_requests; i++) {
+ struct xb_command *cm;
+
+ cm = &sc->shadow[i];
+ cm->sg_refs = malloc(sizeof(grant_ref_t)
+ * sc->max_request_segments,
+ M_XENBLOCKFRONT, M_NOWAIT);
+ if (cm->sg_refs == NULL)
+ break;
+ cm->id = i;
+ cm->cm_sc = sc;
+ if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
+ break;
+ xb_free_command(cm);
+ }
+
+ if (setup_blkring(sc) != 0)
+ return;
+
+ error = xs_printf(XST_NIL, node_path,
+ "ring-pages","%u", sc->ring_pages);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/ring-pages",
+ node_path);
+ return;
+ }
+
+ error = xs_printf(XST_NIL, node_path,
+ "max-requests","%u", sc->max_requests);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/max-requests",
+ node_path);
+ return;
+ }
+
+ error = xs_printf(XST_NIL, node_path,
+ "max-request-segments","%u", sc->max_request_segments);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/max-request-segments",
+ node_path);
+ return;
+ }
+
+ error = xs_printf(XST_NIL, node_path,
+ "max-request-size","%u", sc->max_request_size);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/max-request-size",
+ node_path);
+ return;
+ }
+
+ error = xs_printf(XST_NIL, node_path, "event-channel",
+ "%u", irq_to_evtchn_port(sc->irq));
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/event-channel",
+ node_path);
+ return;
+ }
+
+ error = xs_printf(XST_NIL, node_path,
+ "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/protocol",
+ node_path);
+ return;
+ }
+
+ xenbus_set_state(sc->xb_dev, XenbusStateInitialised);
}
static int
setup_blkring(struct xb_softc *sc)
{
blkif_sring_t *sring;
+ uintptr_t sring_page_addr;
int error;
+ int i;
- sc->ring_ref = GRANT_INVALID_REF;
-
- sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
+ sring = malloc(sc->ring_pages * PAGE_SIZE, M_XENBLOCKFRONT,
+ M_NOWAIT|M_ZERO);
if (sring == NULL) {
xenbus_dev_fatal(sc->xb_dev, ENOMEM, "allocating shared ring");
- return ENOMEM;
+ return (ENOMEM);
}
SHARED_RING_INIT(sring);
- FRONT_RING_INIT(&sc->ring, sring, PAGE_SIZE);
-
- error = xenbus_grant_ring(sc->xb_dev,
- (vtomach(sc->ring.sring) >> PAGE_SHIFT), &sc->ring_ref);
+ FRONT_RING_INIT(&sc->ring, sring, sc->ring_pages * PAGE_SIZE);
+
+ for (i = 0, sring_page_addr = (uintptr_t)sring;
+ i < sc->ring_pages;
+ i++, sring_page_addr += PAGE_SIZE) {
+
+ error = xenbus_grant_ring(sc->xb_dev,
+ (vtomach(sring_page_addr) >> PAGE_SHIFT), &sc->ring_ref[i]);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "granting ring_ref(%d)", i);
+ return (error);
+ }
+ }
+ error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
+ "ring-ref","%u", sc->ring_ref[0]);
if (error) {
- free(sring, M_DEVBUF);
- sc->ring.sring = NULL;
- goto fail;
+ xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref",
+ xenbus_get_node(sc->xb_dev));
+ return (error);
}
-
- error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(sc->xb_dev),
+ for (i = 1; i < sc->ring_pages; i++) {
+ char ring_ref_name[] = "ring_refXX";
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+ error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
+ ring_ref_name, "%u", sc->ring_ref[i]);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s",
+ xenbus_get_node(sc->xb_dev),
+ ring_ref_name);
+ return (error);
+ }
+ }
+
+ error = bind_listening_port_to_irqhandler(
+ xenbus_get_otherend_id(sc->xb_dev),
"xbd", (driver_intr_t *)blkif_int, sc,
INTR_TYPE_BIO | INTR_MPSAFE, &sc->irq);
if (error) {
xenbus_dev_fatal(sc->xb_dev, error,
"bind_evtchn_to_irqhandler failed");
- goto fail;
+ return (error);
}
return (0);
- fail:
- blkif_free(sc, 0);
- return (error);
}
-
/**
* Callback received when the backend's state changes.
*/
@@ -640,15 +748,19 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
switch (backend_state) {
case XenbusStateUnknown:
case XenbusStateInitialising:
- case XenbusStateInitWait:
- case XenbusStateInitialised:
- case XenbusStateClosed:
case XenbusStateReconfigured:
case XenbusStateReconfiguring:
+ case XenbusStateClosed:
break;
+ case XenbusStateInitWait:
+ blkfront_initialize(sc);
+ break;
+
+ case XenbusStateInitialised:
case XenbusStateConnected:
- connect(sc);
+ blkfront_initialize(sc);
+ blkfront_connect(sc);
break;
case XenbusStateClosing:
@@ -657,20 +769,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
"Device in use; refusing to close");
else
blkfront_closing(dev);
-#ifdef notyet
- bd = bdget(sc->dev);
- if (bd == NULL)
- xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-
- down(&bd->bd_sem);
- if (sc->users > 0)
- xenbus_dev_error(dev, -EBUSY,
- "Device in use; refusing to close");
- else
- blkfront_closing(dev);
- up(&bd->bd_sem);
- bdput(bd);
-#endif
+ break;
}
return (0);
@@ -681,7 +780,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
** the details about the physical device - #sectors, size, etc).
*/
static void
-connect(struct xb_softc *sc)
+blkfront_connect(struct xb_softc *sc)
{
device_t dev = sc->xb_dev;
unsigned long sectors, sector_size;
@@ -694,20 +793,20 @@ connect(struct xb_softc *sc)
DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
- err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
- "sectors", "%lu", &sectors,
- "info", "%u", &binfo,
- "sector-size", "%lu", &sector_size,
- NULL);
+ err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+ "sectors", "%lu", &sectors,
+ "info", "%u", &binfo,
+ "sector-size", "%lu", &sector_size,
+ NULL);
if (err) {
xenbus_dev_fatal(dev, err,
"reading backend fields at %s",
xenbus_get_otherend_path(dev));
return;
}
- err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
- "feature-barrier", "%lu", &feature_barrier,
- NULL);
+ err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+ "feature-barrier", "%lu", &feature_barrier,
+ NULL);
if (!err || feature_barrier)
sc->xb_flags |= XB_BARRIER;
@@ -741,15 +840,16 @@ blkfront_closing(device_t dev)
{
struct xb_softc *sc = device_get_softc(dev);
+ xenbus_set_state(dev, XenbusStateClosing);
+
DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
- if (sc->mi) {
- DPRINTK("Calling xlvbd_del\n");
- xlvbd_del(sc);
- sc->mi = NULL;
+ if (sc->xb_disk != NULL) {
+ disk_destroy(sc->xb_disk);
+ sc->xb_disk = NULL;
}
- xenbus_set_state(dev, XenbusStateClosed);
+ xenbus_set_state(dev, XenbusStateClosed);
}
@@ -778,11 +878,16 @@ flush_requests(struct xb_softc *sc)
notify_remote_via_irq(sc->irq);
}
-static void blkif_restart_queue_callback(void *arg)
+static void
+blkif_restart_queue_callback(void *arg)
{
struct xb_softc *sc = arg;
+ mtx_lock(&sc->xb_io_lock);
+
xb_startio(sc);
+
+ mtx_unlock(&sc->xb_io_lock);
}
static int
@@ -874,20 +979,17 @@ xb_bio_command(struct xb_softc *sc)
return (NULL);
}
- if (gnttab_alloc_grant_references(BLKIF_MAX_SEGMENTS_PER_REQUEST,
- &cm->gref_head) < 0) {
+ if (gnttab_alloc_grant_references(sc->max_request_segments,
+ &cm->gref_head) != 0) {
gnttab_request_free_callback(&sc->callback,
blkif_restart_queue_callback, sc,
- BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ sc->max_request_segments);
xb_requeue_bio(sc, bp);
xb_enqueue_free(cm);
sc->xb_flags |= XB_FROZEN;
return (NULL);
}
- /* XXX Can we grab refs before doing the load so that the ref can
- * be filled out here?
- */
cm->bp = bp;
cm->data = bp->bio_data;
cm->datalen = bp->bio_bcount;
@@ -921,13 +1023,19 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
struct xb_softc *sc;
struct xb_command *cm;
blkif_request_t *ring_req;
+ struct blkif_request_segment *sg;
+ struct blkif_request_segment *last_block_sg;
+ grant_ref_t *sg_ref;
vm_paddr_t buffer_ma;
uint64_t fsect, lsect;
- int ref, i, op;
+ int ref;
+ int op;
+ int block_segs;
cm = arg;
sc = cm->cm_sc;
+//printf("%s: Start\n", __func__);
if (error) {
printf("error %d in blkif_queue_cb\n", error);
cm->bp->bio_error = EIO;
@@ -938,43 +1046,62 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
- if (ring_req == NULL) {
- /* XXX Is this possible? */
- printf("ring_req NULL, requeuing\n");
- xb_enqueue_ready(cm);
- return;
- }
- ring_req->id = cm->req.id;
+ sc->ring.req_prod_pvt++;
+ ring_req->id = cm->id;
ring_req->operation = cm->operation;
ring_req->sector_number = cm->sector_number;
ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
ring_req->nr_segments = nsegs;
+ cm->nseg = nsegs;
+
+ block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
+ sg = ring_req->seg;
+ last_block_sg = sg + block_segs;
+ sg_ref = cm->sg_refs;
+
+ while (1) {
- for (i = 0; i < nsegs; i++) {
- buffer_ma = segs[i].ds_addr;
- fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
- lsect = fsect + (segs[i].ds_len >> XBD_SECTOR_SHFT) - 1;
+ while (sg < last_block_sg) {
+ buffer_ma = segs->ds_addr;
+ fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
+ lsect = fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1;
- KASSERT(lsect <= 7,
- ("XEN disk driver data cannot cross a page boundary"));
+ KASSERT(lsect <= 7, ("XEN disk driver data cannot "
+ "cross a page boundary"));
- /* install a grant reference. */
- ref = gnttab_claim_grant_reference(&cm->gref_head);
- KASSERT( ref >= 0, ("grant_reference failed") );
+ /* install a grant reference. */
+ ref = gnttab_claim_grant_reference(&cm->gref_head);
+
+ /*
+ * GNTTAB_LIST_END == 0xffffffff, but it is private
+ * to gnttab.c.
+ */
+ KASSERT(ref != ~0, ("grant_reference failed"));
- gnttab_grant_foreign_access_ref(
- ref,
- xenbus_get_otherend_id(sc->xb_dev),
- buffer_ma >> PAGE_SHIFT,
- ring_req->operation & 1 ); /* ??? */
+ gnttab_grant_foreign_access_ref(
+ ref,
+ xenbus_get_otherend_id(sc->xb_dev),
+ buffer_ma >> PAGE_SHIFT,
+ ring_req->operation == BLKIF_OP_WRITE);
- ring_req->seg[i] =
- (struct blkif_request_segment) {
+ *sg_ref = ref;
+ *sg = (struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
.last_sect = lsect };
- }
+ sg++;
+ sg_ref++;
+ segs++;
+ nsegs--;
+ }
+ block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
+ if (block_segs == 0)
+ break;
+ sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+ sc->ring.req_prod_pvt++;
+ last_block_sg = sg + block_segs;
+ }
if (cm->operation == BLKIF_OP_READ)
op = BUS_DMASYNC_PREREAD;
@@ -984,15 +1111,10 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
op = 0;
bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);
- sc->ring.req_prod_pvt++;
-
- /* Keep a private copy so we can reissue requests when recovering. */
- cm->req = *ring_req;
+ gnttab_free_grant_references(cm->gref_head);
xb_enqueue_busy(cm);
- gnttab_free_grant_references(cm->gref_head);
-
/*
* This flag means that we're probably executing in the busdma swi
* instead of in the startio context, so an explicit flush is needed.
@@ -1000,6 +1122,7 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
if (cm->cm_flags & XB_CMD_FROZEN)
flush_requests(sc);
+//printf("%s: Done\n", __func__);
return;
}
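
When a request carries more segments than fit in its header block, blkif_queue_cb() spills the remainder into follow-on segment blocks, each consuming an additional ring slot; this is why req_prod_pvt is advanced inside the loop. A worked model of the slot accounting (the per-block capacities below are placeholders for BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK and BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK, whose real values come from xen/blkif.h):

    #include <stdio.h>

    /* Illustrative capacities, not the protocol constants. */
    #define SEGS_PER_HEADER_BLOCK  11
    #define SEGS_PER_SEGMENT_BLOCK 14

    static int
    blocks_for(int nsegs)
    {
            int blocks = 1;         /* the request header itself */

            nsegs -= SEGS_PER_HEADER_BLOCK;
            while (nsegs > 0) {
                    blocks++;
                    nsegs -= SEGS_PER_SEGMENT_BLOCK;
            }
            return (blocks);
    }

    int
    main(void)
    {
            printf("11 segs -> %d ring slot(s)\n", blocks_for(11)); /* 1 */
            printf("25 segs -> %d ring slot(s)\n", blocks_for(25)); /* 2 */
            printf("40 segs -> %d ring slot(s)\n", blocks_for(40)); /* 4 */
            return (0);
    }
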
@@ -1018,7 +1141,7 @@ xb_startio(struct xb_softc *sc)
mtx_assert(&sc->xb_io_lock, MA_OWNED);
- while (!RING_FULL(&sc->ring)) {
+ while (RING_FREE_REQUESTS(&sc->ring) >= sc->max_request_blocks) {
if (sc->xb_flags & XB_FROZEN)
break;
@@ -1061,12 +1184,12 @@ blkif_int(void *xsc)
rp = sc->ring.sring->rsp_prod;
rmb(); /* Ensure we see queued responses up to 'rp'. */
- for (i = sc->ring.rsp_cons; i != rp; i++) {
+ for (i = sc->ring.rsp_cons; i != rp;) {
bret = RING_GET_RESPONSE(&sc->ring, i);
cm = &sc->shadow[bret->id];
xb_remove_busy(cm);
- blkif_completion(cm);
+ i += blkif_completion(cm);
if (cm->operation == BLKIF_OP_READ)
op = BUS_DMASYNC_POSTREAD;
@@ -1116,35 +1239,61 @@ blkif_int(void *xsc)
static void
blkif_free(struct xb_softc *sc, int suspend)
{
+ uint8_t *sring_page_ptr;
+ int i;
-/* Prevent new requests being issued until we fix things up. */
+ /* Prevent new requests being issued until we fix things up. */
mtx_lock(&sc->xb_io_lock);
sc->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
mtx_unlock(&sc->xb_io_lock);
/* Free resources associated with old device channel. */
- if (sc->ring_ref != GRANT_INVALID_REF) {
- gnttab_end_foreign_access(sc->ring_ref,
- sc->ring.sring);
- sc->ring_ref = GRANT_INVALID_REF;
+ if (sc->ring.sring != NULL) {
+ sring_page_ptr = (uint8_t *)sc->ring.sring;
+ for (i = 0; i < sc->ring_pages; i++) {
+ if (sc->ring_ref[i] != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access_ref(sc->ring_ref[i]);
+ sc->ring_ref[i] = GRANT_INVALID_REF;
+ }
+ sring_page_ptr += PAGE_SIZE;
+ }
+ free(sc->ring.sring, M_XENBLOCKFRONT);
sc->ring.sring = NULL;
}
- if (sc->irq)
- unbind_from_irqhandler(sc->irq);
- sc->irq = 0;
+ if (sc->shadow) {
+
+ for (i = 0; i < sc->max_requests; i++) {
+ struct xb_command *cm;
+
+ cm = &sc->shadow[i];
+ if (cm->sg_refs != NULL) {
+ free(cm->sg_refs, M_XENBLOCKFRONT);
+ cm->sg_refs = NULL;
+ }
+
+ bus_dmamap_destroy(sc->xb_io_dmat, cm->map);
+ }
+ free(sc->shadow, M_XENBLOCKFRONT);
+ sc->shadow = NULL;
+ }
+
+ if (sc->irq) {
+ unbind_from_irqhandler(sc->irq);
+ sc->irq = 0;
+ }
}
-static void
+static int
blkif_completion(struct xb_command *s)
{
- int i;
-
- for (i = 0; i < s->req.nr_segments; i++)
- gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
+//printf("%s: Req %p(%d)\n", __func__, s, s->nseg);
+ gnttab_end_foreign_access_references(s->nseg, s->sg_refs);
+ return (BLKIF_SEGS_TO_BLOCKS(s->nseg));
}
+#if 0
static void
blkif_recover(struct xb_softc *sc)
{
@@ -1157,6 +1306,7 @@ blkif_recover(struct xb_softc *sc)
* has been removed until further notice.
*/
}
+#endif
/* ** Driver registration ** */
static device_method_t blkfront_methods[] = {
@@ -1169,7 +1319,7 @@ static device_method_t blkfront_methods[] = {
DEVMETHOD(device_resume, blkfront_resume),
/* Xenbus interface */
- DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),
+ DEVMETHOD(xenbus_otherend_changed, blkfront_backend_changed),
{ 0, 0 }
};
@@ -1181,4 +1331,4 @@ static driver_t blkfront_driver = {
};
devclass_t blkfront_devclass;
-DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0);
+DRIVER_MODULE(xbd, xenbusb_front, blkfront_driver, blkfront_devclass, 0, 0);
diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h
index 32bfc96..6235e51 100644
--- a/sys/dev/xen/blkfront/block.h
+++ b/sys/dev/xen/blkfront/block.h
@@ -32,7 +32,43 @@
#ifndef __XEN_DRIVERS_BLOCK_H__
#define __XEN_DRIVERS_BLOCK_H__
-#include <xen/interface/io/blkif.h>
+#include <xen/blkif.h>
+
+/**
+ * The maximum number of outstanding requests blocks (request headers plus
+ * additional segment blocks) we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBF_MAX_REQUESTS 256
+
+/**
+ * The maximum mapped region size per request we will allow in a negotiated
+ * block-front/back communication channel.
+ *
+ * \note We reserve a segment from the maximum supported by the transport to
+ * guarantee we can handle an unaligned transfer without the need to
+ * use a bounce buffer.
+ */
+#define XBF_MAX_REQUEST_SIZE \
+ MIN(MAXPHYS, (BLKIF_MAX_SEGMENTS_PER_REQUEST - 1) * PAGE_SIZE)
+
+/**
+ * The maximum number of segments (within a request header and accompanying
+ * segment blocks) per request we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBF_MAX_SEGMENTS_PER_REQUEST \
+ (MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
+ (XBF_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))
+
+/**
+ * The maximum number of shared memory ring pages we will allow in a
+ * negotiated block-front/back communication channel. Allow enough
+ * ring space for all requests to be XBF_MAX_REQUEST_SIZE'd.
+ */
+#define XBF_MAX_RING_PAGES \
+ BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBF_MAX_SEGMENTS_PER_REQUEST) \
+ * XBF_MAX_REQUESTS)
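+
+/*
+ * Worked example (editor's sketch; the numbers are assumptions, not part
+ * of the interface): with the classic transport limit
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST == 11, PAGE_SIZE == 4096 and
+ * MAXPHYS == 128 * 1024, the macros above evaluate to
+ *
+ *	XBF_MAX_REQUEST_SIZE         = MIN(131072, 10 * 4096)    = 40960
+ *	XBF_MAX_SEGMENTS_PER_REQUEST = MIN(11, 40960 / 4096 + 1) = 11
+ *
+ * i.e. ten data pages per request plus the one segment reserved for
+ * unaligned transfers.
+ */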
struct xlbd_type_info
{
@@ -62,19 +98,19 @@ struct xb_command {
#define XB_ON_XBQ_COMPLETE (1<<5)
#define XB_ON_XBQ_MASK ((1<<2)|(1<<3)|(1<<4)|(1<<5))
bus_dmamap_t map;
- blkif_request_t req;
+ uint64_t id;
+ grant_ref_t *sg_refs;
struct bio *bp;
grant_ref_t gref_head;
void *data;
size_t datalen;
+ u_int nseg;
int operation;
blkif_sector_t sector_number;
int status;
void (* cm_complete)(struct xb_command *);
};
-#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
-
#define XBQ_FREE 0
#define XBQ_BIO 1
#define XBQ_READY 2
@@ -108,10 +144,14 @@ struct xb_softc {
int vdevice;
blkif_vdev_t handle;
int connected;
- int ring_ref;
+ u_int ring_pages;
+ uint32_t max_requests;
+ uint32_t max_request_segments;
+ uint32_t max_request_blocks;
+ uint32_t max_request_size;
+ grant_ref_t ring_ref[XBF_MAX_RING_PAGES];
blkif_front_ring_t ring;
unsigned int irq;
- struct xlbd_major_info *mi;
struct gnttab_free_callback callback;
TAILQ_HEAD(,xb_command) cm_free;
TAILQ_HEAD(,xb_command) cm_ready;
@@ -126,11 +166,12 @@ struct xb_softc {
*/
int users;
struct mtx xb_io_lock;
- struct xb_command shadow[BLK_RING_SIZE];
+
+ struct xb_command *shadow;
};
-int xlvbd_add(struct xb_softc *, blkif_sector_t capacity, int device,
- uint16_t vdisk_info, uint16_t sector_size);
+int xlvbd_add(struct xb_softc *, blkif_sector_t sectors, int device,
+ uint16_t vdisk_info, unsigned long sector_size);
void xlvbd_del(struct xb_softc *);
#define XBQ_ADD(sc, qname) \
@@ -188,7 +229,8 @@ void xlvbd_del(struct xb_softc *);
struct xb_command *cm; \
\
if ((cm = TAILQ_FIRST(&sc->cm_ ## name)) != NULL) { \
- if ((cm->cm_flags & XB_ON_ ## index) == 0) { \
+ if ((cm->cm_flags & XB_ON_XBQ_MASK) != \
+ XB_ON_ ## index) { \
printf("command %p not in queue, " \
"flags = %#x, bit = %#x\n", cm, \
cm->cm_flags, XB_ON_ ## index); \
@@ -203,7 +245,7 @@ void xlvbd_del(struct xb_softc *);
static __inline void \
xb_remove_ ## name (struct xb_command *cm) \
{ \
- if ((cm->cm_flags & XB_ON_ ## index) == 0) { \
+ if ((cm->cm_flags & XB_ON_XBQ_MASK) != XB_ON_ ## index) {\
printf("command %p not in queue, flags = %#x, " \
"bit = %#x\n", cm, cm->cm_flags, \
XB_ON_ ## index); \
diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c
new file mode 100644
index 0000000..c03d536
--- /dev/null
+++ b/sys/dev/xen/control/control.c
@@ -0,0 +1,493 @@
+/*-
+ * Copyright (c) 2010 Justin T. Gibbs, Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * substantially similar to the "NO WARRANTY" disclaimer below
+ * ("Disclaimer") and any redistribution must be conditioned upon
+ * including a substantially similar Disclaimer requirement for further
+ * binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ */
+
+/*-
+ * PV suspend/resume support:
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * HVM suspend/resume support:
+ *
+ * Copyright (c) 2008 Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/**
+ * \file control.c
+ *
+ * \brief Device driver to respond to control domain events that impact
+ * this VM.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/kdb.h>
+#include <sys/module.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/types.h>
+#include <sys/vnode.h>
+
+#ifndef XENHVM
+#include <sys/sched.h>
+#include <sys/smp.h>
+#endif
+
+
+#include <geom/geom.h>
+
+#include <machine/_inttypes.h>
+#include <machine/xen/xen-os.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+#include <xen/blkif.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/event_channel.h>
+#include <xen/interface/grant_table.h>
+
+#include <xen/xenbus/xenbusvar.h>
+
+#define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*(x)))
+
+/*--------------------------- Forward Declarations --------------------------*/
+/** Function signature for shutdown event handlers. */
+typedef void (xctrl_shutdown_handler_t)(void);
+
+static xctrl_shutdown_handler_t xctrl_poweroff;
+static xctrl_shutdown_handler_t xctrl_reboot;
+static xctrl_shutdown_handler_t xctrl_suspend;
+static xctrl_shutdown_handler_t xctrl_crash;
+static xctrl_shutdown_handler_t xctrl_halt;
+
+/*-------------------------- Private Data Structures -------------------------*/
+/** Element type for lookup table of event name to handler. */
+struct xctrl_shutdown_reason {
+ const char *name;
+ xctrl_shutdown_handler_t *handler;
+};
+
+/** Lookup table for shutdown event name to handler. */
+static struct xctrl_shutdown_reason xctrl_shutdown_reasons[] = {
+ { "poweroff", xctrl_poweroff },
+ { "reboot", xctrl_reboot },
+ { "suspend", xctrl_suspend },
+ { "crash", xctrl_crash },
+ { "halt", xctrl_halt },
+};
+
+struct xctrl_softc {
+
+ /** Must be first */
+ struct xs_watch xctrl_watch;
+};
+
+/*------------------------------ Event Handlers ------------------------------*/
+static void
+xctrl_poweroff(void)
+{
+ shutdown_nice(RB_POWEROFF|RB_HALT);
+}
+
+static void
+xctrl_reboot(void)
+{
+ shutdown_nice(0);
+}
+
+#ifndef XENHVM
+extern void xencons_suspend(void);
+extern void xencons_resume(void);
+
+/* Full PV mode suspension. */
+static void
+xctrl_suspend(void)
+{
+ int i, j, k, fpp;
+ unsigned long max_pfn, start_info_mfn;
+
+#ifdef SMP
+ cpumask_t map;
+ /*
+ * Bind us to CPU 0 and stop any other VCPUs.
+ */
+ thread_lock(curthread);
+ sched_bind(curthread, 0);
+ thread_unlock(curthread);
+ KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0"));
+
+ map = PCPU_GET(other_cpus) & ~stopped_cpus;
+ if (map)
+ stop_cpus(map);
+#endif
+
+ if (DEVICE_SUSPEND(root_bus) != 0) {
+ printf("xen_suspend: device_suspend failed\n");
+#ifdef SMP
+ if (map)
+ restart_cpus(map);
+#endif
+ return;
+ }
+
+ local_irq_disable();
+
+ xencons_suspend();
+ gnttab_suspend();
+
+ max_pfn = HYPERVISOR_shared_info->arch.max_pfn;
+
+ void *shared_info = HYPERVISOR_shared_info;
+ HYPERVISOR_shared_info = NULL;
+ pmap_kremove((vm_offset_t) shared_info);
+ PT_UPDATES_FLUSH();
+
+ xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn);
+
+ /*
+ * We'll stop somewhere inside this hypercall. When it returns,
+ * we'll start resuming after the restore.
+ */
+ start_info_mfn = VTOMFN(xen_start_info);
+ pmap_suspend();
+ HYPERVISOR_suspend(start_info_mfn);
+ pmap_resume();
+
+ pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = shared_info;
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ VTOMFN(xen_pfn_to_mfn_frame_list_list);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
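+ /* Rebuild the two-level p2m frame list; each list page holds fpp entries. */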
+ for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+ if ((j % fpp) == 0) {
+ k++;
+ xen_pfn_to_mfn_frame_list_list[k] =
+ VTOMFN(xen_pfn_to_mfn_frame_list[k]);
+ j = 0;
+ }
+ xen_pfn_to_mfn_frame_list[k][j] =
+ VTOMFN(&xen_phys_machine[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+
+ gnttab_resume();
+ irq_resume();
+ local_irq_enable();
+ xencons_resume();
+
+#ifdef CONFIG_SMP
+ for_each_cpu(i)
+ vcpu_prepare(i);
+
+#endif
+ /*
+ * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
+ * the VCPU hotplug callback can race with our vcpu_prepare.
+ */
+ DEVICE_RESUME(root_bus);
+
+#ifdef SMP
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ if (map)
+ restart_cpus(map);
+#endif
+}
+
+static void
+xen_pv_shutdown_final(void *arg, int howto)
+{
+ /*
+ * Inform the hypervisor that shutdown is complete.
+ * This is not necessary in HVM domains since Xen
+ * emulates ACPI in that mode and FreeBSD's ACPI
+ * support will request this transition.
+ */
+ if (howto & (RB_HALT | RB_POWEROFF))
+ HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+ else
+ HYPERVISOR_shutdown(SHUTDOWN_reboot);
+}
+
+#else
+extern void xenpci_resume(void);
+
+/* HVM mode suspension. */
+static void
+xctrl_suspend(void)
+{
+ int suspend_cancelled;
+
+ if (DEVICE_SUSPEND(root_bus)) {
+ printf("xen_suspend: device_suspend failed\n");
+ return;
+ }
+
+ /*
+ * Make sure we don't change CPUs or switch to some other
+ * thread for the duration.
+ */
+ critical_enter();
+
+ /*
+ * Prevent any races with evtchn_interrupt() handler.
+ */
+ irq_suspend();
+ disable_intr();
+
+ suspend_cancelled = HYPERVISOR_suspend(0);
+ if (!suspend_cancelled)
+ xenpci_resume();
+
+ /*
+ * Re-enable interrupts and put the scheduler back to normal.
+ */
+ enable_intr();
+ critical_exit();
+
+ /*
+ * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or
+ * similar.
+ */
+ if (!suspend_cancelled)
+ DEVICE_RESUME(root_bus);
+}
+#endif
+
+static void
+xctrl_crash(void)
+{
+ panic("Xen directed crash");
+}
+
+static void
+xctrl_halt(void)
+{
+ shutdown_nice(RB_HALT);
+}
+
+/*------------------------------ Event Reception -----------------------------*/
+static void
+xctrl_on_watch_event(struct xs_watch *watch, const char **vec, unsigned int len)
+{
+ struct xctrl_shutdown_reason *reason;
+ struct xctrl_shutdown_reason *last_reason;
+ char *result;
+ int error;
+ int result_len;
+
+ error = xs_read(XST_NIL, "control", "shutdown",
+ &result_len, (void **)&result);
+ if (error != 0)
+ return;
+
+ reason = xctrl_shutdown_reasons;
+ last_reason = reason + NUM_ELEMENTS(xctrl_shutdown_reasons);
+ while (reason < last_reason) {
+ if (!strcmp(result, reason->name)) {
+ reason->handler();
+ break;
+ }
+ reason++;
+ }
+
+ free(result, M_XENSTORE);
+}
+
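+/*
+ * Example trigger (editor's assumption about the toolstack; not part of
+ * this change): dom0 initiates a reboot by writing the reason string to
+ * the watched node, e.g.
+ *
+ *	xenstore-write /local/domain/<domid>/control/shutdown reboot
+ *
+ * which fires xctrl_on_watch_event() and dispatches to xctrl_reboot().
+ */
+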
+/*------------------ Private Device Attachment Functions --------------------*/
+/**
+ * \brief Identify instances of this device type in the system.
+ *
+ * \param driver The driver performing this identify action.
+ * \param parent The NewBus parent device for any devices this method adds.
+ */
+static void
+xctrl_identify(driver_t *driver __unused, device_t parent)
+{
+ /*
+ * A single device instance for our driver is always present
+ * in a system operating under Xen.
+ */
+ BUS_ADD_CHILD(parent, 0, driver->name, 0);
+}
+
+/**
+ * \brief Probe for the existence of the Xen Control device.
+ *
+ * \param dev NewBus device_t for this Xen control instance.
+ *
+ * \return Always returns 0 indicating success.
+ */
+static int
+xctrl_probe(device_t dev)
+{
+ device_set_desc(dev, "Xen Control Device");
+
+ return (0);
+}
+
+/**
+ * \brief Attach the Xen control device.
+ *
+ * \param dev NewBus device_t for this Xen control instance.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xctrl_attach(device_t dev)
+{
+ struct xctrl_softc *xctrl;
+
+ xctrl = device_get_softc(dev);
+
+ /* Activate watch */
+ xctrl->xctrl_watch.node = "control/shutdown";
+ xctrl->xctrl_watch.callback = xctrl_on_watch_event;
+ xs_register_watch(&xctrl->xctrl_watch);
+
+#ifndef XENHVM
+ EVENTHANDLER_REGISTER(shutdown_final, xen_pv_shutdown_final, NULL,
+ SHUTDOWN_PRI_LAST);
+#endif
+
+ return (0);
+}
+
+/**
+ * \brief Detach the Xen control device.
+ *
+ * \param dev NewBus device_t for this Xen control device instance.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xctrl_detach(device_t dev)
+{
+ struct xctrl_softc *xctrl;
+
+ xctrl = device_get_softc(dev);
+
+ /* Release watch */
+ xs_unregister_watch(&xctrl->xctrl_watch);
+
+ return (0);
+}
+
+/*-------------------- Private Device Attachment Data -----------------------*/
+static device_method_t xctrl_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, xctrl_identify),
+ DEVMETHOD(device_probe, xctrl_probe),
+ DEVMETHOD(device_attach, xctrl_attach),
+ DEVMETHOD(device_detach, xctrl_detach),
+
+ { 0, 0 }
+};
+
+DEFINE_CLASS_0(xctrl, xctrl_driver, xctrl_methods, sizeof(struct xctrl_softc));
+devclass_t xctrl_devclass;
+
+DRIVER_MODULE(xctrl, xenstore, xctrl_driver, xctrl_devclass, 0, 0);
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index a6fd9ea..423df97 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -91,8 +91,6 @@ __FBSDID("$FreeBSD$");
#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO)
-#define GRANT_INVALID_REF 0
-
#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
@@ -373,7 +371,8 @@ xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri)
{
int i = xennet_rxidx(ri);
grant_ref_t ref = np->grant_rx_ref[i];
- np->grant_rx_ref[i] = GRANT_INVALID_REF;
+ KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n"));
+ np->grant_rx_ref[i] = GRANT_REF_INVALID;
return ref;
}
@@ -404,7 +403,7 @@ xen_net_read_mac(device_t dev, uint8_t mac[])
int error, i;
char *s, *e, *macstr;
- error = xenbus_read(XBT_NIL, xenbus_get_node(dev), "mac", NULL,
+ error = xs_read(XST_NIL, xenbus_get_node(dev), "mac", NULL,
(void **) &macstr);
if (error)
return (error);
@@ -413,12 +412,12 @@ xen_net_read_mac(device_t dev, uint8_t mac[])
for (i = 0; i < ETHER_ADDR_LEN; i++) {
mac[i] = strtoul(s, &e, 16);
if (s == e || (e[0] != ':' && e[0] != 0)) {
- free(macstr, M_DEVBUF);
+ free(macstr, M_XENBUS);
return (ENOENT);
}
s = &e[1];
}
- free(macstr, M_DEVBUF);
+ free(macstr, M_XENBUS);
return (0);
}
@@ -483,7 +482,7 @@ static int
talk_to_backend(device_t dev, struct netfront_info *info)
{
const char *message;
- struct xenbus_transaction xbt;
+ struct xs_transaction xst;
const char *node = xenbus_get_node(dev);
int err;
@@ -499,54 +498,54 @@ talk_to_backend(device_t dev, struct netfront_info *info)
goto out;
again:
- err = xenbus_transaction_start(&xbt);
+ err = xs_transaction_start(&xst);
if (err) {
xenbus_dev_fatal(dev, err, "starting transaction");
goto destroy_ring;
}
- err = xenbus_printf(xbt, node, "tx-ring-ref","%u",
+ err = xs_printf(xst, node, "tx-ring-ref","%u",
info->tx_ring_ref);
if (err) {
message = "writing tx ring-ref";
goto abort_transaction;
}
- err = xenbus_printf(xbt, node, "rx-ring-ref","%u",
+ err = xs_printf(xst, node, "rx-ring-ref","%u",
info->rx_ring_ref);
if (err) {
message = "writing rx ring-ref";
goto abort_transaction;
}
- err = xenbus_printf(xbt, node,
+ err = xs_printf(xst, node,
"event-channel", "%u", irq_to_evtchn_port(info->irq));
if (err) {
message = "writing event-channel";
goto abort_transaction;
}
- err = xenbus_printf(xbt, node, "request-rx-copy", "%u",
+ err = xs_printf(xst, node, "request-rx-copy", "%u",
info->copying_receiver);
if (err) {
message = "writing request-rx-copy";
goto abort_transaction;
}
- err = xenbus_printf(xbt, node, "feature-rx-notify", "%d", 1);
+ err = xs_printf(xst, node, "feature-rx-notify", "%d", 1);
if (err) {
message = "writing feature-rx-notify";
goto abort_transaction;
}
- err = xenbus_printf(xbt, node, "feature-sg", "%d", 1);
+ err = xs_printf(xst, node, "feature-sg", "%d", 1);
if (err) {
message = "writing feature-sg";
goto abort_transaction;
}
#if __FreeBSD_version >= 700000
- err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1);
+ err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1);
if (err) {
message = "writing feature-gso-tcpv4";
goto abort_transaction;
}
#endif
- err = xenbus_transaction_end(xbt, 0);
+ err = xs_transaction_end(xst, 0);
if (err) {
if (err == EAGAIN)
goto again;
@@ -557,7 +556,7 @@ talk_to_backend(device_t dev, struct netfront_info *info)
return 0;
abort_transaction:
- xenbus_transaction_end(xbt, 1);
+ xs_transaction_end(xst, 1);
xenbus_dev_fatal(dev, err, "%s", message);
destroy_ring:
netif_free(info);
@@ -576,8 +575,8 @@ setup_device(device_t dev, struct netfront_info *info)
ifp = info->xn_ifp;
- info->tx_ring_ref = GRANT_INVALID_REF;
- info->rx_ring_ref = GRANT_INVALID_REF;
+ info->tx_ring_ref = GRANT_REF_INVALID;
+ info->rx_ring_ref = GRANT_REF_INVALID;
info->rx.sring = NULL;
info->tx.sring = NULL;
info->irq = 0;
@@ -750,7 +749,7 @@ netif_release_tx_bufs(struct netfront_info *np)
GNTMAP_readonly);
gnttab_release_grant_reference(&np->gref_tx_head,
np->grant_tx_ref[i]);
- np->grant_tx_ref[i] = GRANT_INVALID_REF;
+ np->grant_tx_ref[i] = GRANT_REF_INVALID;
add_id_to_freelist(np->tx_mbufs, i);
np->xn_cdata.xn_tx_chain_cnt--;
if (np->xn_cdata.xn_tx_chain_cnt < 0) {
@@ -854,7 +853,8 @@ refill:
sc->rx_mbufs[id] = m_new;
ref = gnttab_claim_grant_reference(&sc->gref_rx_head);
- KASSERT((short)ref >= 0, ("negative ref"));
+ KASSERT(ref != GNTTAB_LIST_END,
+ ("reserved grant references exhuasted"));
sc->grant_rx_ref[id] = ref;
vaddr = mtod(m_new, vm_offset_t);
@@ -1135,7 +1135,7 @@ xn_txeof(struct netfront_info *np)
np->grant_tx_ref[id]);
gnttab_release_grant_reference(
&np->gref_tx_head, np->grant_tx_ref[id]);
- np->grant_tx_ref[id] = GRANT_INVALID_REF;
+ np->grant_tx_ref[id] = GRANT_REF_INVALID;
np->tx_mbufs[id] = NULL;
add_id_to_freelist(np->tx_mbufs, id);
@@ -1318,12 +1318,13 @@ xennet_get_responses(struct netfront_info *np,
* the backend driver. In future this should flag the bad
* situation to the system controller to reboot the backend.
*/
- if (ref == GRANT_INVALID_REF) {
+ if (ref == GRANT_REF_INVALID) {
#if 0
if (net_ratelimit())
WPRINTK("Bad rx response id %d.\n", rx->id);
#endif
+ printf("%s: Bad rx response id %d.\n", __func__,rx->id);
err = EINVAL;
goto next;
}
@@ -1384,7 +1385,7 @@ next_skip_queue:
err = ENOENT;
printf("%s: cons %u frags %u rp %u, not enough frags\n",
__func__, *cons, frags, rp);
- break;
+ break;
}
/*
* Note that m can be NULL, if rx->status < 0 or if
@@ -1526,6 +1527,11 @@ xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head)
* tell the TCP stack to generate a shorter chain of packets.
*/
if (nfrags > MAX_TX_REQ_FRAGS) {
+#ifdef DEBUG
+ printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback "
+ "won't be able to handle it, dropping\n",
+ __func__, nfrags, MAX_TX_REQ_FRAGS);
+#endif
m_freem(m_head);
return (EMSGSIZE);
}
@@ -1881,11 +1887,11 @@ network_connect(struct netfront_info *np)
netif_rx_request_t *req;
u_int feature_rx_copy, feature_rx_flip;
- error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev),
+ error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
"feature-rx-copy", NULL, "%u", &feature_rx_copy);
if (error)
feature_rx_copy = 0;
- error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev),
+ error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
"feature-rx-flip", NULL, "%u", &feature_rx_flip);
if (error)
feature_rx_flip = 1;
@@ -1999,14 +2005,14 @@ create_netdev(device_t dev)
/* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
for (i = 0; i <= NET_TX_RING_SIZE; i++) {
np->tx_mbufs[i] = (void *) ((u_long) i+1);
- np->grant_tx_ref[i] = GRANT_INVALID_REF;
+ np->grant_tx_ref[i] = GRANT_REF_INVALID;
}
np->tx_mbufs[NET_TX_RING_SIZE] = (void *)0;
for (i = 0; i <= NET_RX_RING_SIZE; i++) {
np->rx_mbufs[i] = NULL;
- np->grant_rx_ref[i] = GRANT_INVALID_REF;
+ np->grant_rx_ref[i] = GRANT_REF_INVALID;
}
/* A grant for every tx ring slot */
if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
@@ -2128,8 +2134,8 @@ netif_disconnect_backend(struct netfront_info *info)
end_access(info->tx_ring_ref, info->tx.sring);
end_access(info->rx_ring_ref, info->rx.sring);
- info->tx_ring_ref = GRANT_INVALID_REF;
- info->rx_ring_ref = GRANT_INVALID_REF;
+ info->tx_ring_ref = GRANT_REF_INVALID;
+ info->rx_ring_ref = GRANT_REF_INVALID;
info->tx.sring = NULL;
info->rx.sring = NULL;
@@ -2143,7 +2149,7 @@ netif_disconnect_backend(struct netfront_info *info)
static void
end_access(int ref, void *page)
{
- if (ref != GRANT_INVALID_REF)
+ if (ref != GRANT_REF_INVALID)
gnttab_end_foreign_access(ref, page);
}
@@ -2171,7 +2177,7 @@ static device_method_t netfront_methods[] = {
DEVMETHOD(device_resume, netfront_resume),
/* Xenbus interface */
- DEVMETHOD(xenbus_backend_changed, netfront_backend_changed),
+ DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed),
{ 0, 0 }
};
@@ -2183,4 +2189,4 @@ static driver_t netfront_driver = {
};
devclass_t netfront_devclass;
-DRIVER_MODULE(xe, xenbus, netfront_driver, netfront_devclass, 0, 0);
+DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, 0, 0);
diff --git a/sys/dev/xen/xenpci/evtchn.c b/sys/dev/xen/xenpci/evtchn.c
index bdf3ad1..ea53a7e 100644
--- a/sys/dev/xen/xenpci/evtchn.c
+++ b/sys/dev/xen/xenpci/evtchn.c
@@ -181,6 +181,49 @@ bind_listening_port_to_irqhandler(unsigned int remote_domain,
return (0);
}
+int
+bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port, const char *devname, driver_intr_t handler,
+ void *arg, unsigned long irqflags, unsigned int *irqp)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int irq;
+ int error;
+
+ irq = alloc_xen_irq();
+ if (irq < 0)
+ return (irq);
+
+ mtx_lock(&irq_evtchn[irq].lock);
+
+ bind_interdomain.remote_dom = remote_domain;
+ bind_interdomain.remote_port = remote_port;
+ error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+ if (error) {
+ mtx_unlock(&irq_evtchn[irq].lock);
+ free_xen_irq(irq);
+ return (-error);
+ }
+
+ irq_evtchn[irq].handler = handler;
+ irq_evtchn[irq].arg = arg;
+ irq_evtchn[irq].evtchn = bind_interdomain.local_port;
+ irq_evtchn[irq].close = 1;
+ irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0;
+
+ evtchn_to_irq[bind_interdomain.local_port] = irq;
+
+ unmask_evtchn(bind_interdomain.local_port);
+
+ mtx_unlock(&irq_evtchn[irq].lock);
+
+ if (irqp)
+ *irqp = irq;
+ return (0);
+}
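+
+/*
+ * Minimal usage sketch (hypothetical caller; the names "myback",
+ * myback_intr and sc are illustrative, not from this change):
+ *
+ *	static void
+ *	myback_intr(void *arg)
+ *	{
+ *		struct myback_softc *sc = arg;
+ *
+ *		... process work signalled by the remote domain ...
+ *	}
+ *
+ *	error = bind_interdomain_evtchn_to_irqhandler(otherend_id,
+ *	    remote_port, "myback", myback_intr, sc, INTR_MPSAFE, &irq);
+ *	if (error != 0)
+ *		return (error);
+ */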
+
int
bind_caller_port_to_irqhandler(unsigned int caller_port,
const char *devname, driver_intr_t handler, void *arg,
diff --git a/sys/dev/xen/xenpci/xenpci.c b/sys/dev/xen/xenpci/xenpci.c
index 2f2a79f..f4c9f73 100644
--- a/sys/dev/xen/xenpci/xenpci.c
+++ b/sys/dev/xen/xenpci/xenpci.c
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
char *hypercall_stubs;
shared_info_t *HYPERVISOR_shared_info;
static vm_paddr_t shared_info_pa;
+static device_t nexus;
/*
* This is used to find our platform device instance.
@@ -80,7 +81,7 @@ xenpci_cpuid_base(void)
{
uint32_t base, regs[4];
- for (base = 0x40000000; base < 0x40001000; base += 0x100) {
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
do_cpuid(base, regs);
if (!memcmp("XenVMMXenVMM", &regs[1], 12)
&& (regs[0] - base) >= 2)
@@ -204,14 +205,21 @@ xenpci_allocate_resources(device_t dev)
scp->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
&scp->rid_irq, RF_SHAREABLE|RF_ACTIVE);
- if (scp->res_irq == NULL)
+ if (scp->res_irq == NULL) {
+ printf("xenpci Could not allocate irq.\n");
goto errexit;
+ }
scp->rid_memory = PCIR_BAR(1);
scp->res_memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
&scp->rid_memory, RF_ACTIVE);
- if (scp->res_memory == NULL)
+ if (scp->res_memory == NULL) {
+ printf("xenpci Could not allocate memory bar.\n");
goto errexit;
+ }
+
+ scp->phys_next = rman_get_start(scp->res_memory);
+
return (0);
errexit:
@@ -254,6 +262,36 @@ xenpci_alloc_space(size_t sz, vm_paddr_t *pa)
}
}
+static struct resource *
+xenpci_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ u_long start, u_long end, u_long count, u_int flags)
+{
+ return (BUS_ALLOC_RESOURCE(nexus, child, type, rid, start,
+ end, count, flags));
+}
+
+
+static int
+xenpci_release_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ return (BUS_RELEASE_RESOURCE(nexus, child, type, rid, r));
+}
+
+static int
+xenpci_activate_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ return (BUS_ACTIVATE_RESOURCE(nexus, child, type, rid, r));
+}
+
+static int
+xenpci_deactivate_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ return (BUS_DEACTIVATE_RESOURCE(nexus, child, type, rid, r));
+}
+
/*
* Called very early in the resume sequence - reinitialise the various
* bits of Xen machinery including the hypercall page and the shared
@@ -303,20 +341,36 @@ xenpci_probe(device_t dev)
static int
xenpci_attach(device_t dev)
{
- int error;
+ int error;
struct xenpci_softc *scp = device_get_softc(dev);
struct xen_add_to_physmap xatp;
vm_offset_t shared_va;
+ devclass_t dc;
+
+ /*
+ * Find and record nexus0. Since we are not really on the
+ * PCI bus, all resource operations are directed to nexus
+ * instead of through our parent.
+ */
+ if ((dc = devclass_find("nexus")) == 0
+ || (nexus = devclass_get_device(dc, 0)) == 0) {
+ device_printf(dev, "unable to find nexus.");
+ return (ENOENT);
+ }
error = xenpci_allocate_resources(dev);
- if (error)
+ if (error) {
+ device_printf(dev, "xenpci_allocate_resources failed(%d).\n",
+ error);
goto errexit;
-
- scp->phys_next = rman_get_start(scp->res_memory);
+ }
error = xenpci_init_hypercall_stubs(dev, scp);
- if (error)
+ if (error) {
+ device_printf(dev, "xenpci_init_hypercall_stubs failed(%d).\n",
+ error);
goto errexit;
+ }
setup_xen_features();
@@ -346,7 +400,7 @@ errexit:
* Undo anything we may have done.
*/
xenpci_deallocate_resources(dev);
- return (error);
+ return (error);
}
/*
@@ -364,8 +418,9 @@ xenpci_detach(device_t dev)
*/
if (scp->intr_cookie != NULL) {
if (BUS_TEARDOWN_INTR(parent, dev,
- scp->res_irq, scp->intr_cookie) != 0)
- printf("intr teardown failed.. continuing\n");
+ scp->res_irq, scp->intr_cookie) != 0)
+ device_printf(dev,
+ "intr teardown failed.. continuing\n");
scp->intr_cookie = NULL;
}
@@ -386,6 +441,10 @@ static device_method_t xenpci_methods[] = {
/* Bus interface */
DEVMETHOD(bus_add_child, bus_generic_add_child),
+ DEVMETHOD(bus_alloc_resource, xenpci_alloc_resource),
+ DEVMETHOD(bus_release_resource, xenpci_release_resource),
+ DEVMETHOD(bus_activate_resource, xenpci_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, xenpci_deactivate_resource),
{ 0, 0 }
};
diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 5e50d3e..0d35d1d 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -1933,7 +1933,15 @@ again:
vn_lock(vp,
LK_EXCLUSIVE |
LK_RETRY);
- r = VOP_LOOKUP(vp, &nvp, &cn);
+ if ((vp->v_vflag & VV_ROOT) != 0
+ && (cn.cn_flags & ISDOTDOT)
+ != 0) {
+ vref(vp);
+ nvp = vp;
+ r = 0;
+ } else
+ r = VOP_LOOKUP(vp, &nvp,
+ &cn);
}
}
if (!r) {
diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c
index 05402ce..3b9ffdd81b 100644
--- a/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -106,7 +106,7 @@ struct g_class g_eli_class = {
/*
* Code paths:
* BIO_READ:
- * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
* BIO_WRITE:
* g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
@@ -148,7 +148,7 @@ g_eli_crypto_rerun(struct cryptop *crp)
/*
* The function is called after reading encrypted data from the provider.
*
- * g_eli_start -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
*/
void
g_eli_read_done(struct bio *bp)
@@ -167,6 +167,7 @@ g_eli_read_done(struct bio *bp)
if (pbp->bio_inbed < pbp->bio_children)
return;
g_destroy_bio(bp);
+ sc = pbp->bio_to->geom->softc;
if (pbp->bio_error != 0) {
G_ELI_LOGREQ(0, pbp, "%s() failed", __func__);
pbp->bio_completed = 0;
@@ -175,9 +176,9 @@ g_eli_read_done(struct bio *bp)
pbp->bio_driver2 = NULL;
}
g_io_deliver(pbp, pbp->bio_error);
+ atomic_subtract_int(&sc->sc_inflight, 1);
return;
}
- sc = pbp->bio_to->geom->softc;
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, pbp);
mtx_unlock(&sc->sc_queue_mtx);
@@ -192,6 +193,7 @@ g_eli_read_done(struct bio *bp)
void
g_eli_write_done(struct bio *bp)
{
+ struct g_eli_softc *sc;
struct bio *pbp;
G_ELI_LOGREQ(2, bp, "Request done.");
@@ -218,7 +220,9 @@ g_eli_write_done(struct bio *bp)
* Write is finished, send it up.
*/
pbp->bio_completed = pbp->bio_length;
+ sc = pbp->bio_to->geom->softc;
g_io_deliver(pbp, pbp->bio_error);
+ atomic_subtract_int(&sc->sc_inflight, 1);
}
/*
@@ -241,12 +245,14 @@ g_eli_orphan(struct g_consumer *cp)
sc = cp->geom->softc;
if (sc == NULL)
return;
- g_eli_destroy(sc, 1);
+ g_eli_destroy(sc, TRUE);
}
/*
- * BIO_READ : G_ELI_START -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
- * BIO_WRITE: G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
+ * BIO_READ:
+ * G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ * BIO_WRITE:
+ * G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
static void
g_eli_start(struct bio *bp)
@@ -282,24 +288,16 @@ g_eli_start(struct bio *bp)
g_io_deliver(bp, ENOMEM);
return;
}
+ bp->bio_driver1 = cbp;
+ bp->bio_pflags = G_ELI_NEW_BIO;
switch (bp->bio_cmd) {
case BIO_READ:
if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) {
- bp->bio_driver2 = NULL;
- cbp->bio_done = g_eli_read_done;
- cp = LIST_FIRST(&sc->sc_geom->consumer);
- cbp->bio_to = cp->provider;
- G_ELI_LOGREQ(2, cbp, "Sending request.");
- /*
- * Read encrypted data from provider.
- */
- g_io_request(cbp, cp);
+ g_eli_crypto_read(sc, bp, 0);
break;
}
- bp->bio_pflags = 255;
/* FALLTHROUGH */
case BIO_WRITE:
- bp->bio_driver1 = cbp;
mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp);
mtx_unlock(&sc->sc_queue_mtx);
@@ -316,6 +314,104 @@ g_eli_start(struct bio *bp)
}
}
+static int
+g_eli_newsession(struct g_eli_worker *wr)
+{
+ struct g_eli_softc *sc;
+ struct cryptoini crie, cria;
+ int error;
+
+ sc = wr->w_softc;
+
+ bzero(&crie, sizeof(crie));
+ crie.cri_alg = sc->sc_ealgo;
+ crie.cri_klen = sc->sc_ekeylen;
+ if (sc->sc_ealgo == CRYPTO_AES_XTS)
+ crie.cri_klen <<= 1;
+ crie.cri_key = sc->sc_ekeys[0];
+ if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+ bzero(&cria, sizeof(cria));
+ cria.cri_alg = sc->sc_aalgo;
+ cria.cri_klen = sc->sc_akeylen;
+ cria.cri_key = sc->sc_akey;
+ crie.cri_next = &cria;
+ }
+
+ switch (sc->sc_crypto) {
+ case G_ELI_CRYPTO_SW:
+ error = crypto_newsession(&wr->w_sid, &crie,
+ CRYPTOCAP_F_SOFTWARE);
+ break;
+ case G_ELI_CRYPTO_HW:
+ error = crypto_newsession(&wr->w_sid, &crie,
+ CRYPTOCAP_F_HARDWARE);
+ break;
+ case G_ELI_CRYPTO_UNKNOWN:
+ error = crypto_newsession(&wr->w_sid, &crie,
+ CRYPTOCAP_F_HARDWARE);
+ if (error == 0) {
+ mtx_lock(&sc->sc_queue_mtx);
+ if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
+ sc->sc_crypto = G_ELI_CRYPTO_HW;
+ mtx_unlock(&sc->sc_queue_mtx);
+ } else {
+ error = crypto_newsession(&wr->w_sid, &crie,
+ CRYPTOCAP_F_SOFTWARE);
+ mtx_lock(&sc->sc_queue_mtx);
+ if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
+ sc->sc_crypto = G_ELI_CRYPTO_SW;
+ mtx_unlock(&sc->sc_queue_mtx);
+ }
+ break;
+ default:
+ panic("%s: invalid condition", __func__);
+ }
+
+ return (error);
+}
+
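+/*
+ * Design note: the first session established while sc_crypto is still
+ * G_ELI_CRYPTO_UNKNOWN decides, under sc_queue_mtx, whether all
+ * workers use hardware or software crypto.
+ */
+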
+static void
+g_eli_freesession(struct g_eli_worker *wr)
+{
+
+ crypto_freesession(wr->w_sid);
+}
+
+static void
+g_eli_cancel(struct g_eli_softc *sc)
+{
+ struct bio *bp;
+
+ mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
+
+ while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
+ KASSERT(bp->bio_pflags == G_ELI_NEW_BIO,
+ ("Not new bio when canceling (bp=%p).", bp));
+ g_io_deliver(bp, ENXIO);
+ }
+}
+
+static struct bio *
+g_eli_takefirst(struct g_eli_softc *sc)
+{
+ struct bio *bp;
+
+ mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
+
+ if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
+ return (bioq_takefirst(&sc->sc_queue));
+ /*
+ * Device suspended, so we skip new I/O requests.
+ */
+ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
+ if (bp->bio_pflags != G_ELI_NEW_BIO)
+ break;
+ }
+ if (bp != NULL)
+ bioq_remove(&sc->sc_queue, bp);
+ return (bp);
+}
+
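+/*
+ * Note that while suspended only BIOs that have already entered the
+ * crypto layer (bio_pflags cleared) are handed back to the workers;
+ * brand-new BIOs stay queued until resume.
+ */
+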
/*
* This is the main function for kernel worker thread when we don't have
* hardware acceleration and we have to do cryptography in software.
@@ -328,6 +424,7 @@ g_eli_worker(void *arg)
struct g_eli_softc *sc;
struct g_eli_worker *wr;
struct bio *bp;
+ int error;
wr = arg;
sc = wr->w_softc;
@@ -349,11 +446,13 @@ g_eli_worker(void *arg)
for (;;) {
mtx_lock(&sc->sc_queue_mtx);
- bp = bioq_takefirst(&sc->sc_queue);
+again:
+ bp = g_eli_takefirst(sc);
if (bp == NULL) {
if (sc->sc_flags & G_ELI_FLAG_DESTROY) {
+ g_eli_cancel(sc);
LIST_REMOVE(wr, w_next);
- crypto_freesession(wr->w_sid);
+ g_eli_freesession(wr);
free(wr, M_ELI);
G_ELI_DEBUG(1, "Thread %s exiting.",
curthread->td_proc->p_comm);
@@ -361,16 +460,63 @@ g_eli_worker(void *arg)
mtx_unlock(&sc->sc_queue_mtx);
kproc_exit(0);
}
+ while (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
+ if (sc->sc_inflight > 0) {
+ G_ELI_DEBUG(0, "inflight=%d", sc->sc_inflight);
+ /*
+ * We still have inflight BIOs, so
+ * sleep and retry.
+ */
+ msleep(sc, &sc->sc_queue_mtx, PRIBIO,
+ "geli:inf", hz / 5);
+ goto again;
+ }
+ /*
+ * Suspend requested, mark the worker as
+ * suspended and go to sleep.
+ */
+ if (wr->w_active) {
+ g_eli_freesession(wr);
+ wr->w_active = FALSE;
+ }
+ wakeup(&sc->sc_workers);
+ msleep(sc, &sc->sc_queue_mtx, PRIBIO,
+ "geli:suspend", 0);
+ if (!wr->w_active &&
+ !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
+ error = g_eli_newsession(wr);
+ KASSERT(error == 0,
+ ("g_eli_newsession() failed on resume (error=%d)",
+ error));
+ wr->w_active = TRUE;
+ }
+ goto again;
+ }
msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
continue;
}
+ if (bp->bio_pflags == G_ELI_NEW_BIO)
+ atomic_add_int(&sc->sc_inflight, 1);
mtx_unlock(&sc->sc_queue_mtx);
- if (bp->bio_cmd == BIO_READ && bp->bio_pflags == 255)
- g_eli_auth_read(sc, bp);
- else if (sc->sc_flags & G_ELI_FLAG_AUTH)
- g_eli_auth_run(wr, bp);
- else
- g_eli_crypto_run(wr, bp);
+ if (bp->bio_pflags == G_ELI_NEW_BIO) {
+ bp->bio_pflags = 0;
+ if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+ if (bp->bio_cmd == BIO_READ)
+ g_eli_auth_read(sc, bp);
+ else
+ g_eli_auth_run(wr, bp);
+ } else {
+ if (bp->bio_cmd == BIO_READ)
+ g_eli_crypto_read(sc, bp, 1);
+ else
+ g_eli_crypto_run(wr, bp);
+ }
+ } else {
+ if (sc->sc_flags & G_ELI_FLAG_AUTH)
+ g_eli_auth_run(wr, bp);
+ else
+ g_eli_crypto_run(wr, bp);
+ }
}
}
@@ -500,7 +646,7 @@ g_eli_last_close(struct g_eli_softc *sc)
gp = sc->sc_geom;
pp = LIST_FIRST(&gp->provider);
strlcpy(ppname, pp->name, sizeof(ppname));
- error = g_eli_destroy(sc, 1);
+ error = g_eli_destroy(sc, TRUE);
KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).",
ppname, error));
G_ELI_DEBUG(0, "Detached %s on last close.", ppname);
@@ -557,7 +703,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *cp;
- struct cryptoini crie, cria;
u_int i, threads;
int error;
@@ -584,7 +729,8 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
else
gp->access = g_std_access;
- sc->sc_crypto = G_ELI_CRYPTO_SW;
+ sc->sc_inflight = 0;
+ sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN;
sc->sc_flags = md->md_flags;
/* Backward compatibility. */
if (md->md_version < 4)
@@ -612,14 +758,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
sc->sc_bytes_per_sector =
(md->md_sectorsize - 1) / sc->sc_data_per_sector + 1;
sc->sc_bytes_per_sector *= bpp->sectorsize;
- /*
- * Precalculate SHA256 for HMAC key generation.
- * This is expensive operation and we can do it only once now or
- * for every access to sector, so now will be much better.
- */
- SHA256_Init(&sc->sc_akeyctx);
- SHA256_Update(&sc->sc_akeyctx, sc->sc_akey,
- sizeof(sc->sc_akey));
}
gp->softc = sc;
@@ -679,7 +817,16 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
*/
g_eli_mkey_propagate(sc, mkey);
sc->sc_ekeylen = md->md_keylen;
-
+ if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+ /*
+ * Precalculate SHA256 for HMAC key generation.
+ * This is an expensive operation, and doing it once now is much
+ * cheaper than doing it for every access to a sector.
+ */
+ SHA256_Init(&sc->sc_akeyctx);
+ SHA256_Update(&sc->sc_akeyctx, sc->sc_akey,
+ sizeof(sc->sc_akey));
+ }
/*
* Precalculate SHA256 for IV generation.
* This is expensive operation and we can do it only once now or for
@@ -697,20 +844,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
LIST_INIT(&sc->sc_workers);
- bzero(&crie, sizeof(crie));
- crie.cri_alg = sc->sc_ealgo;
- crie.cri_klen = sc->sc_ekeylen;
- if (sc->sc_ealgo == CRYPTO_AES_XTS)
- crie.cri_klen <<= 1;
- crie.cri_key = sc->sc_ekeys[0];
- if (sc->sc_flags & G_ELI_FLAG_AUTH) {
- bzero(&cria, sizeof(cria));
- cria.cri_alg = sc->sc_aalgo;
- cria.cri_klen = sc->sc_akeylen;
- cria.cri_key = sc->sc_akey;
- crie.cri_next = &cria;
- }
-
threads = g_eli_threads;
if (threads == 0)
threads = mp_ncpus;
@@ -728,21 +861,9 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);
wr->w_softc = sc;
wr->w_number = i;
+ wr->w_active = TRUE;
- /*
- * If this is the first pass, try to get hardware support.
- * Use software cryptography, if we cannot get it.
- */
- if (LIST_EMPTY(&sc->sc_workers)) {
- error = crypto_newsession(&wr->w_sid, &crie,
- CRYPTOCAP_F_HARDWARE);
- if (error == 0)
- sc->sc_crypto = G_ELI_CRYPTO_HW;
- }
- if (sc->sc_crypto == G_ELI_CRYPTO_SW) {
- error = crypto_newsession(&wr->w_sid, &crie,
- CRYPTOCAP_F_SOFTWARE);
- }
+ error = g_eli_newsession(wr);
if (error != 0) {
free(wr, M_ELI);
if (req != NULL) {
@@ -758,7 +879,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0,
"g_eli[%u] %s", i, bpp->name);
if (error != 0) {
- crypto_freesession(wr->w_sid);
+ g_eli_freesession(wr);
free(wr, M_ELI);
if (req != NULL) {
gctl_error(req, "Cannot create kernel thread "
@@ -875,7 +996,7 @@ g_eli_destroy_geom(struct gctl_req *req __unused,
struct g_eli_softc *sc;
sc = gp->softc;
- return (g_eli_destroy(sc, 0));
+ return (g_eli_destroy(sc, FALSE));
}
static int
@@ -1106,6 +1227,7 @@ g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
sbuf_printf(sb, name); \
} \
} while (0)
+ ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND");
ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY");
ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER");
ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME");
@@ -1167,7 +1289,7 @@ g_eli_shutdown_pre_sync(void *arg, int howto)
pp = LIST_FIRST(&gp->provider);
KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name));
if (pp->acr + pp->acw + pp->ace == 0)
- error = g_eli_destroy(sc, 1);
+ error = g_eli_destroy(sc, TRUE);
else {
sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
gp->access = g_eli_access;
diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h
index e6b311d..d67e436 100644
--- a/sys/geom/eli/g_eli.h
+++ b/sys/geom/eli/g_eli.h
@@ -86,6 +86,10 @@
#define G_ELI_FLAG_NATIVE_BYTE_ORDER 0x00040000
/* Provider uses single encryption key. */
#define G_ELI_FLAG_SINGLE_KEY 0x00080000
+/* Device suspended. */
+#define G_ELI_FLAG_SUSPEND 0x00100000
+
+#define G_ELI_NEW_BIO 255
#define SHA512_MDLEN 64
#define G_ELI_AUTH_SECKEYLEN SHA256_DIGEST_LENGTH
@@ -109,6 +113,7 @@ extern int g_eli_debug;
extern u_int g_eli_overwrites;
extern u_int g_eli_batch;
+#define G_ELI_CRYPTO_UNKNOWN 0
#define G_ELI_CRYPTO_HW 1
#define G_ELI_CRYPTO_SW 2
@@ -140,6 +145,7 @@ struct g_eli_worker {
struct proc *w_proc;
u_int w_number;
uint64_t w_sid;
+ boolean_t w_active;
LIST_ENTRY(g_eli_worker) w_next;
};
@@ -160,6 +166,7 @@ struct g_eli_softc {
SHA256_CTX sc_ivctx;
int sc_nkey;
uint32_t sc_flags;
+ int sc_inflight;
off_t sc_mediasize;
size_t sc_sectorsize;
u_int sc_bytes_per_sector;
@@ -499,6 +506,7 @@ uint8_t *g_eli_crypto_key(struct g_eli_softc *sc, off_t offset,
void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv,
size_t size);
+void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker);
void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp);
void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp);
diff --git a/sys/geom/eli/g_eli_ctl.c b/sys/geom/eli/g_eli_ctl.c
index 02ede13..7147b27 100644
--- a/sys/geom/eli/g_eli_ctl.c
+++ b/sys/geom/eli/g_eli_ctl.c
@@ -217,7 +217,7 @@ g_eli_ctl_detach(struct gctl_req *req, struct g_class *mp)
sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
sc->sc_geom->access = g_eli_access;
} else {
- error = g_eli_destroy(sc, *force);
+ error = g_eli_destroy(sc, *force ? TRUE : FALSE);
if (error != 0) {
gctl_error(req,
"Cannot destroy device %s (error=%d).",
@@ -700,6 +700,213 @@ g_eli_ctl_delkey(struct gctl_req *req, struct g_class *mp)
}
static int
+g_eli_suspend_one(struct g_eli_softc *sc)
+{
+ struct g_eli_worker *wr;
+
+ g_topology_assert();
+
+ if (sc == NULL)
+ return (ENOENT);
+ if (sc->sc_flags & G_ELI_FLAG_ONETIME)
+ return (EOPNOTSUPP);
+
+ mtx_lock(&sc->sc_queue_mtx);
+ if (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
+ mtx_unlock(&sc->sc_queue_mtx);
+ return (EALREADY);
+ }
+ sc->sc_flags |= G_ELI_FLAG_SUSPEND;
+ wakeup(sc);
+ for (;;) {
+ LIST_FOREACH(wr, &sc->sc_workers, w_next) {
+ if (wr->w_active)
+ break;
+ }
+ if (wr == NULL)
+ break;
+ /* Not all threads suspended. */
+ msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
+ "geli:suspend", 0);
+ }
+ /*
+ * Clear sensitive data on suspend; it will be recovered on resume.
+ */
+ bzero(sc->sc_mkey, sizeof(sc->sc_mkey));
+ bzero(sc->sc_ekeys,
+ sc->sc_nekeys * (sizeof(uint8_t *) + G_ELI_DATAKEYLEN));
+ free(sc->sc_ekeys, M_ELI);
+ sc->sc_ekeys = NULL;
+ bzero(sc->sc_akey, sizeof(sc->sc_akey));
+ bzero(&sc->sc_akeyctx, sizeof(sc->sc_akeyctx));
+ bzero(sc->sc_ivkey, sizeof(sc->sc_ivkey));
+ bzero(&sc->sc_ivctx, sizeof(sc->sc_ivctx));
+ mtx_unlock(&sc->sc_queue_mtx);
+ G_ELI_DEBUG(0, "%s has been suspended.", sc->sc_name);
+ return (0);
+}
+
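+/*
+ * Editor's summary of the suspend contract: once suspended, all key
+ * material in the softc has been zeroed, so queued and future BIOs are
+ * parked on sc_queue until g_eli_ctl_resume() re-derives the keys from
+ * the on-disk metadata and the user-supplied key.
+ */
+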
+static void
+g_eli_ctl_suspend(struct gctl_req *req, struct g_class *mp)
+{
+ struct g_eli_softc *sc;
+ int *all, *nargs;
+ int error;
+
+ g_topology_assert();
+
+ nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+ if (nargs == NULL) {
+ gctl_error(req, "No '%s' argument.", "nargs");
+ return;
+ }
+ all = gctl_get_paraml(req, "all", sizeof(*all));
+ if (all == NULL) {
+ gctl_error(req, "No '%s' argument.", "all");
+ return;
+ }
+ if (!*all && *nargs == 0) {
+ gctl_error(req, "Too few arguments.");
+ return;
+ }
+
+ if (*all) {
+ struct g_geom *gp, *gp2;
+
+ LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
+ sc = gp->softc;
+ if (sc->sc_flags & G_ELI_FLAG_ONETIME)
+ continue;
+ error = g_eli_suspend_one(sc);
+ if (error != 0)
+ gctl_error(req, "Not fully done.");
+ }
+ } else {
+ const char *prov;
+ char param[16];
+ int i;
+
+ for (i = 0; i < *nargs; i++) {
+ snprintf(param, sizeof(param), "arg%d", i);
+ prov = gctl_get_asciiparam(req, param);
+ if (prov == NULL) {
+ G_ELI_DEBUG(0, "No 'arg%d' argument.", i);
+ continue;
+ }
+
+ sc = g_eli_find_device(mp, prov);
+ if (sc == NULL) {
+ G_ELI_DEBUG(0, "No such provider: %s.", prov);
+ continue;
+ }
+ error = g_eli_suspend_one(sc);
+ if (error != 0)
+ gctl_error(req, "Not fully done.");
+ }
+ }
+}
+
+static void
+g_eli_ctl_resume(struct gctl_req *req, struct g_class *mp)
+{
+ struct g_eli_metadata md;
+ struct g_eli_softc *sc;
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ const char *name;
+ u_char *key, mkey[G_ELI_DATAIVKEYLEN];
+ int *nargs, keysize, error;
+ u_int nkey;
+
+ g_topology_assert();
+
+ nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+ if (nargs == NULL) {
+ gctl_error(req, "No '%s' argument.", "nargs");
+ return;
+ }
+ if (*nargs != 1) {
+ gctl_error(req, "Invalid number of arguments.");
+ return;
+ }
+
+ name = gctl_get_asciiparam(req, "arg0");
+ if (name == NULL) {
+ gctl_error(req, "No 'arg%u' argument.", 0);
+ return;
+ }
+ sc = g_eli_find_device(mp, name);
+ if (sc == NULL) {
+ gctl_error(req, "Provider %s is invalid.", name);
+ return;
+ }
+ if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
+ gctl_error(req, "Provider %s not suspended.", name);
+ return;
+ }
+ cp = LIST_FIRST(&sc->sc_geom->consumer);
+ pp = cp->provider;
+ error = g_eli_read_metadata(mp, pp, &md);
+ if (error != 0) {
+ gctl_error(req, "Cannot read metadata from %s (error=%d).",
+ name, error);
+ return;
+ }
+ if (md.md_keys == 0x00) {
+ bzero(&md, sizeof(md));
+ gctl_error(req, "No valid keys on %s.", pp->name);
+ return;
+ }
+
+ key = gctl_get_param(req, "key", &keysize);
+ if (key == NULL || keysize != G_ELI_USERKEYLEN) {
+ bzero(&md, sizeof(md));
+ gctl_error(req, "No '%s' argument.", "key");
+ return;
+ }
+
+ error = g_eli_mkey_decrypt(&md, key, mkey, &nkey);
+ bzero(key, keysize);
+ if (error == -1) {
+ bzero(&md, sizeof(md));
+ gctl_error(req, "Wrong key for %s.", pp->name);
+ return;
+ } else if (error > 0) {
+ bzero(&md, sizeof(md));
+ gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).",
+ pp->name, error);
+ return;
+ }
+ G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name);
+
+ mtx_lock(&sc->sc_queue_mtx);
+ /* Restore sc_mkey, sc_ekeys, sc_akey and sc_ivkey. */
+ g_eli_mkey_propagate(sc, mkey);
+ bzero(mkey, sizeof(mkey));
+ bzero(&md, sizeof(md));
+ /* Restore sc_akeyctx. */
+ if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+ SHA256_Init(&sc->sc_akeyctx);
+ SHA256_Update(&sc->sc_akeyctx, sc->sc_akey,
+ sizeof(sc->sc_akey));
+ }
+ /* Restore sc_ivctx. */
+ switch (sc->sc_ealgo) {
+ case CRYPTO_AES_XTS:
+ break;
+ default:
+ SHA256_Init(&sc->sc_ivctx);
+ SHA256_Update(&sc->sc_ivctx, sc->sc_ivkey,
+ sizeof(sc->sc_ivkey));
+ break;
+ }
+ sc->sc_flags &= ~G_ELI_FLAG_SUSPEND;
+ mtx_unlock(&sc->sc_queue_mtx);
+ G_ELI_DEBUG(1, "Resumed %s.", pp->name);
+ wakeup(sc);
+}
+
+static int
g_eli_kill_one(struct g_eli_softc *sc)
{
struct g_provider *pp;
@@ -749,7 +956,7 @@ g_eli_kill_one(struct g_eli_softc *sc)
}
if (error == 0)
G_ELI_DEBUG(0, "%s has been killed.", pp->name);
- g_eli_destroy(sc, 1);
+ g_eli_destroy(sc, TRUE);
return (error);
}
@@ -839,6 +1046,10 @@ g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb)
g_eli_ctl_setkey(req, mp);
else if (strcmp(verb, "delkey") == 0)
g_eli_ctl_delkey(req, mp);
+ else if (strcmp(verb, "suspend") == 0)
+ g_eli_ctl_suspend(req, mp);
+ else if (strcmp(verb, "resume") == 0)
+ g_eli_ctl_resume(req, mp);
else if (strcmp(verb, "kill") == 0)
g_eli_ctl_kill(req, mp);
else
diff --git a/sys/geom/eli/g_eli_integrity.c b/sys/geom/eli/g_eli_integrity.c
index bafce96..24586bd 100644
--- a/sys/geom/eli/g_eli_integrity.c
+++ b/sys/geom/eli/g_eli_integrity.c
@@ -129,6 +129,7 @@ g_eli_auth_keygen(struct g_eli_softc *sc, off_t offset, u_char *key)
static int
g_eli_auth_read_done(struct cryptop *crp)
{
+ struct g_eli_softc *sc;
struct bio *bp;
if (crp->crp_etype == EAGAIN) {
@@ -152,8 +153,8 @@ g_eli_auth_read_done(struct cryptop *crp)
*/
if (bp->bio_inbed < bp->bio_children)
return (0);
+ sc = bp->bio_to->geom->softc;
if (bp->bio_error == 0) {
- struct g_eli_softc *sc;
u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize;
u_char *srcdata, *dstdata, *auth;
off_t coroff, corsize;
@@ -161,7 +162,6 @@ g_eli_auth_read_done(struct cryptop *crp)
/*
* Verify data integrity based on calculated and read HMACs.
*/
- sc = bp->bio_to->geom->softc;
/* Sectorsize of decrypted provider eg. 4096. */
decr_secsize = bp->bio_to->sectorsize;
/* The real sectorsize of encrypted provider, eg. 512. */
@@ -240,6 +240,7 @@ g_eli_auth_read_done(struct cryptop *crp)
* Read is finished, send it up.
*/
g_io_deliver(bp, bp->bio_error);
+ atomic_subtract_int(&sc->sc_inflight, 1);
return (0);
}
@@ -276,6 +277,7 @@ g_eli_auth_write_done(struct cryptop *crp)
*/
if (bp->bio_inbed < bp->bio_children)
return (0);
+ sc = bp->bio_to->geom->softc;
if (bp->bio_error != 0) {
G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
bp->bio_error);
@@ -285,9 +287,9 @@ g_eli_auth_write_done(struct cryptop *crp)
bp->bio_driver1 = NULL;
g_destroy_bio(cbp);
g_io_deliver(bp, bp->bio_error);
+ atomic_subtract_int(&sc->sc_inflight, 1);
return (0);
}
- sc = bp->bio_to->geom->softc;
cp = LIST_FIRST(&sc->sc_geom->consumer);
cbp = bp->bio_driver1;
bp->bio_driver1 = NULL;
@@ -392,6 +394,11 @@ g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp)
/*
* This is the main function responsible for cryptography (ie. communication
* with crypto(9) subsystem).
+ *
+ * BIO_READ:
+ * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> G_ELI_AUTH_RUN -> g_eli_auth_read_done -> g_io_deliver
+ * BIO_WRITE:
+ * g_eli_start -> G_ELI_AUTH_RUN -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
void
g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp)
diff --git a/sys/geom/eli/g_eli_privacy.c b/sys/geom/eli/g_eli_privacy.c
index a6f572b..ee133c6 100644
--- a/sys/geom/eli/g_eli_privacy.c
+++ b/sys/geom/eli/g_eli_privacy.c
@@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$");
/*
* Code paths:
* BIO_READ:
- * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
* BIO_WRITE:
* g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
@@ -63,11 +63,12 @@ MALLOC_DECLARE(M_ELI);
/*
* The function is called after we read and decrypt data.
*
- * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver
*/
static int
g_eli_crypto_read_done(struct cryptop *crp)
{
+ struct g_eli_softc *sc;
struct bio *bp;
if (crp->crp_etype == EAGAIN) {
@@ -101,7 +102,9 @@ g_eli_crypto_read_done(struct cryptop *crp)
/*
* Read is finished, send it up.
*/
+ sc = bp->bio_to->geom->softc;
g_io_deliver(bp, bp->bio_error);
+ atomic_subtract_int(&sc->sc_inflight, 1);
return (0);
}
@@ -113,6 +116,7 @@ g_eli_crypto_read_done(struct cryptop *crp)
static int
g_eli_crypto_write_done(struct cryptop *crp)
{
+ struct g_eli_softc *sc;
struct g_geom *gp;
struct g_consumer *cp;
struct bio *bp, *cbp;
@@ -141,18 +145,20 @@ g_eli_crypto_write_done(struct cryptop *crp)
bp->bio_children = 1;
cbp = bp->bio_driver1;
bp->bio_driver1 = NULL;
+ gp = bp->bio_to->geom;
if (bp->bio_error != 0) {
G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
bp->bio_error);
free(bp->bio_driver2, M_ELI);
bp->bio_driver2 = NULL;
g_destroy_bio(cbp);
+ sc = gp->softc;
g_io_deliver(bp, bp->bio_error);
+ atomic_subtract_int(&sc->sc_inflight, 1);
return (0);
}
cbp->bio_data = bp->bio_driver2;
cbp->bio_done = g_eli_write_done;
- gp = bp->bio_to->geom;
cp = LIST_FIRST(&gp->consumer);
cbp->bio_to = cp->provider;
G_ELI_LOGREQ(2, cbp, "Sending request.");
@@ -164,8 +170,57 @@ g_eli_crypto_write_done(struct cryptop *crp)
}
/*
+ * The function is called to read encrypted data.
+ *
+ * g_eli_start -> G_ELI_CRYPTO_READ -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ */
+void
+g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker)
+{
+ struct g_consumer *cp;
+ struct bio *cbp;
+
+ if (!fromworker) {
+ /*
+ * We are not called from the worker thread, so check if
+ * device is suspended.
+ */
+ mtx_lock(&sc->sc_queue_mtx);
+ if (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
+ /*
+ * If device is suspended, we place the request onto
+ * the queue, so it can be handled after resume.
+ */
+ G_ELI_DEBUG(0, "device suspended, move onto queue");
+ bioq_insert_tail(&sc->sc_queue, bp);
+ mtx_unlock(&sc->sc_queue_mtx);
+ wakeup(sc);
+ return;
+ }
+ atomic_add_int(&sc->sc_inflight, 1);
+ mtx_unlock(&sc->sc_queue_mtx);
+ }
+ bp->bio_pflags = 0;
+ bp->bio_driver2 = NULL;
+ cbp = bp->bio_driver1;
+ cbp->bio_done = g_eli_read_done;
+ cp = LIST_FIRST(&sc->sc_geom->consumer);
+ cbp->bio_to = cp->provider;
+ G_ELI_LOGREQ(2, cbp, "Sending request.");
+ /*
+ * Read encrypted data from provider.
+ */
+ g_io_request(cbp, cp);
+}
+
+/*
* This is the main function responsible for cryptography (i.e., communication
* with the crypto(9) subsystem).
+ *
+ * BIO_READ:
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver
+ * BIO_WRITE:
+ * g_eli_start -> G_ELI_CRYPTO_RUN -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
*/
void
g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp)
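Note that g_eli_crypto_read() tests G_ELI_FLAG_SUSPEND and increments sc_inflight under a single sc_queue_mtx acquisition; doing the two steps separately would let a request slip past a concurrent suspend. A compact sketch of that admission step (names follow the hunk above, but this is an illustration):

    static int
    io_admit(struct g_eli_softc *sc, struct bio *bp)
    {
            mtx_lock(&sc->sc_queue_mtx);
            if (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
                    /* Park the bio; the worker replays it on resume. */
                    bioq_insert_tail(&sc->sc_queue, bp);
                    mtx_unlock(&sc->sc_queue_mtx);
                    wakeup(sc);
                    return (0);     /* not admitted */
            }
            atomic_add_int(&sc->sc_inflight, 1);
            mtx_unlock(&sc->sc_queue_mtx);
            return (1);             /* admitted; caller issues the I/O */
    }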
diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c
index 060fad5..542f4df 100644
--- a/sys/i386/xen/xen_machdep.c
+++ b/sys/i386/xen/xen_machdep.c
@@ -722,7 +722,9 @@ char *bootmem_start, *bootmem_current, *bootmem_end;
pteinfo_t *pteinfo_list;
void initvalues(start_info_t *startinfo);
-struct ringbuf_head *xen_store; /* XXX move me */
+struct xenstore_domain_interface;
+extern struct xenstore_domain_interface *xen_store;
+
char *console_page;
void *
@@ -1082,7 +1084,7 @@ initvalues(start_info_t *startinfo)
HYPERVISOR_shared_info = (shared_info_t *)cur_space;
cur_space += PAGE_SIZE;
- xen_store = (struct ringbuf_head *)cur_space;
+ xen_store = (struct xenstore_domain_interface *)cur_space;
cur_space += PAGE_SIZE;
console_page = (char *)cur_space;
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 3c05530..1e4d690 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -655,16 +655,8 @@ interpret:
setsugid(p);
#ifdef KTRACE
- if (p->p_tracevp != NULL &&
- priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) {
- mtx_lock(&ktrace_mtx);
- p->p_traceflag = 0;
- tracevp = p->p_tracevp;
- p->p_tracevp = NULL;
- tracecred = p->p_tracecred;
- p->p_tracecred = NULL;
- mtx_unlock(&ktrace_mtx);
- }
+ if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
+ ktrprocexec(p, &tracecred, &tracevp);
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 029f1c3..31389e1 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -121,10 +121,6 @@ exit1(struct thread *td, int rv)
struct proc *p, *nq, *q;
struct vnode *vtmp;
struct vnode *ttyvp = NULL;
-#ifdef KTRACE
- struct vnode *tracevp;
- struct ucred *tracecred;
-#endif
struct plimit *plim;
int locked;
@@ -356,33 +352,7 @@ exit1(struct thread *td, int rv)
if (ttyvp != NULL)
vrele(ttyvp);
#ifdef KTRACE
- /*
- * Disable tracing, then drain any pending records and release
- * the trace file.
- */
- if (p->p_traceflag != 0) {
- PROC_LOCK(p);
- mtx_lock(&ktrace_mtx);
- p->p_traceflag = 0;
- mtx_unlock(&ktrace_mtx);
- PROC_UNLOCK(p);
- ktrprocexit(td);
- PROC_LOCK(p);
- mtx_lock(&ktrace_mtx);
- tracevp = p->p_tracevp;
- p->p_tracevp = NULL;
- tracecred = p->p_tracecred;
- p->p_tracecred = NULL;
- mtx_unlock(&ktrace_mtx);
- PROC_UNLOCK(p);
- if (tracevp != NULL) {
- locked = VFS_LOCK_GIANT(tracevp->v_mount);
- vrele(tracevp);
- VFS_UNLOCK_GIANT(locked);
- }
- if (tracecred != NULL)
- crfree(tracecred);
- }
+ ktrprocexit(td);
#endif
/*
* Release reference to text vnode
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index da2c415..126c668 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -645,21 +645,7 @@ again:
callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
#ifdef KTRACE
- /*
- * Copy traceflag and tracefile if enabled.
- */
- mtx_lock(&ktrace_mtx);
- KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
- if (p1->p_traceflag & KTRFAC_INHERIT) {
- p2->p_traceflag = p1->p_traceflag;
- if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
- VREF(p2->p_tracevp);
- KASSERT(p1->p_tracecred != NULL,
- ("ktrace vnode with no cred"));
- p2->p_tracecred = crhold(p1->p_tracecred);
- }
- }
- mtx_unlock(&ktrace_mtx);
+ ktrprocfork(p1, p2);
#endif
/*
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index bf530e1..6e2285b 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -126,7 +126,7 @@ SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
0, "Maximum size of genio event payload");
static int print_message = 1;
-struct mtx ktrace_mtx;
+static struct mtx ktrace_mtx;
static struct sx ktrace_sx;
static void ktrace_init(void *dummy);
@@ -134,7 +134,10 @@ static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
static u_int ktrace_resize_pool(u_int newsize);
static struct ktr_request *ktr_getrequest(int type);
static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
+static void ktr_freeproc(struct proc *p, struct ucred **uc,
+ struct vnode **vp);
static void ktr_freerequest(struct ktr_request *req);
+static void ktr_freerequest_locked(struct ktr_request *req);
static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
@@ -375,11 +378,43 @@ static void
ktr_freerequest(struct ktr_request *req)
{
+ mtx_lock(&ktrace_mtx);
+ ktr_freerequest_locked(req);
+ mtx_unlock(&ktrace_mtx);
+}
+
+static void
+ktr_freerequest_locked(struct ktr_request *req)
+{
+
+ mtx_assert(&ktrace_mtx, MA_OWNED);
if (req->ktr_buffer != NULL)
free(req->ktr_buffer, M_KTRACE);
- mtx_lock(&ktrace_mtx);
STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
- mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * Disable tracing for a process and release all associated resources.
+ * The caller is responsible for releasing a reference on the returned
+ * vnode and credentials.
+ */
+static void
+ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+ struct ktr_request *req;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&ktrace_mtx, MA_OWNED);
+ *uc = p->p_tracecred;
+ p->p_tracecred = NULL;
+ if (vp != NULL)
+ *vp = p->p_tracevp;
+ p->p_tracevp = NULL;
+ p->p_traceflag = 0;
+ while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
+ STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
+ ktr_freerequest_locked(req);
+ }
}
void
@@ -432,20 +467,79 @@ ktrsysret(code, error, retval)
}
/*
- * When a process exits, drain per-process asynchronous trace records.
+ * When a setuid process execs, disable tracing.
+ *
+ * XXX: We toss any pending asynchronous records.
+ */
+void
+ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, uc, vp);
+ mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * When a process exits, drain per-process asynchronous trace records
+ * and disable tracing.
*/
void
ktrprocexit(struct thread *td)
{
+ struct proc *p;
+ struct ucred *cred;
+ struct vnode *vp;
+ int vfslocked;
+
+ p = td->td_proc;
+ if (p->p_traceflag == 0)
+ return;
ktrace_enter(td);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
+ PROC_LOCK(p);
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, &vp);
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ if (vp != NULL) {
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ vrele(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ if (cred != NULL)
+ crfree(cred);
ktrace_exit(td);
}
/*
+ * When a process forks, enable tracing in the new process if needed.
+ */
+void
+ktrprocfork(struct proc *p1, struct proc *p2)
+{
+
+ PROC_LOCK_ASSERT(p1, MA_OWNED);
+ PROC_LOCK_ASSERT(p2, MA_OWNED);
+ mtx_lock(&ktrace_mtx);
+ KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
+ if (p1->p_traceflag & KTRFAC_INHERIT) {
+ p2->p_traceflag = p1->p_traceflag;
+ if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
+ VREF(p2->p_tracevp);
+ KASSERT(p1->p_tracecred != NULL,
+ ("ktrace vnode with no cred"));
+ p2->p_tracecred = crhold(p1->p_tracecred);
+ }
+ }
+ mtx_unlock(&ktrace_mtx);
+}
+
+/*
* When a thread returns, drain any asynchronous records generated by the
* system call.
*/
@@ -694,10 +788,7 @@ ktrace(td, uap)
if (p->p_tracevp == vp) {
if (ktrcanset(td, p)) {
mtx_lock(&ktrace_mtx);
- cred = p->p_tracecred;
- p->p_tracecred = NULL;
- p->p_tracevp = NULL;
- p->p_traceflag = 0;
+ ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
crfree(cred);
@@ -864,14 +955,9 @@ ktrops(td, p, ops, facs, vp)
p->p_traceflag |= KTRFAC_ROOT;
} else {
/* KTROP_CLEAR */
- if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
+ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
/* no more tracing */
- p->p_traceflag = 0;
- tracevp = p->p_tracevp;
- p->p_tracevp = NULL;
- tracecred = p->p_tracecred;
- p->p_tracecred = NULL;
- }
+ ktr_freeproc(p, &tracecred, &tracevp);
}
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
@@ -1036,10 +1122,7 @@ ktr_writerequest(struct thread *td, struct ktr_request *req)
PROC_LOCK(p);
if (p->p_tracevp == vp) {
mtx_lock(&ktrace_mtx);
- p->p_tracevp = NULL;
- p->p_traceflag = 0;
- cred = p->p_tracecred;
- p->p_tracecred = NULL;
+ ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
}
@@ -1051,11 +1134,6 @@ ktr_writerequest(struct thread *td, struct ktr_request *req)
}
sx_sunlock(&allproc_lock);
- /*
- * We can't clear any pending requests in threads that have cached
- * them but not yet committed them, as those are per-thread. The
- * thread will have to clear it itself on system call return.
- */
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
index d1a5c0d..03f6088 100644
--- a/sys/kern/kern_syscalls.c
+++ b/sys/kern/kern_syscalls.c
@@ -181,13 +181,12 @@ syscall_module_handler(struct module *mod, int what, void *arg)
error = syscall_deregister(data->offset, &data->old_sysent);
return (error);
default:
- return EOPNOTSUPP;
+ if (data->chainevh)
+ return (data->chainevh(mod, what, data->chainarg));
+ return (EOPNOTSUPP);
}
- if (data->chainevh)
- return (data->chainevh(mod, what, data->chainarg));
- else
- return (0);
+ /* NOTREACHED */
}
int
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index 7627027..3a9c721 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
-#include <sys/sx.h>
#include <sys/ucontext.h>
#include <sys/thr.h>
#include <sys/rtprio.h>
@@ -431,40 +430,40 @@ thr_suspend(struct thread *td, struct thr_suspend_args *uap)
int
kern_thr_suspend(struct thread *td, struct timespec *tsp)
{
+ struct proc *p = td->td_proc;
struct timeval tv;
int error = 0;
int timo = 0;
- if (tsp != NULL) {
- if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
- return (EINVAL);
- }
-
if (td->td_pflags & TDP_WAKEUP) {
td->td_pflags &= ~TDP_WAKEUP;
return (0);
}
- PROC_LOCK(td->td_proc);
- if ((td->td_flags & TDF_THRWAKEUP) == 0) {
+ if (tsp != NULL) {
+ if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
+ return (EINVAL);
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
error = EWOULDBLOCK;
else {
TIMESPEC_TO_TIMEVAL(&tv, tsp);
timo = tvtohz(&tv);
- error = msleep((void *)td, &td->td_proc->p_mtx,
- PCATCH, "lthr", timo);
}
}
+ PROC_LOCK(p);
+ if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
+ error = msleep((void *)td, &p->p_mtx,
+ PCATCH, "lthr", timo);
+
if (td->td_flags & TDF_THRWAKEUP) {
thread_lock(td);
td->td_flags &= ~TDF_THRWAKEUP;
thread_unlock(td);
- PROC_UNLOCK(td->td_proc);
+ PROC_UNLOCK(p);
return (0);
}
- PROC_UNLOCK(td->td_proc);
+ PROC_UNLOCK(p);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
else if (error == ERESTART) {
diff --git a/sys/mips/rmi/board.c b/sys/mips/rmi/board.c
index 3bee863..4c49dac 100644
--- a/sys/mips/rmi/board.c
+++ b/sys/mips/rmi/board.c
@@ -283,14 +283,14 @@ xls_board_specific_overrides(struct xlr_board_info* board)
break;
case RMI_XLR_BOARD_ARIZONA_VIII:
-
- if (blk1->enabled) {
+ if (blk1->enabled) {
/* There is just one Octal PHY on the board and it is
* connected to the MII interface for NA Quad 0. */
- blk1->gmac_port[0].mii_addr = XLR_IO_GMAC_0_OFFSET;
- blk1->gmac_port[1].mii_addr = XLR_IO_GMAC_0_OFFSET;
- blk1->gmac_port[2].mii_addr = XLR_IO_GMAC_0_OFFSET;
- blk1->gmac_port[3].mii_addr = XLR_IO_GMAC_0_OFFSET;
+ for (i = 0; i < 4; i++) {
+ blk1->gmac_port[i].mii_addr =
+ XLR_IO_GMAC_0_OFFSET;
+ blk1->gmac_port[i].mdint_id = 0;
+ }
}
break;
diff --git a/sys/mips/rmi/dev/nlge/if_nlge.c b/sys/mips/rmi/dev/nlge/if_nlge.c
index 6495e4b..37e1c54 100644
--- a/sys/mips/rmi/dev/nlge/if_nlge.c
+++ b/sys/mips/rmi/dev/nlge/if_nlge.c
@@ -861,7 +861,7 @@ nlge_mii_read(struct device *dev, int phyaddr, int regidx)
int val;
sc = device_get_softc(dev);
- val = (sc->port_type != XLR_XGMII) ? (0xffff) :
+ val = (sc->port_type == XLR_XGMII) ? (0xffff) :
nlge_mii_read_internal(sc->mii_base, phyaddr, regidx);
return (val);
diff --git a/sys/mips/rmi/xlr_machdep.c b/sys/mips/rmi/xlr_machdep.c
index b34955d..8f96633 100644
--- a/sys/mips/rmi/xlr_machdep.c
+++ b/sys/mips/rmi/xlr_machdep.c
@@ -167,6 +167,14 @@ xlr_parse_mmu_options(void)
*/
xlr_ncores = 1;
cpu_map = xlr_boot1_info.cpu_online_map;
+
+#ifndef SMP /* Uniprocessor! */
+ if (cpu_map != 0x1) {
+ printf("WARNING: Starting uniprocessor kernel on cpumask [0x%lx]!\n"
+ "WARNING: Other CPUs will be unused.\n", (u_long)cpu_map);
+ cpu_map = 0x1;
+ }
+#endif
core0_thr_mask = cpu_map & 0xf;
switch (core0_thr_mask) {
case 1:
@@ -188,9 +196,9 @@ xlr_parse_mmu_options(void)
xlr_ncores++;
}
}
+ xlr_hw_thread_mask = cpu_map;
/* setup hardware processor id to cpu id mapping */
- xlr_hw_thread_mask = xlr_boot1_info.cpu_online_map;
for (i = 0; i< MAXCPU; i++)
xlr_cpuid_to_hwtid[i] =
xlr_hwtid_to_cpuid [i] = -1;
diff --git a/sys/net/if.c b/sys/net/if.c
index bd54acf..3c8486a 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -92,6 +92,11 @@
#include <security/mac/mac_framework.h>
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <compat/freebsd32/freebsd32.h>
+#endif
+
struct ifindex_entry {
struct ifnet *ife_ifnet;
};
@@ -2402,6 +2407,17 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
return (error);
}
+#ifdef COMPAT_FREEBSD32
+struct ifconf32 {
+ int32_t ifc_len;
+ union {
+ uint32_t ifcu_buf;
+ uint32_t ifcu_req;
+ } ifc_ifcu;
+};
+#define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32)
+#endif
+
/*
* Interface ioctls.
*/
@@ -2416,10 +2432,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
switch (cmd) {
case SIOCGIFCONF:
case OSIOCGIFCONF:
-#ifdef __amd64__
+ return (ifconf(cmd, data));
+
+#ifdef COMPAT_FREEBSD32
case SIOCGIFCONF32:
+ {
+ struct ifconf32 *ifc32;
+ struct ifconf ifc;
+
+ ifc32 = (struct ifconf32 *)data;
+ ifc.ifc_len = ifc32->ifc_len;
+ ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
+
+ error = ifconf(SIOCGIFCONF, (void *)&ifc);
+ ifc32->ifc_len = ifc.ifc_len;
+ return (error);
+ }
#endif
- return (ifconf(cmd, data));
}
ifr = (struct ifreq *)data;
@@ -2646,23 +2673,12 @@ static int
ifconf(u_long cmd, caddr_t data)
{
struct ifconf *ifc = (struct ifconf *)data;
-#ifdef __amd64__
- struct ifconf32 *ifc32 = (struct ifconf32 *)data;
- struct ifconf ifc_swab;
-#endif
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr;
struct sbuf *sb;
int error, full = 0, valid_len, max_len;
-#ifdef __amd64__
- if (cmd == SIOCGIFCONF32) {
- ifc_swab.ifc_len = ifc32->ifc_len;
- ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf;
- ifc = &ifc_swab;
- }
-#endif
/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
max_len = MAXPHYS - 1;
@@ -2752,10 +2768,6 @@ again:
}
ifc->ifc_len = valid_len;
-#ifdef __amd64__
- if (cmd == SIOCGIFCONF32)
- ifc32->ifc_len = valid_len;
-#endif
sbuf_finish(sb);
error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
sbuf_delete(sb);
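For context, the reason a distinct SIOCGIFCONF32 exists at all: _IOWR() encodes sizeof(argument type) into the command word, and struct ifconf contains a pointer, so 32-bit and 64-bit processes generate different ioctl numbers for the same 'i'/36 request. PTRIN() (from freebsd32.h) widens the stored 32-bit user address to a kernel pointer. A sketch of the layout difference:

    #include <stdint.h>

    /* 32-bit ABI view: 8 bytes total. */
    struct ifconf32_model {
            int32_t         ifc_len;
            uint32_t        ifc_buf;        /* 32-bit user pointer */
    };

    /*
     * The native 64-bit struct ifconf is 16 bytes (int plus pointer,
     * padded), so the two ABIs produce distinct command words and the
     * kernel can dispatch on cmd alone.
     */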
diff --git a/sys/net/if.h b/sys/net/if.h
index ae0daf5..a99b4a7 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -391,16 +391,6 @@ struct ifconf {
#define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */
};
-#if defined (__amd64__)
-struct ifconf32 {
- int ifc_len; /* size of associated buffer */
- union {
- u_int ifcu_buf;
- u_int ifcu_req;
- } ifc_ifcu;
-};
-#endif
-
/*
* interface groups
*/
diff --git a/sys/sys/ktrace.h b/sys/sys/ktrace.h
index a3e5469..9f8810c 100644
--- a/sys/sys/ktrace.h
+++ b/sys/sys/ktrace.h
@@ -191,8 +191,6 @@ struct stat;
#define KTRFAC_DROP 0x20000000 /* last event was dropped */
#ifdef _KERNEL
-extern struct mtx ktrace_mtx;
-
void ktrnamei(char *);
void ktrcsw(int, int);
void ktrpsig(int, sig_t, sigset_t *, int);
@@ -200,7 +198,9 @@ void ktrgenio(int, enum uio_rw, struct uio *, int);
void ktrsyscall(int, int narg, register_t args[]);
void ktrsysctl(int *name, u_int namelen);
void ktrsysret(int, int, register_t);
+void ktrprocexec(struct proc *, struct ucred **, struct vnode **);
void ktrprocexit(struct thread *);
+void ktrprocfork(struct proc *, struct proc *);
void ktruserret(struct thread *);
void ktrstruct(const char *, void *, size_t);
#define ktrsockaddr(s) \
diff --git a/sys/sys/sockio.h b/sys/sys/sockio.h
index 2af2467..4c1c483 100644
--- a/sys/sys/sockio.h
+++ b/sys/sys/sockio.h
@@ -62,9 +62,6 @@
#define SIOCSIFBRDADDR _IOW('i', 19, struct ifreq) /* set broadcast addr */
#define OSIOCGIFCONF _IOWR('i', 20, struct ifconf) /* get ifnet list */
#define SIOCGIFCONF _IOWR('i', 36, struct ifconf) /* get ifnet list */
-#if defined (__amd64__)
-#define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) /* get ifnet list */
-#endif
#define OSIOCGIFNETMASK _IOWR('i', 21, struct ifreq) /* get net addr mask */
#define SIOCGIFNETMASK _IOWR('i', 37, struct ifreq) /* get net addr mask */
#define SIOCSIFNETMASK _IOW('i', 22, struct ifreq) /* set net addr mask */
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index b359bd4..bea235a 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1460,8 +1460,8 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
* Completion routine for asynchronous reads and writes from/to swap.
* Also called manually by synchronous code to finish up a bp.
*
- * For READ operations, the pages are PG_BUSY'd. For WRITE operations,
- * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY
+ * For READ operations, the pages are VPO_BUSY'd. For WRITE operations,
+ * the pages are vm_page_t->busy'd. For READ operations, we VPO_BUSY
* unbusy all pages except the 'main' request page. For WRITE
* operations, we vm_page_t->busy'd unbusy all pages ( we can do this
* because we marked them all VM_PAGER_PEND on return from putpages ).
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 2e0f001..40c317d 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -339,15 +339,11 @@ vmspace_dofree(struct vmspace *vm)
void
vmspace_free(struct vmspace *vm)
{
- int refcnt;
if (vm->vm_refcnt == 0)
panic("vmspace_free: attempt to free already freed vmspace");
- do
- refcnt = vm->vm_refcnt;
- while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
- if (refcnt == 1)
+ if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
vmspace_dofree(vm);
}
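atomic_fetchadd_int(9) returns the value the counter held before the addition, which is exactly what the replaced compare-and-swap loop computed; vm_refcnt also becomes volatile (next hunk) to match the primitive's volatile u_int * argument. The idiom generalizes to any last-reference test, e.g. this sketch:

    #include <machine/atomic.h>

    /* Returns non-zero for the caller that dropped the final
     * reference and must free the object. */
    static __inline int
    refcount_release_model(volatile u_int *countp)
    {
            return (atomic_fetchadd_int(countp, -1) == 1);
    }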
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 18a1edf..8715b41 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -236,7 +236,7 @@ struct vmspace {
caddr_t vm_taddr; /* (c) user virtual address of text */
caddr_t vm_daddr; /* (c) user virtual address of data */
caddr_t vm_maxsaddr; /* user VA at max stack growth */
- int vm_refcnt; /* number of references */
+ volatile int vm_refcnt; /* number of references */
/*
* Keep the PMAP last, so that CPU-specific variations of that
* structure on a single architecture don't result in offset
diff --git a/sys/xen/blkif.h b/sys/xen/blkif.h
new file mode 100644
index 0000000..48b71ea
--- /dev/null
+++ b/sys/xen/blkif.h
@@ -0,0 +1,145 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol. Used to generate ring structs which contain
+ * the elements common to all protocols only. This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places. */
+struct blkif_common_request {
+ char dummy;
+};
+struct blkif_common_response {
+ char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+};
+struct blkif_x86_32_response {
+ uint64_t id; /* copied from request */
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t __attribute__((__aligned__(8))) id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+};
+struct blkif_x86_64_response {
+ uint64_t __attribute__((__aligned__(8))) id;
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+/*
+ * Maximum number of requests that can be active for a given instance
+ * regardless of the protocol in use, based on the ring size. This constant
+ * facilitates resource pre-allocation in backend drivers since the size is
+ * known well in advance of attaching to a front end.
+ */
+#define BLKIF_MAX_RING_REQUESTS(_sz) \
+ MAX(__RING_SIZE((blkif_x86_64_sring_t *)NULL, _sz), \
+ MAX(__RING_SIZE((blkif_x86_32_sring_t *)NULL, _sz), \
+ __RING_SIZE((blkif_sring_t *)NULL, _sz)))
+
+/*
+ * The number of ring pages required to support a given number of requests
+ * for a given instance regardless of the protocol in use.
+ */
+#define BLKIF_RING_PAGES(_entries) \
+ MAX(__RING_PAGES((blkif_x86_64_sring_t *)NULL, _entries), \
+ MAX(__RING_PAGES((blkif_x86_32_sring_t *)NULL, _entries), \
+ __RING_PAGES((blkif_sring_t *)NULL, _entries)))
+
+union blkif_back_rings {
+ blkif_back_ring_t native;
+ blkif_common_back_ring_t common;
+ blkif_x86_32_back_ring_t x86_32;
+ blkif_x86_64_back_ring_t x86_64;
+};
+typedef union blkif_back_rings blkif_back_rings_t;
+
+enum blkif_protocol {
+ BLKIF_PROTOCOL_NATIVE = 1,
+ BLKIF_PROTOCOL_X86_32 = 2,
+ BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->sector_number = src->sector_number;
+ barrier();
+ if (n > dst->nr_segments)
+ n = dst->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->seg[i] = src->seg[i];
+}
+
+static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->sector_number = src->sector_number;
+ barrier();
+ if (n > dst->nr_segments)
+ n = dst->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
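A hypothetical backend using the sizing macro to pre-allocate request contexts before the front end's ABI is negotiated (the XBB_* names are invented for illustration):

    /* Worst case over the native, x86_32 and x86_64 ring layouts. */
    #define XBB_RING_PAGES          1       /* assume a one-page ring */
    #define XBB_MAX_REQUESTS \
            BLKIF_MAX_RING_REQUESTS(XBB_RING_PAGES * PAGE_SIZE)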
diff --git a/sys/xen/evtchn/evtchn.c b/sys/xen/evtchn/evtchn.c
index f280d12..3832277 100644
--- a/sys/xen/evtchn/evtchn.c
+++ b/sys/xen/evtchn/evtchn.c
@@ -492,15 +492,15 @@ bind_listening_port_to_irqhandler(unsigned int remote_domain,
int
bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
unsigned int remote_port, const char *devname,
- driver_filter_t filter, driver_intr_t handler,
- unsigned long irqflags, unsigned int *irqp)
+ driver_intr_t handler, void *arg, unsigned long irqflags,
+ unsigned int *irqp)
{
unsigned int irq;
int error;
irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
intr_register_source(&xp->xp_pins[irq].xp_intsrc);
- error = intr_add_handler(devname, irq, filter, handler, NULL,
+ error = intr_add_handler(devname, irq, NULL, handler, arg,
irqflags, &xp->xp_pins[irq].xp_cookie);
if (error) {
unbind_from_irq(irq);
diff --git a/sys/xen/gnttab.c b/sys/xen/gnttab.c
index ae44e8f..4ece182 100644
--- a/sys/xen/gnttab.c
+++ b/sys/xen/gnttab.c
@@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$");
/* External tools reserve first few grant table entries. */
#define NR_RESERVED_ENTRIES 8
-#define GNTTAB_LIST_END 0xffffffff
#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t))
static grant_ref_t **gnttab_list;
@@ -66,7 +65,7 @@ get_free_entries(int count, int *entries)
{
int ref, error;
grant_ref_t head;
-
+
mtx_lock(&gnttab_list_lock);
if ((gnttab_free_count < count) &&
((error = gnttab_expand(count - gnttab_free_count)) != 0)) {
@@ -79,7 +78,7 @@ get_free_entries(int count, int *entries)
head = gnttab_entry(head);
gnttab_free_head = gnttab_entry(head);
gnttab_entry(head) = GNTTAB_LIST_END;
- mtx_unlock(&gnttab_list_lock);
+ mtx_unlock(&gnttab_list_lock);
*entries = ref;
return (0);
@@ -122,7 +121,7 @@ put_free_entry(grant_ref_t ref)
gnttab_free_head = ref;
gnttab_free_count++;
check_free_callbacks();
- mtx_unlock(&gnttab_list_lock);
+ mtx_unlock(&gnttab_list_lock);
}
/*
@@ -136,7 +135,7 @@ gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly,
int error, ref;
error = get_free_entries(1, &ref);
-
+
if (unlikely(error))
return (error);
@@ -166,9 +165,9 @@ int
gnttab_query_foreign_access(grant_ref_t ref)
{
uint16_t nflags;
-
+
nflags = shared[ref].flags;
-
+
return (nflags & (GTF_reading|GTF_writing));
}
@@ -180,7 +179,7 @@ gnttab_end_foreign_access_ref(grant_ref_t ref)
nflags = shared[ref].flags;
do {
if ( (flags = nflags) & (GTF_reading|GTF_writing) ) {
- printf("WARNING: g.e. still in use!\n");
+ printf("%s: WARNING: g.e. still in use!\n", __func__);
return (0);
}
} while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) !=
@@ -201,7 +200,44 @@ gnttab_end_foreign_access(grant_ref_t ref, void *page)
else {
/* XXX This needs to be fixed so that the ref and page are
placed on a list to be freed up later. */
- printf("WARNING: leaking g.e. and page still in use!\n");
+ printf("%s: WARNING: leaking g.e. and page still in use!\n",
+ __func__);
+ }
+}
+
+void
+gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs)
+{
+ grant_ref_t *last_ref;
+ grant_ref_t head;
+ grant_ref_t tail;
+
+ head = GNTTAB_LIST_END;
+ tail = GNTTAB_LIST_END;
+ last_ref = refs + count;
+ while (refs != last_ref) {
+
+ if (gnttab_end_foreign_access_ref(*refs)) {
+ /* The first freed ref becomes the tail of the chain. */
+ if (head == GNTTAB_LIST_END)
+ tail = *refs;
+ gnttab_entry(*refs) = head;
+ head = *refs;
+ } else {
+ /*
+ * XXX This needs to be fixed so that the ref
+ * is placed on a list to be freed up later.
+ */
+ printf("%s: WARNING: leaking g.e. still in use!\n",
+ __func__);
+ count--;
+ }
+ refs++;
+ }
+
+ if (count != 0) {
+ mtx_lock(&gnttab_list_lock);
+ gnttab_free_count += count;
+ gnttab_entry(tail) = gnttab_free_head;
+ gnttab_free_head = head;
+ mtx_unlock(&gnttab_list_lock);
}
}
@@ -216,7 +252,7 @@ gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn,
return (error);
gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
-
+
*result = ref;
return (0);
}
@@ -282,16 +318,16 @@ gnttab_free_grant_references(grant_ref_t head)
{
grant_ref_t ref;
int count = 1;
-
+
if (head == GNTTAB_LIST_END)
return;
-
- mtx_lock(&gnttab_list_lock);
+
ref = head;
while (gnttab_entry(ref) != GNTTAB_LIST_END) {
ref = gnttab_entry(ref);
count++;
}
+ mtx_lock(&gnttab_list_lock);
gnttab_entry(ref) = gnttab_free_head;
gnttab_free_head = head;
gnttab_free_count += count;
@@ -403,7 +439,7 @@ grow_gnttab_list(unsigned int more_frames)
check_free_callbacks();
return (0);
-
+
grow_nomem:
for ( ; i >= nr_grant_frames; i--)
free(gnttab_list[i], M_DEVBUF);
@@ -490,7 +526,7 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx)
if (shared == NULL) {
vm_offset_t area;
-
+
area = kmem_alloc_nofault(kernel_map,
PAGE_SIZE * max_nr_grant_frames());
KASSERT(area, ("can't allocate VM space for grant table"));
@@ -502,7 +538,7 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx)
((vm_paddr_t)frames[i]) << PAGE_SHIFT | PG_RW | PG_V);
free(frames, M_DEVBUF);
-
+
return (0);
}
@@ -517,7 +553,7 @@ gnttab_resume(void)
int
gnttab_suspend(void)
-{
+{
int i;
for (i = 0; i < nr_grant_frames; i++)
@@ -532,7 +568,8 @@ gnttab_suspend(void)
static vm_paddr_t resume_frames;
-static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+static int
+gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct xen_add_to_physmap xatp;
unsigned int i = end_idx;
@@ -552,7 +589,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
if (shared == NULL) {
vm_offset_t area;
-
+
area = kmem_alloc_nofault(kernel_map,
PAGE_SIZE * max_nr_grant_frames());
KASSERT(area, ("can't allocate VM space for grant table"));
@@ -643,10 +680,10 @@ gnttab_init()
if (gnttab_list[i] == NULL)
goto ini_nomem;
}
-
+
if (gnttab_resume())
return (ENODEV);
-
+
nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
@@ -670,4 +707,3 @@ ini_nomem:
}
MTX_SYSINIT(gnttab, &gnttab_list_lock, "GNTTAB LOCK", MTX_DEF);
-//SYSINIT(gnttab, SI_SUB_PSEUDO, SI_ORDER_FIRST, gnttab_init, NULL);
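The new gnttab_end_foreign_access_references() amortizes locking: reclaimed references are stitched into a local list and spliced onto gnttab_free_head under a single gnttab_list_lock acquisition, instead of one put_free_entry() round trip per reference. A hypothetical completion path:

    static void
    io_complete(grant_ref_t *refs, u_int nrefs)
    {
            /*
             * Ends access for every reference in one call; entries
             * still in use by the remote domain are skipped (and
             * leaked, per the XXX in the implementation).
             */
            gnttab_end_foreign_access_references(nrefs, refs);
    }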
diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h
index 8348af5..1741ec3 100644
--- a/sys/xen/gnttab.h
+++ b/sys/xen/gnttab.h
@@ -43,6 +43,8 @@
#include <machine/xen/xen-os.h>
#include <xen/features.h>
+#define GNTTAB_LIST_END GRANT_REF_INVALID
+
struct gnttab_free_callback {
struct gnttab_free_callback *next;
void (*fn)(void *);
@@ -74,6 +76,13 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref);
*/
void gnttab_end_foreign_access(grant_ref_t ref, void *page);
+/*
+ * Eventually end access through the given array of grant references.
+ * Access will be ended immediately iff the grant entry is not in use,
+ * otherwise it will happen some time later.
+ */
+void gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs);
+
int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, grant_ref_t *result);
unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
diff --git a/sys/xen/interface/grant_table.h b/sys/xen/interface/grant_table.h
index 26f2c35..e76ca67 100644
--- a/sys/xen/interface/grant_table.h
+++ b/sys/xen/interface/grant_table.h
@@ -159,6 +159,8 @@ typedef struct grant_entry grant_entry_t;
*/
typedef uint32_t grant_ref_t;
+#define GRANT_REF_INVALID 0xffffffff
+
/*
* Handle to track a mapping created via a grant reference.
*/
diff --git a/sys/xen/interface/hvm/params.h b/sys/xen/interface/hvm/params.h
index 6befa78..d846731 100644
--- a/sys/xen/interface/hvm/params.h
+++ b/sys/xen/interface/hvm/params.h
@@ -95,4 +95,30 @@
#define HVM_NR_PARAMS 15
+#ifdef XENHVM
+/**
+ * Retrieve an HVM setting from the hypervisor.
+ *
+ * \param index The index of the HVM parameter to retrieve.
+ *
+ * \return On error, 0. Otherwise the value of the requested parameter.
+ */
+static inline unsigned long
+hvm_get_parameter(int index)
+{
+ struct xen_hvm_param xhv;
+ int error;
+
+ xhv.domid = DOMID_SELF;
+ xhv.index = index;
+ error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
+ if (error) {
+ printf("hvm_get_parameter: failed to get %d, error %d\n",
+ index, error);
+ return (0);
+ }
+ return (xhv.value);
+}
+#endif
+
#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
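For example, an HVM guest can locate the XenStore shared page and event channel through two such parameters (HVM_PARAM_STORE_PFN and HVM_PARAM_STORE_EVTCHN, defined earlier in this header); a sketch:

    #ifdef XENHVM
    static inline void
    xen_hvm_find_xenstore(unsigned long *pfnp, unsigned long *evtchnp)
    {
            *pfnp = hvm_get_parameter(HVM_PARAM_STORE_PFN);
            *evtchnp = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
    }
    #endif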
diff --git a/sys/xen/interface/io/blkif.h b/sys/xen/interface/io/blkif.h
index 9e2d3d0..020936b 100644
--- a/sys/xen/interface/io/blkif.h
+++ b/sys/xen/interface/io/blkif.h
@@ -78,11 +78,19 @@
#define BLKIF_OP_FLUSH_DISKCACHE 3
/*
- * Maximum scatter/gather segments per request.
- * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
- * NB. This could be 12 if the ring indexes weren't stored in the same page.
+ * Maximum scatter/gather segments associated with a request header block.
*/
-#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+#define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK 11
+
+/*
+ * Maximum scatter/gather segments associated with a segment block.
+ */
+#define BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK 14
+
+/*
+ * Maximum scatter/gather segments per request (header + segment blocks).
+ */
+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 255
struct blkif_request_segment {
grant_ref_t gref; /* reference to I/O buffer frame */
@@ -90,6 +98,7 @@ struct blkif_request_segment {
/* @last_sect: last sector in frame to transfer (inclusive). */
uint8_t first_sect, last_sect;
};
+typedef struct blkif_request_segment blkif_request_segment_t;
struct blkif_request {
uint8_t operation; /* BLKIF_OP_??? */
@@ -97,7 +106,7 @@ struct blkif_request {
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
- struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
};
typedef struct blkif_request blkif_request_t;
@@ -124,10 +133,22 @@ typedef struct blkif_response blkif_response_t;
DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
+#define BLKRING_GET_SG_REQUEST(_r, _idx) \
+ ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx))
+
#define VDISK_CDROM 0x1
#define VDISK_REMOVABLE 0x2
#define VDISK_READONLY 0x4
+/*
+ * The number of ring request blocks required to handle an I/O
+ * request containing _segs segments.
+ */
+#define BLKIF_SEGS_TO_BLOCKS(_segs) \
+ (((((_segs) - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \
+ + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \
+ / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
/*
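A worked example of BLKIF_SEGS_TO_BLOCKS() at the new maximum:

    /*
     * For _segs = 255: the header block carries 11 segments, leaving
     * 244 for segment blocks of 14 each, so (244 + 13) / 14 = 18
     * segment blocks plus the header block: a maximal request spans
     * 19 ring slots.  For _segs <= 11 the expression yields 1, the
     * lone header block.
     */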
diff --git a/sys/xen/interface/io/protocols.h b/sys/xen/interface/io/protocols.h
index 77bd1bd..fd52934 100644
--- a/sys/xen/interface/io/protocols.h
+++ b/sys/xen/interface/io/protocols.h
@@ -26,6 +26,7 @@
#define XEN_IO_PROTO_ABI_X86_32 "x86_32-abi"
#define XEN_IO_PROTO_ABI_X86_64 "x86_64-abi"
#define XEN_IO_PROTO_ABI_IA64 "ia64-abi"
+#define XEN_IO_PROTO_ABI_POWERPC64 "powerpc64-abi"
#if defined(__i386__)
# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32
@@ -33,6 +34,8 @@
# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64
#elif defined(__ia64__)
# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64
+#elif defined(__powerpc64__)
+# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_POWERPC64
#else
# error arch fixup needed here
#endif
diff --git a/sys/xen/interface/io/ring.h b/sys/xen/interface/io/ring.h
index 6ce1d0d..6b7fd74 100644
--- a/sys/xen/interface/io/ring.h
+++ b/sys/xen/interface/io/ring.h
@@ -45,13 +45,29 @@ typedef unsigned int RING_IDX;
#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
/*
+ * The amount of space reserved in the shared ring for accounting information.
+ */
+#define __RING_HEADER_SIZE(_s) \
+ ((intptr_t)(_s)->ring - (intptr_t)(_s))
+
+/*
* Calculate size of a shared ring, given the total available space for the
* ring and indexes (_sz), and the name tag of the request/response structure.
* A ring contains as many entries as will fit, rounded down to the nearest
* power of two (so we can mask with (size-1) to loop around).
*/
#define __RING_SIZE(_s, _sz) \
- (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
+ (__RD32(((_sz) - __RING_HEADER_SIZE(_s)) / sizeof((_s)->ring[0])))
+
+/*
+ * The number of pages needed to support a given number of request/response
+ * entries. The entry count is rounded down to the nearest power of two
+ * as required by the ring macros.
+ */
+#define __RING_PAGES(_s, _entries) \
+ ((__RING_HEADER_SIZE(_s) \
+ + (__RD32(_entries) * sizeof((_s)->ring[0])) \
+ + PAGE_SIZE - 1) / PAGE_SIZE)
/*
* Macros to make the correct C datatypes for a new kind of ring.
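To make the arithmetic concrete, a worked example assuming the canonical x86 blkif values (64-byte sring header, 112-byte request/response union, 4 KB pages):

    /*
     * __RING_SIZE(s, 4096)  = __RD32((4096 - 64) / 112) = __RD32(36)
     *                       = 32 entries (round down to power of two).
     * __RING_PAGES(s, 32)   = (64 + 32 * 112 + 4095) / 4096 = 1 page.
     */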
diff --git a/sys/xen/interface/io/xenbus.h b/sys/xen/interface/io/xenbus.h
index 4a053df..5e24f31 100644
--- a/sys/xen/interface/io/xenbus.h
+++ b/sys/xen/interface/io/xenbus.h
@@ -36,6 +36,9 @@
enum xenbus_state {
XenbusStateUnknown = 0,
+ /*
+ * Initialising: Back-end is initialising.
+ */
XenbusStateInitialising = 1,
/*
@@ -49,6 +52,9 @@ enum xenbus_state {
*/
XenbusStateInitialised = 3,
+ /*
+ * Connected: The normal state for a front to backend connection.
+ */
XenbusStateConnected = 4,
/*
@@ -56,6 +62,9 @@ enum xenbus_state {
*/
XenbusStateClosing = 5,
+ /*
+ * Closed: No connection exists between front and back end.
+ */
XenbusStateClosed = 6,
/*
diff --git a/sys/xen/reboot.c b/sys/xen/reboot.c
deleted file mode 100644
index 04ba132..0000000
--- a/sys/xen/reboot.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- *
- * Copyright (c) 2004 Christian Limpach.
- * Copyright (c) 2004-2006,2008 Kip Macy
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by Christian Limpach.
- * 4. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/reboot.h>
-#include <sys/sched.h>
-#include <sys/smp.h>
-#include <sys/systm.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <xen/gnttab.h>
-#include <xen/xen_intr.h>
-#include <xen/xenbus/xenbusvar.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
-#ifdef XENHVM
-
-#include <dev/xen/xenpci/xenpcivar.h>
-
-#else
-
-static void xen_suspend(void);
-
-#endif
-
-static void
-shutdown_handler(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- char *str;
- struct xenbus_transaction xbt;
- int error, howto;
-
- howto = 0;
-
- again:
- error = xenbus_transaction_start(&xbt);
- if (error)
- return;
-
- error = xenbus_read(xbt, "control", "shutdown", NULL, (void **) &str);
-
- /* Ignore read errors and empty reads. */
- if (error || strlen(str) == 0) {
- xenbus_transaction_end(xbt, 1);
- return;
- }
-
- xenbus_write(xbt, "control", "shutdown", "");
-
- error = xenbus_transaction_end(xbt, 0);
- if (error == EAGAIN) {
- free(str, M_DEVBUF);
- goto again;
- }
-
- if (strcmp(str, "reboot") == 0)
- howto = 0;
- else if (strcmp(str, "poweroff") == 0)
- howto |= (RB_POWEROFF | RB_HALT);
- else if (strcmp(str, "halt") == 0)
-#ifdef XENHVM
- /*
- * We rely on acpi powerdown to halt the VM.
- */
- howto |= (RB_POWEROFF | RB_HALT);
-#else
- howto |= RB_HALT;
-#endif
- else if (strcmp(str, "suspend") == 0)
- howto = -1;
- else {
- printf("Ignoring shutdown request: %s\n", str);
- goto done;
- }
-
- if (howto == -1) {
- xen_suspend();
- goto done;
- }
-
- shutdown_nice(howto);
- done:
- free(str, M_DEVBUF);
-}
-
-#ifndef XENHVM
-
-/*
- * In HV mode, we let acpi take care of halts and reboots.
- */
-
-static void
-xen_shutdown_final(void *arg, int howto)
-{
-
- if (howto & (RB_HALT | RB_POWEROFF))
- HYPERVISOR_shutdown(SHUTDOWN_poweroff);
- else
- HYPERVISOR_shutdown(SHUTDOWN_reboot);
-}
-
-#endif
-
-static struct xenbus_watch shutdown_watch = {
- .node = "control/shutdown",
- .callback = shutdown_handler
-};
-
-static void
-setup_shutdown_watcher(void *unused)
-{
-
- if (register_xenbus_watch(&shutdown_watch))
- printf("Failed to set shutdown watcher\n");
-#ifndef XENHVM
- EVENTHANDLER_REGISTER(shutdown_final, xen_shutdown_final, NULL,
- SHUTDOWN_PRI_LAST);
-#endif
-}
-
-SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL);
-
-#ifndef XENHVM
-
-extern void xencons_suspend(void);
-extern void xencons_resume(void);
-
-static void
-xen_suspend()
-{
- int i, j, k, fpp;
- unsigned long max_pfn, start_info_mfn;
-
-#ifdef SMP
- cpumask_t map;
- /*
- * Bind us to CPU 0 and stop any other VCPUs.
- */
- thread_lock(curthread);
- sched_bind(curthread, 0);
- thread_unlock(curthread);
- KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0"));
-
- map = PCPU_GET(other_cpus) & ~stopped_cpus;
- if (map)
- stop_cpus(map);
-#endif
-
- if (DEVICE_SUSPEND(root_bus) != 0) {
- printf("xen_suspend: device_suspend failed\n");
-#ifdef SMP
- if (map)
- restart_cpus(map);
-#endif
- return;
- }
-
- local_irq_disable();
-
- xencons_suspend();
- gnttab_suspend();
-
- max_pfn = HYPERVISOR_shared_info->arch.max_pfn;
-
- void *shared_info = HYPERVISOR_shared_info;
- HYPERVISOR_shared_info = NULL;
- pmap_kremove((vm_offset_t) shared_info);
- PT_UPDATES_FLUSH();
-
- xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn);
- xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn);
-
- /*
- * We'll stop somewhere inside this hypercall. When it returns,
- * we'll start resuming after the restore.
- */
- start_info_mfn = VTOMFN(xen_start_info);
- pmap_suspend();
- HYPERVISOR_suspend(start_info_mfn);
- pmap_resume();
-
- pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info);
- HYPERVISOR_shared_info = shared_info;
-
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- VTOMFN(xen_pfn_to_mfn_frame_list_list);
-
- fpp = PAGE_SIZE/sizeof(unsigned long);
- for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
- if ((j % fpp) == 0) {
- k++;
- xen_pfn_to_mfn_frame_list_list[k] =
- VTOMFN(xen_pfn_to_mfn_frame_list[k]);
- j = 0;
- }
- xen_pfn_to_mfn_frame_list[k][j] =
- VTOMFN(&xen_phys_machine[i]);
- }
- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
-
- gnttab_resume();
- irq_resume();
- local_irq_enable();
- xencons_resume();
-
-#ifdef CONFIG_SMP
- for_each_cpu(i)
- vcpu_prepare(i);
-
-#endif
- /*
- * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
- * the VCPU hotplug callback can race with our vcpu_prepare
- */
- DEVICE_RESUME(root_bus);
-
-#ifdef SMP
- thread_lock(curthread);
- sched_unbind(curthread);
- thread_unlock(curthread);
- if (map)
- restart_cpus(map);
-#endif
-}
-
-#endif
diff --git a/sys/xen/xen_intr.h b/sys/xen/xen_intr.h
index 68f5943..2e753e6 100644
--- a/sys/xen/xen_intr.h
+++ b/sys/xen/xen_intr.h
@@ -76,7 +76,7 @@ extern int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu,
*/
extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
unsigned int remote_port, const char *devname,
- driver_filter_t filter, driver_intr_t handler,
+ driver_intr_t handler, void *arg,
unsigned long irqflags, unsigned int *irqp);
/*
diff --git a/sys/xen/xenbus/init.txt b/sys/xen/xenbus/init.txt
deleted file mode 100644
index 4249549..0000000
--- a/sys/xen/xenbus/init.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-- frontend driver initializes static xenbus_driver with _ids, _probe, _remove,
-_resume, _otherend_changed
-
- - initialization calls xenbus_register_frontend(xenbus_driver)
-
- - xenbus_register_frontend sets read_otherend details to read_backend_details
- then calls xenbus_register_driver_common(xenbus_driver, xenbus_frontend)
-
- - xenbus_register_driver_common sets underlying driver name to xenbus_driver name
- underlying driver bus to xenbus_frontend's bus, driver's probe to xenbus_dev_probe
- driver's remove to xenbus_dev_remove then calls driver_register
-
diff --git a/sys/xen/xenbus/xenbus_client.c b/sys/xen/xenbus/xenbus.c
index 740d664..c3e5fee 100644
--- a/sys/xen/xenbus/xenbus_client.c
+++ b/sys/xen/xenbus/xenbus.c
@@ -1,8 +1,4 @@
/******************************************************************************
- * Client-facing interface for the Xenbus driver. In other words, the
- * interface between the Xenbus and the device-specific code, be it the
- * frontend or the backend of that driver.
- *
* Copyright (C) 2005 XenSource Ltd
*
* This file may be distributed separately from the Linux kernel, or
@@ -27,6 +23,14 @@
* IN THE SOFTWARE.
*/
+/**
+ * \file xenbus.c
+ *
+ * \brief Client-facing interface for the Xenbus driver.
+ *
+ * In other words, the interface between the Xenbus and the device-specific
+ * code, be it the frontend or the backend of that driver.
+ */
#if 0
#define DPRINTK(fmt, args...) \
@@ -39,9 +43,12 @@
__FBSDID("$FreeBSD$");
#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/libkern.h>
+#include <sys/sbuf.h>
#include <machine/xen/xen-os.h>
#include <xen/hypervisor.h>
@@ -50,6 +57,34 @@ __FBSDID("$FreeBSD$");
#include <xen/xenbus/xenbusvar.h>
#include <machine/stdarg.h>
+MALLOC_DEFINE(M_XENBUS, "xenbus", "XenBus Support");
+
+/*------------------------- Private Functions --------------------------------*/
+/**
+ * \brief Construct the error path corresponding to the given XenBus
+ * device.
+ *
+ * \param dev The XenBus device for which we are constructing an error path.
+ *
+ * \return On success, the constructed error path. Otherwise NULL.
+ *
+ * It is the caller's responsibility to free any returned error path
+ * node using the M_XENBUS malloc type.
+ */
+static char *
+error_path(device_t dev)
+{
+ char *path_buffer = malloc(strlen("error/")
+ strlen(xenbus_get_node(dev)) + 1, M_XENBUS, M_WAITOK);
+
+ strcpy(path_buffer, "error/");
+ strcpy(path_buffer + strlen("error/"), xenbus_get_node(dev));
+
+ return (path_buffer);
+}
+
+/*--------------------------- Public Functions -------------------------------*/
+/*-------- API comments for these methods can be found in xenbusvar.h --------*/
const char *
xenbus_strstate(XenbusState state)
{
@@ -67,15 +102,15 @@ xenbus_strstate(XenbusState state)
}
int
-xenbus_watch_path(device_t dev, char *path, struct xenbus_watch *watch,
- void (*callback)(struct xenbus_watch *, const char **, unsigned int))
+xenbus_watch_path(device_t dev, char *path, struct xs_watch *watch,
+ xs_watch_cb_t *callback)
{
int error;
watch->node = path;
watch->callback = callback;
- error = register_xenbus_watch(watch);
+ error = xs_register_watch(watch);
if (error) {
watch->node = NULL;
@@ -88,12 +123,12 @@ xenbus_watch_path(device_t dev, char *path, struct xenbus_watch *watch,
int
xenbus_watch_path2(device_t dev, const char *path,
- const char *path2, struct xenbus_watch *watch,
- void (*callback)(struct xenbus_watch *, const char **, unsigned int))
+ const char *path2, struct xs_watch *watch,
+ xs_watch_cb_t *callback)
{
int error;
char *state = malloc(strlen(path) + 1 + strlen(path2) + 1,
- M_DEVBUF, M_WAITOK);
+ M_XENBUS, M_WAITOK);
strcpy(state, path);
strcat(state, "/");
@@ -101,46 +136,27 @@ xenbus_watch_path2(device_t dev, const char *path,
error = xenbus_watch_path(dev, state, watch, callback);
if (error) {
- free(state, M_DEVBUF);
+ free(state, M_XENBUS);
}
return (error);
}
-/**
- * Return the path to the error node for the given device, or NULL on failure.
- * If the value returned is non-NULL, then it is the caller's to kfree.
- */
-static char *
-error_path(device_t dev)
-{
- char *path_buffer = malloc(strlen("error/")
- + strlen(xenbus_get_node(dev)) + 1, M_DEVBUF, M_WAITOK);
-
- strcpy(path_buffer, "error/");
- strcpy(path_buffer + strlen("error/"), xenbus_get_node(dev));
-
- return (path_buffer);
-}
-
-
-static void
-_dev_error(device_t dev, int err, const char *fmt, va_list ap)
+void
+xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap)
{
int ret;
unsigned int len;
char *printf_buffer = NULL, *path_buffer = NULL;
#define PRINTF_BUFFER_SIZE 4096
- printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_DEVBUF, M_WAITOK);
+ printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_XENBUS, M_WAITOK);
len = sprintf(printf_buffer, "%i ", err);
ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
KASSERT(len + ret <= PRINTF_BUFFER_SIZE-1, ("xenbus error message too big"));
-#if 0
- dev_err(&dev->dev, "%s\n", printf_buffer);
-#endif
+ device_printf(dev, "Error %s\n", printf_buffer);
path_buffer = error_path(dev);
if (path_buffer == NULL) {
@@ -149,7 +165,7 @@ _dev_error(device_t dev, int err, const char *fmt, va_list ap)
goto fail;
}
- if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
+ if (xs_write(XST_NIL, path_buffer, "error", printf_buffer) != 0) {
printf("xenbus: failed to write error node for %s (%s)\n",
xenbus_get_node(dev), printf_buffer);
goto fail;
@@ -157,9 +173,9 @@ _dev_error(device_t dev, int err, const char *fmt, va_list ap)
fail:
if (printf_buffer)
- free(printf_buffer, M_DEVBUF);
+ free(printf_buffer, M_XENBUS);
if (path_buffer)
- free(path_buffer, M_DEVBUF);
+ free(path_buffer, M_XENBUS);
}
void
@@ -168,41 +184,45 @@ xenbus_dev_error(device_t dev, int err, const char *fmt, ...)
va_list ap;
va_start(ap, fmt);
- _dev_error(dev, err, fmt, ap);
+ xenbus_dev_verror(dev, err, fmt, ap);
va_end(ap);
}
void
+xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list ap)
+{
+ xenbus_dev_verror(dev, err, fmt, ap);
+ device_printf(dev, "Fatal error. Transitioning to Closing State\n");
+ xenbus_set_state(dev, XenbusStateClosing);
+}
+
+void
xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
- _dev_error(dev, err, fmt, ap);
+ xenbus_dev_vfatal(dev, err, fmt, ap);
va_end(ap);
-
- xenbus_set_state(dev, XenbusStateClosing);
}
int
-xenbus_grant_ring(device_t dev, unsigned long ring_mfn, int *refp)
+xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp)
{
int error;
- grant_ref_t ref;
error = gnttab_grant_foreign_access(
- xenbus_get_otherend_id(dev), ring_mfn, 0, &ref);
+ xenbus_get_otherend_id(dev), ring_mfn, 0, refp);
if (error) {
xenbus_dev_fatal(dev, error, "granting access to ring page");
return (error);
}
- *refp = ref;
return (0);
}
int
-xenbus_alloc_evtchn(device_t dev, int *port)
+xenbus_alloc_evtchn(device_t dev, evtchn_port_t *port)
{
struct evtchn_alloc_unbound alloc_unbound;
int err;
@@ -222,7 +242,7 @@ xenbus_alloc_evtchn(device_t dev, int *port)
}
int
-xenbus_free_evtchn(device_t dev, int port)
+xenbus_free_evtchn(device_t dev, evtchn_port_t port)
{
struct evtchn_close close;
int err;
@@ -240,12 +260,29 @@ xenbus_free_evtchn(device_t dev, int port)
XenbusState
xenbus_read_driver_state(const char *path)
{
- XenbusState result;
+ XenbusState result;
+ int error;
+
+ error = xs_gather(XST_NIL, path, "state", "%d", &result, NULL);
+ if (error)
+ result = XenbusStateClosed;
+
+ return (result);
+}
+
+int
+xenbus_dev_is_online(device_t dev)
+{
+ const char *path;
int error;
+ int value;
- error = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
- if (error)
- result = XenbusStateClosed;
+ path = xenbus_get_node(dev);
+ error = xs_gather(XST_NIL, path, "online", "%d", &value, NULL);
+ if (error != 0) {
+ /* Default to not online. */
+ value = 0;
+ }
- return (result);
+ return (value);
}
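A hypothetical front end registering a watch on its other end's state node with the renamed API (otherend_changed is an invented xs_watch_cb_t; xenbus_get_otherend_path() is the accessor from xenbusvar.h):

    static struct xs_watch otherend_watch;

    static int
    watch_otherend(device_t dev)
    {
            return (xenbus_watch_path2(dev,
                xenbus_get_otherend_path(dev), "state",
                &otherend_watch, otherend_changed));
    }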
diff --git a/sys/xen/xenbus/xenbus_comms.c b/sys/xen/xenbus/xenbus_comms.c
deleted file mode 100644
index 2f03955..0000000
--- a/sys/xen/xenbus/xenbus_comms.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/******************************************************************************
- * xenbus_comms.c
- *
- * Low level code to talks to Xen Store: ringbuffer and event channel.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/sx.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/syslog.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-
-#include <xen/xen_intr.h>
-#include <xen/evtchn.h>
-#include <xen/interface/io/xs_wire.h>
-#include <xen/xenbus/xenbus_comms.h>
-
-static unsigned int xenstore_irq;
-
-static inline struct xenstore_domain_interface *
-xenstore_domain_interface(void)
-{
-
- return (struct xenstore_domain_interface *)xen_store;
-}
-
-static void
-xb_intr(void * arg __attribute__((unused)))
-{
-
- wakeup(xen_store);
-}
-
-static int
-xb_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
-{
-
- return ((prod - cons) <= XENSTORE_RING_SIZE);
-}
-
-static void *
-xb_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
- char *buf, uint32_t *len)
-{
-
- *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
- if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
- *len = XENSTORE_RING_SIZE - (prod - cons);
- return (buf + MASK_XENSTORE_IDX(prod));
-}
-
-static const void *
-xb_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
- const char *buf, uint32_t *len)
-{
-
- *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
- if ((prod - cons) < *len)
- *len = prod - cons;
- return (buf + MASK_XENSTORE_IDX(cons));
-}
-
-int
-xb_write(const void *tdata, unsigned len, struct lock_object *lock)
-{
- struct xenstore_domain_interface *intf = xenstore_domain_interface();
- XENSTORE_RING_IDX cons, prod;
- const char *data = (const char *)tdata;
- int error;
-
- while (len != 0) {
- void *dst;
- unsigned int avail;
-
- while ((intf->req_prod - intf->req_cons)
- == XENSTORE_RING_SIZE) {
- error = _sleep(intf,
- lock,
- PCATCH, "xbwrite", hz/10);
- if (error && error != EWOULDBLOCK)
- return (error);
- }
-
- /* Read indexes, then verify. */
- cons = intf->req_cons;
- prod = intf->req_prod;
- mb();
- if (!xb_check_indexes(cons, prod)) {
- intf->req_cons = intf->req_prod = 0;
- return (EIO);
- }
-
- dst = xb_get_output_chunk(cons, prod, intf->req, &avail);
- if (avail == 0)
- continue;
- if (avail > len)
- avail = len;
- mb();
-
- memcpy(dst, data, avail);
- data += avail;
- len -= avail;
-
- /* Other side must not see new header until data is there. */
- wmb();
- intf->req_prod += avail;
-
- /* This implies mb() before other side sees interrupt. */
- notify_remote_via_evtchn(xen_store_evtchn);
- }
-
- return (0);
-}
-
-int
-xb_read(void *tdata, unsigned len, struct lock_object *lock)
-{
- struct xenstore_domain_interface *intf = xenstore_domain_interface();
- XENSTORE_RING_IDX cons, prod;
- char *data = (char *)tdata;
- int error;
-
- while (len != 0) {
- unsigned int avail;
- const char *src;
-
- while (intf->rsp_cons == intf->rsp_prod) {
- error = _sleep(intf, lock,
- PCATCH, "xbread", hz/10);
- if (error && error != EWOULDBLOCK)
- return (error);
- }
-
- /* Read indexes, then verify. */
- cons = intf->rsp_cons;
- prod = intf->rsp_prod;
- if (!xb_check_indexes(cons, prod)) {
- intf->rsp_cons = intf->rsp_prod = 0;
- return (EIO);
- }
-
- src = xb_get_input_chunk(cons, prod, intf->rsp, &avail);
- if (avail == 0)
- continue;
- if (avail > len)
- avail = len;
-
- /* We must read header before we read data. */
- rmb();
-
- memcpy(data, src, avail);
- data += avail;
- len -= avail;
-
- /* Other side must not see free space until we've copied out */
- mb();
- intf->rsp_cons += avail;
-
- /* Implies mb(): they will see new header. */
- notify_remote_via_evtchn(xen_store_evtchn);
- }
-
- return (0);
-}
-
-/* Set up interrupt handler off store event channel. */
-int
-xb_init_comms(void)
-{
- struct xenstore_domain_interface *intf = xenstore_domain_interface();
- int error;
-
- if (intf->rsp_prod != intf->rsp_cons) {
- log(LOG_WARNING, "XENBUS response ring is not quiescent "
- "(%08x:%08x): fixing up\n",
- intf->rsp_cons, intf->rsp_prod);
- intf->rsp_cons = intf->rsp_prod;
- }
-
- if (xenstore_irq)
- unbind_from_irqhandler(xenstore_irq);
-
- error = bind_caller_port_to_irqhandler(
- xen_store_evtchn, "xenbus",
- xb_intr, NULL, INTR_TYPE_NET, &xenstore_irq);
- if (error) {
- log(LOG_WARNING, "XENBUS request irq failed %i\n", error);
- return (error);
- }
-
- return (0);
-}
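
For readers following the removal: the ring-index discipline used above survives in the new xenstore code. A standalone sketch of the arithmetic, using the constants from xen/interface/io/xs_wire.h (the masking assumes a power-of-two ring size, which holds at 1024 bytes):

#include <stdint.h>

#define XENSTORE_RING_SIZE	1024	/* from xs_wire.h */
#define MASK_XENSTORE_IDX(idx)	((idx) & (XENSTORE_RING_SIZE - 1))

/* Bytes currently queued; unsigned subtraction is wrap-safe. */
static uint32_t
ring_bytes_used(uint32_t cons, uint32_t prod)
{
	return (prod - cons);
}

/* Largest contiguous run a producer may write starting at prod. */
static uint32_t
ring_contig_free(uint32_t cons, uint32_t prod)
{
	uint32_t to_end, total;

	to_end = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
	total = XENSTORE_RING_SIZE - ring_bytes_used(cons, prod);
	return (to_end < total ? to_end : total);
}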
diff --git a/sys/xen/xenbus/xenbus_comms.h b/sys/xen/xenbus/xenbus_comms.h
deleted file mode 100644
index fa47331..0000000
--- a/sys/xen/xenbus/xenbus_comms.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Private include for xenbus communications.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * $FreeBSD$
- */
-
-#ifndef _XENBUS_COMMS_H
-#define _XENBUS_COMMS_H
-
-struct sx;
-extern int xen_store_evtchn;
-extern char *xen_store;
-
-int xs_init(void);
-int xb_init_comms(void);
-
-/* Low level routines. */
-int xb_write(const void *data, unsigned len, struct lock_object *);
-int xb_read(void *data, unsigned len, struct lock_object *);
-extern int xenbus_running;
-
-char *kasprintf(const char *fmt, ...);
-
-
-#endif /* _XENBUS_COMMS_H */
diff --git a/sys/xen/xenbus/xenbus_if.m b/sys/xen/xenbus/xenbus_if.m
index 018a2bb..d671418 100644
--- a/sys/xen/xenbus/xenbus_if.m
+++ b/sys/xen/xenbus/xenbus_if.m
@@ -31,7 +31,15 @@
INTERFACE xenbus;
-METHOD int backend_changed {
- device_t dev;
- enum xenbus_state newstate;
+/**
+ * \brief Callback triggered when the state of the otherend
+ * of a split device changes.
+ *
+ * \param _dev NewBus device_t for this XenBus device whose otherend's
+ * state has changed.
+ * \param _newstate The new state of the otherend device.
+ */
+METHOD int otherend_changed {
+ device_t _dev;
+ enum xenbus_state _newstate;
};
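
A driver consuming the renamed method would implement the handler and list it in its method table, as sketched below; the handler body is illustrative, and the state transitions shown are the conventional ones rather than anything mandated by this interface.

/* Illustrative implementation of the renamed method. */
static int
xf_otherend_changed(device_t dev, enum xenbus_state newstate)
{
	switch (newstate) {
	case XenbusStateConnected:
		/* Peer is ready; bring the device online here. */
		break;
	case XenbusStateClosing:
	case XenbusStateClosed:
		xenbus_set_state(dev, XenbusStateClosed);
		break;
	default:
		break;
	}
	return (0);
}

/* Wired into the driver's method table: */
DEVMETHOD(xenbus_otherend_changed, xf_otherend_changed),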
diff --git a/sys/xen/xenbus/xenbus_probe.c b/sys/xen/xenbus/xenbus_probe.c
deleted file mode 100644
index b1e9a21..0000000
--- a/sys/xen/xenbus/xenbus_probe.c
+++ /dev/null
@@ -1,602 +0,0 @@
-/******************************************************************************
- * Talks to Xen Store to figure out what devices we have.
- *
- * Copyright (C) 2008 Doug Rabson
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- * Copyright (C) 2005 Mike Wray, Hewlett-Packard
- * Copyright (C) 2005 XenSource Ltd
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#if 0
-#define DPRINTK(fmt, args...) \
- printf("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
-#else
-#define DPRINTK(fmt, args...) ((void)0)
-#endif
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/sx.h>
-#include <sys/taskqueue.h>
-
-#include <machine/xen/xen-os.h>
-#include <machine/stdarg.h>
-
-#include <xen/gnttab.h>
-#include <xen/xenbus/xenbusvar.h>
-#include <xen/xenbus/xenbus_comms.h>
-
-struct xenbus_softc {
- struct xenbus_watch xs_devicewatch;
- struct task xs_probechildren;
- struct intr_config_hook xs_attachcb;
- device_t xs_dev;
-};
-
-struct xenbus_device_ivars {
- struct xenbus_watch xd_otherend_watch; /* must be first */
- struct sx xd_lock;
- device_t xd_dev;
- char *xd_node; /* node name in xenstore */
- char *xd_type; /* xen device type */
- enum xenbus_state xd_state;
- int xd_otherend_id;
- char *xd_otherend_path;
-};
-
-/* Simplified asprintf. */
-char *
-kasprintf(const char *fmt, ...)
-{
- va_list ap;
- unsigned int len;
- char *p, dummy[1];
-
- va_start(ap, fmt);
- /* FIXME: vsnprintf has a bug, NULL should work */
- len = vsnprintf(dummy, 0, fmt, ap);
- va_end(ap);
-
- p = malloc(len + 1, M_DEVBUF, M_WAITOK);
- va_start(ap, fmt);
- vsprintf(p, fmt, ap);
- va_end(ap);
- return p;
-}
-
-static void
-xenbus_identify(driver_t *driver, device_t parent)
-{
-
- BUS_ADD_CHILD(parent, 0, "xenbus", 0);
-}
-
-static int
-xenbus_probe(device_t dev)
-{
- int err = 0;
-
- DPRINTK("");
-
- /* Initialize the interface to xenstore. */
- err = xs_init();
- if (err) {
- log(LOG_WARNING,
- "XENBUS: Error initializing xenstore comms: %i\n", err);
- return (ENXIO);
- }
- err = gnttab_init();
- if (err) {
- log(LOG_WARNING,
- "XENBUS: Error initializing grant table: %i\n", err);
- return (ENXIO);
- }
- device_set_desc(dev, "Xen Devices");
-
- return (0);
-}
-
-static enum xenbus_state
-xenbus_otherend_state(struct xenbus_device_ivars *ivars)
-{
-
- return (xenbus_read_driver_state(ivars->xd_otherend_path));
-}
-
-static void
-xenbus_backend_changed(struct xenbus_watch *watch, const char **vec,
- unsigned int len)
-{
- struct xenbus_device_ivars *ivars;
- device_t dev;
- enum xenbus_state newstate;
-
- ivars = (struct xenbus_device_ivars *) watch;
- dev = ivars->xd_dev;
-
- if (!ivars->xd_otherend_path
- || strncmp(ivars->xd_otherend_path, vec[XS_WATCH_PATH],
- strlen(ivars->xd_otherend_path)))
- return;
-
- newstate = xenbus_otherend_state(ivars);
- XENBUS_BACKEND_CHANGED(dev, newstate);
-}
-
-static int
-xenbus_device_exists(device_t dev, const char *node)
-{
- device_t *kids;
- struct xenbus_device_ivars *ivars;
- int i, count, result;
-
- if (device_get_children(dev, &kids, &count))
- return (FALSE);
-
- result = FALSE;
- for (i = 0; i < count; i++) {
- ivars = device_get_ivars(kids[i]);
- if (!strcmp(ivars->xd_node, node)) {
- result = TRUE;
- break;
- }
- }
- free(kids, M_TEMP);
-
- return (result);
-}
-
-static int
-xenbus_add_device(device_t dev, const char *bus,
- const char *type, const char *id)
-{
- device_t child;
- struct xenbus_device_ivars *ivars;
- enum xenbus_state state;
- char *statepath;
- int error;
-
- ivars = malloc(sizeof(struct xenbus_device_ivars),
- M_DEVBUF, M_ZERO|M_WAITOK);
- ivars->xd_node = kasprintf("%s/%s/%s", bus, type, id);
-
- if (xenbus_device_exists(dev, ivars->xd_node)) {
- /*
- * We are already tracking this node
- */
- free(ivars->xd_node, M_DEVBUF);
- free(ivars, M_DEVBUF);
- return (0);
- }
-
- state = xenbus_read_driver_state(ivars->xd_node);
-
- if (state != XenbusStateInitialising) {
- /*
- * Device is not new, so ignore it. This can
- * happen if a device is going away after
- * switching to Closed.
- */
- free(ivars->xd_node, M_DEVBUF);
- free(ivars, M_DEVBUF);
- return (0);
- }
-
- /*
- * Find the backend details
- */
- error = xenbus_gather(XBT_NIL, ivars->xd_node,
- "backend-id", "%i", &ivars->xd_otherend_id,
- "backend", NULL, &ivars->xd_otherend_path,
- NULL);
- if (error)
- return (error);
-
- sx_init(&ivars->xd_lock, "xdlock");
- ivars->xd_type = strdup(type, M_DEVBUF);
- ivars->xd_state = XenbusStateInitialising;
-
- statepath = malloc(strlen(ivars->xd_otherend_path)
- + strlen("/state") + 1, M_DEVBUF, M_WAITOK);
- sprintf(statepath, "%s/state", ivars->xd_otherend_path);
-
- ivars->xd_otherend_watch.node = statepath;
- ivars->xd_otherend_watch.callback = xenbus_backend_changed;
-
- child = device_add_child(dev, NULL, -1);
- ivars->xd_dev = child;
- device_set_ivars(child, ivars);
-
- return (0);
-}
-
-static int
-xenbus_enumerate_type(device_t dev, const char *bus, const char *type)
-{
- char **dir;
- unsigned int i, count;
- int error;
-
- error = xenbus_directory(XBT_NIL, bus, type, &count, &dir);
- if (error)
- return (error);
- for (i = 0; i < count; i++)
- xenbus_add_device(dev, bus, type, dir[i]);
-
- free(dir, M_DEVBUF);
-
- return (0);
-}
-
-static int
-xenbus_enumerate_bus(device_t dev, const char *bus)
-{
- char **dir;
- unsigned int i, count;
- int error;
-
- error = xenbus_directory(XBT_NIL, bus, "", &count, &dir);
- if (error)
- return (error);
- for (i = 0; i < count; i++) {
- xenbus_enumerate_type(dev, bus, dir[i]);
- }
- free(dir, M_DEVBUF);
-
- return (0);
-}
-
-static int
-xenbus_probe_children(device_t dev)
-{
- device_t *kids;
- struct xenbus_device_ivars *ivars;
- int i, count;
-
- /*
- * Probe any new devices and register watches for any that
- * attach successfully. Since part of the protocol which
- * establishes a connection with the other end is interrupt
- * driven, we sleep until the device reaches a stable state
- * (closed or connected).
- */
- if (device_get_children(dev, &kids, &count) == 0) {
- for (i = 0; i < count; i++) {
- if (device_get_state(kids[i]) != DS_NOTPRESENT)
- continue;
-
- if (device_probe_and_attach(kids[i]))
- continue;
- ivars = device_get_ivars(kids[i]);
- register_xenbus_watch(
- &ivars->xd_otherend_watch);
- sx_xlock(&ivars->xd_lock);
- while (ivars->xd_state != XenbusStateClosed
- && ivars->xd_state != XenbusStateConnected)
- sx_sleep(&ivars->xd_state, &ivars->xd_lock,
- 0, "xdattach", 0);
- sx_xunlock(&ivars->xd_lock);
- }
- free(kids, M_TEMP);
- }
-
- return (0);
-}
-
-static void
-xenbus_probe_children_cb(void *arg, int pending)
-{
- device_t dev = (device_t) arg;
-
- xenbus_probe_children(dev);
-}
-
-static void
-xenbus_devices_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- struct xenbus_softc *sc = (struct xenbus_softc *) watch;
- device_t dev = sc->xs_dev;
- char *node, *bus, *type, *id, *p;
-
- node = strdup(vec[XS_WATCH_PATH], M_DEVBUF);
- p = strchr(node, '/');
- if (!p)
- goto out;
- bus = node;
- *p = 0;
- type = p + 1;
-
- p = strchr(type, '/');
- if (!p)
- goto out;
- *p = 0;
- id = p + 1;
-
- p = strchr(id, '/');
- if (p)
- *p = 0;
-
- xenbus_add_device(dev, bus, type, id);
- taskqueue_enqueue(taskqueue_thread, &sc->xs_probechildren);
-out:
- free(node, M_DEVBUF);
-}
-
-static void
-xenbus_attach_deferred(void *arg)
-{
- device_t dev = (device_t) arg;
- struct xenbus_softc *sc = device_get_softc(dev);
- int error;
-
- error = xenbus_enumerate_bus(dev, "device");
- if (error)
- return;
- xenbus_probe_children(dev);
-
- sc->xs_dev = dev;
- sc->xs_devicewatch.node = "device";
- sc->xs_devicewatch.callback = xenbus_devices_changed;
-
- TASK_INIT(&sc->xs_probechildren, 0, xenbus_probe_children_cb, dev);
-
- register_xenbus_watch(&sc->xs_devicewatch);
-
- config_intrhook_disestablish(&sc->xs_attachcb);
-}
-
-static int
-xenbus_attach(device_t dev)
-{
- struct xenbus_softc *sc = device_get_softc(dev);
-
- sc->xs_attachcb.ich_func = xenbus_attach_deferred;
- sc->xs_attachcb.ich_arg = dev;
- config_intrhook_establish(&sc->xs_attachcb);
-
- return (0);
-}
-
-static int
-xenbus_suspend(device_t dev)
-{
- int error;
-
- DPRINTK("");
-
- error = bus_generic_suspend(dev);
- if (error)
- return (error);
-
- xs_suspend();
-
- return (0);
-}
-
-static int
-xenbus_resume(device_t dev)
-{
- device_t *kids;
- struct xenbus_device_ivars *ivars;
- int i, count, error;
- char *statepath;
-
- xb_init_comms();
- xs_resume();
-
- /*
- * We must re-examine each device and find the new path for
- * its backend.
- */
- if (device_get_children(dev, &kids, &count) == 0) {
- for (i = 0; i < count; i++) {
- if (device_get_state(kids[i]) == DS_NOTPRESENT)
- continue;
-
- ivars = device_get_ivars(kids[i]);
-
- unregister_xenbus_watch(
- &ivars->xd_otherend_watch);
- ivars->xd_state = XenbusStateInitialising;
-
- /*
- * Find the new backend details and
- * re-register our watch.
- */
- free(ivars->xd_otherend_path, M_DEVBUF);
- error = xenbus_gather(XBT_NIL, ivars->xd_node,
- "backend-id", "%i", &ivars->xd_otherend_id,
- "backend", NULL, &ivars->xd_otherend_path,
- NULL);
- if (error)
- return (error);
-
- DEVICE_RESUME(kids[i]);
-
- statepath = malloc(strlen(ivars->xd_otherend_path)
- + strlen("/state") + 1, M_DEVBUF, M_WAITOK);
- sprintf(statepath, "%s/state", ivars->xd_otherend_path);
-
- free(ivars->xd_otherend_watch.node, M_DEVBUF);
- ivars->xd_otherend_watch.node = statepath;
- register_xenbus_watch(
- &ivars->xd_otherend_watch);
-
-#if 0
- /*
- * Can't do this yet since we are running in
- * the xenwatch thread and if we sleep here,
- * we will stop delivering watch notifications
- * and the device will never come back online.
- */
- sx_xlock(&ivars->xd_lock);
- while (ivars->xd_state != XenbusStateClosed
- && ivars->xd_state != XenbusStateConnected)
- sx_sleep(&ivars->xd_state, &ivars->xd_lock,
- 0, "xdresume", 0);
- sx_xunlock(&ivars->xd_lock);
-#endif
- }
- free(kids, M_TEMP);
- }
-
- return (0);
-}
-
-static int
-xenbus_print_child(device_t dev, device_t child)
-{
- struct xenbus_device_ivars *ivars = device_get_ivars(child);
- int retval = 0;
-
- retval += bus_print_child_header(dev, child);
- retval += printf(" at %s", ivars->xd_node);
- retval += bus_print_child_footer(dev, child);
-
- return (retval);
-}
-
-static int
-xenbus_read_ivar(device_t dev, device_t child, int index,
- uintptr_t * result)
-{
- struct xenbus_device_ivars *ivars = device_get_ivars(child);
-
- switch (index) {
- case XENBUS_IVAR_NODE:
- *result = (uintptr_t) ivars->xd_node;
- return (0);
-
- case XENBUS_IVAR_TYPE:
- *result = (uintptr_t) ivars->xd_type;
- return (0);
-
- case XENBUS_IVAR_STATE:
- *result = (uintptr_t) ivars->xd_state;
- return (0);
-
- case XENBUS_IVAR_OTHEREND_ID:
- *result = (uintptr_t) ivars->xd_otherend_id;
- return (0);
-
- case XENBUS_IVAR_OTHEREND_PATH:
- *result = (uintptr_t) ivars->xd_otherend_path;
- return (0);
- }
-
- return (ENOENT);
-}
-
-static int
-xenbus_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
-{
- struct xenbus_device_ivars *ivars = device_get_ivars(child);
- enum xenbus_state newstate;
- int currstate;
- int error;
-
- switch (index) {
- case XENBUS_IVAR_STATE:
- newstate = (enum xenbus_state) value;
- sx_xlock(&ivars->xd_lock);
- if (ivars->xd_state == newstate)
- goto out;
-
- error = xenbus_scanf(XBT_NIL, ivars->xd_node, "state",
- NULL, "%d", &currstate);
- if (error)
- goto out;
-
- error = xenbus_printf(XBT_NIL, ivars->xd_node, "state",
- "%d", newstate);
- if (error) {
- if (newstate != XenbusStateClosing) /* Avoid looping */
- xenbus_dev_fatal(dev, error, "writing new state");
- goto out;
- }
- ivars->xd_state = newstate;
- wakeup(&ivars->xd_state);
- out:
- sx_xunlock(&ivars->xd_lock);
- return (0);
-
- case XENBUS_IVAR_NODE:
- case XENBUS_IVAR_TYPE:
- case XENBUS_IVAR_OTHEREND_ID:
- case XENBUS_IVAR_OTHEREND_PATH:
- /*
- * These variables are read-only.
- */
- return (EINVAL);
- }
-
- return (ENOENT);
-}
-
-SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
-SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xen_store_evtchn, 0, "");
-SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
-
-static device_method_t xenbus_methods[] = {
- /* Device interface */
- DEVMETHOD(device_identify, xenbus_identify),
- DEVMETHOD(device_probe, xenbus_probe),
- DEVMETHOD(device_attach, xenbus_attach),
- DEVMETHOD(device_detach, bus_generic_detach),
- DEVMETHOD(device_shutdown, bus_generic_shutdown),
- DEVMETHOD(device_suspend, xenbus_suspend),
- DEVMETHOD(device_resume, xenbus_resume),
-
- /* Bus interface */
- DEVMETHOD(bus_print_child, xenbus_print_child),
- DEVMETHOD(bus_read_ivar, xenbus_read_ivar),
- DEVMETHOD(bus_write_ivar, xenbus_write_ivar),
-
- { 0, 0 }
-};
-
-static char driver_name[] = "xenbus";
-static driver_t xenbus_driver = {
- driver_name,
- xenbus_methods,
- sizeof(struct xenbus_softc),
-};
-devclass_t xenbus_devclass;
-
-#ifdef XENHVM
-DRIVER_MODULE(xenbus, xenpci, xenbus_driver, xenbus_devclass, 0, 0);
-#else
-DRIVER_MODULE(xenbus, nexus, xenbus_driver, xenbus_devclass, 0, 0);
-#endif
diff --git a/sys/xen/xenbus/xenbus_probe_backend.c b/sys/xen/xenbus/xenbus_probe_backend.c
deleted file mode 100644
index 20cc49f..0000000
--- a/sys/xen/xenbus/xenbus_probe_backend.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/******************************************************************************
- * Talks to Xen Store to figure out what devices we have (backend half).
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- * Copyright (C) 2005 Mike Wray, Hewlett-Packard
- * Copyright (C) 2005, 2006 XenSource Ltd
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#if 0
-#define DPRINTK(fmt, args...) \
- printf("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
-#else
-#define DPRINTK(fmt, args...) ((void)0)
-#endif
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/cdefs.h>
-#include <sys/time.h>
-#include <sys/sema.h>
-#include <sys/eventhandler.h>
-#include <sys/errno.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/conf.h>
-#include <sys/systm.h>
-#include <sys/syslog.h>
-#include <sys/proc.h>
-#include <sys/bus.h>
-#include <sys/sx.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <machine/xen/xenbus.h>
-#include <machine/stdarg.h>
-
-#include <xen/evtchn.h>
-#include <xen/xenbus/xenbus_comms.h>
-
-#define BUG_ON PANIC_IF
-#define semaphore sema
-#define rw_semaphore sema
-#define DEFINE_SPINLOCK(lock) struct mtx lock
-#define DECLARE_MUTEX(lock) struct sema lock
-#define u32 uint32_t
-#define list_del(head, ent) TAILQ_REMOVE(head, ent, list)
-#define simple_strtoul strtoul
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-#define list_empty TAILQ_EMPTY
-
-extern struct xendev_list_head xenbus_device_backend_list;
-#if 0
-static int xenbus_uevent_backend(struct device *dev, char **envp,
- int num_envp, char *buffer, int buffer_size);
-#endif
-static int xenbus_probe_backend(const char *type, const char *domid);
-
-static int read_frontend_details(struct xenbus_device *xendev)
-{
- return read_otherend_details(xendev, "frontend-id", "frontend");
-}
-
-/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
-static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
-{
- int domid, err;
- const char *devid, *type, *frontend;
- unsigned int typelen;
-
- type = strchr(nodename, '/');
- if (!type)
- return -EINVAL;
- type++;
- typelen = strcspn(type, "/");
- if (!typelen || type[typelen] != '/')
- return -EINVAL;
-
- devid = strrchr(nodename, '/') + 1;
-
- err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
- "frontend", NULL, &frontend,
- NULL);
- if (err)
- return err;
- if (strlen(frontend) == 0)
- err = -ERANGE;
- if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
- err = -ENOENT;
- kfree(frontend);
-
- if (err)
- return err;
-
- if (snprintf(bus_id, BUS_ID_SIZE,
- "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
- return -ENOSPC;
- return 0;
-}
-
-static struct xen_bus_type xenbus_backend = {
- .root = "backend",
- .levels = 3, /* backend/type/<frontend>/<id> */
- .get_bus_id = backend_bus_id,
- .probe = xenbus_probe_backend,
- .bus = &xenbus_device_backend_list,
-
-#if 0
- .error = -ENODEV,
- .bus = {
- .name = "xen-backend",
- .match = xenbus_match,
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
-// .shutdown = xenbus_dev_shutdown,
- .uevent = xenbus_uevent_backend,
- },
- .dev = {
- .bus_id = "xen-backend",
- },
-#endif
-};
-
-#if 0
-static int xenbus_uevent_backend(struct device *dev, char **envp,
- int num_envp, char *buffer, int buffer_size)
-{
- struct xenbus_device *xdev;
- struct xenbus_driver *drv;
- int i = 0;
- int length = 0;
-
- DPRINTK("");
-
- if (dev == NULL)
- return -ENODEV;
-
- xdev = to_xenbus_device(dev);
- if (xdev == NULL)
- return -ENODEV;
-
- /* stuff we want to pass to /sbin/hotplug */
- add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
- "XENBUS_TYPE=%s", xdev->devicetype);
-
- add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
- "XENBUS_PATH=%s", xdev->nodename);
-
- add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
- "XENBUS_BASE_PATH=%s", xenbus_backend.root);
-
- /* terminate, set to next free slot, shrink available space */
- envp[i] = NULL;
- envp = &envp[i];
- num_envp -= i;
- buffer = &buffer[length];
- buffer_size -= length;
-
- if (dev->driver) {
- drv = to_xenbus_driver(dev->driver);
- if (drv && drv->uevent)
- return drv->uevent(xdev, envp, num_envp, buffer,
- buffer_size);
- }
-
- return 0;
-}
-#endif
-
-int xenbus_register_backend(struct xenbus_driver *drv)
-{
- drv->read_otherend_details = read_frontend_details;
-
- return xenbus_register_driver_common(drv, &xenbus_backend);
-}
-
-/* backend/<typename>/<frontend-uuid>/<name> */
-static int xenbus_probe_backend_unit(const char *dir,
- const char *type,
- const char *name)
-{
- char *nodename;
- int err;
-
- nodename = kasprintf("%s/%s", dir, name);
- if (!nodename)
- return -ENOMEM;
-
- DPRINTK("%s\n", nodename);
-
- err = xenbus_probe_node(&xenbus_backend, type, nodename);
- kfree(nodename);
- return err;
-}
-
-/* backend/<typename>/<frontend-domid> */
-static int xenbus_probe_backend(const char *type, const char *domid)
-{
- char *nodename;
- int err = 0;
- char **dir;
- unsigned int i, dir_n = 0;
-
- DPRINTK("");
-
- nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid);
- if (!nodename)
- return -ENOMEM;
-
- dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
- if (IS_ERR(dir)) {
- kfree(nodename);
- return PTR_ERR(dir);
- }
-
- for (i = 0; i < dir_n; i++) {
- err = xenbus_probe_backend_unit(nodename, type, dir[i]);
- if (err)
- break;
- }
- kfree(dir);
- kfree(nodename);
- return err;
-}
-
-static void backend_changed(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- DPRINTK("");
-
- dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
-}
-
-static struct xenbus_watch be_watch = {
- .node = "backend",
- .callback = backend_changed,
-};
-#if 0
-void xenbus_backend_suspend(int (*fn)(struct device *, void *))
-{
- DPRINTK("");
- if (!xenbus_backend.error)
- bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
-}
-
-void xenbus_backend_resume(int (*fn)(struct device *, void *))
-{
- DPRINTK("");
- if (!xenbus_backend.error)
- bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
-}
-#endif
-void xenbus_backend_probe_and_watch(void)
-{
- xenbus_probe_devices(&xenbus_backend);
- register_xenbus_watch(&be_watch);
-}
-
-#if 0
-void xenbus_backend_bus_register(void)
-{
- xenbus_backend.error = bus_register(&xenbus_backend.bus);
- if (xenbus_backend.error)
- log(LOG_WARNING,
- "XENBUS: Error registering backend bus: %i\n",
- xenbus_backend.error);
-}
-
-void xenbus_backend_device_register(void)
-{
- if (xenbus_backend.error)
- return;
-
- xenbus_backend.error = device_register(&xenbus_backend.dev);
- if (xenbus_backend.error) {
- bus_unregister(&xenbus_backend.bus);
- log(LOG_WARNING,
- "XENBUS: Error registering backend device: %i\n",
- xenbus_backend.error);
- }
-}
-#endif
diff --git a/sys/xen/xenbus/xenbus_xs.c b/sys/xen/xenbus/xenbus_xs.c
deleted file mode 100644
index 9312255..0000000
--- a/sys/xen/xenbus/xenbus_xs.c
+++ /dev/null
@@ -1,935 +0,0 @@
-/******************************************************************************
- * xenbus_xs.c
- *
- * This is the kernel equivalent of the "xs" library. We don't need everything
- * and we use xenbus_comms for communication.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/uio.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/sx.h>
-#include <sys/syslog.h>
-#include <sys/malloc.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/kthread.h>
-#include <sys/unistd.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <machine/stdarg.h>
-
-#include <xen/xenbus/xenbusvar.h>
-#include <xen/xenbus/xenbus_comms.h>
-#include <xen/interface/hvm/params.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
-static int xs_process_msg(enum xsd_sockmsg_type *type);
-
-int xenwatch_running = 0;
-int xenbus_running = 0;
-int xen_store_evtchn;
-
-struct xs_stored_msg {
- TAILQ_ENTRY(xs_stored_msg) list;
-
- struct xsd_sockmsg hdr;
-
- union {
- /* Queued replies. */
- struct {
- char *body;
- } reply;
-
- /* Queued watch events. */
- struct {
- struct xenbus_watch *handle;
- char **vec;
- unsigned int vec_size;
- } watch;
- } u;
-};
-
-struct xs_handle {
- /* A list of replies. Currently only one will ever be outstanding. */
- TAILQ_HEAD(xs_handle_list, xs_stored_msg) reply_list;
- struct mtx reply_lock;
- int reply_waitq;
-
- /* One request at a time. */
- struct sx request_mutex;
-
- /* Protect transactions against save/restore. */
- struct sx suspend_mutex;
-};
-
-static struct xs_handle xs_state;
-
-/* List of registered watches, and a lock to protect it. */
-static LIST_HEAD(watch_list_head, xenbus_watch) watches;
-static struct mtx watches_lock;
-/* List of pending watch callback events, and a lock to protect it. */
-static TAILQ_HEAD(event_list_head, xs_stored_msg) watch_events;
-static struct mtx watch_events_lock;
-
-/*
- * Details of the xenwatch callback kernel thread. The thread waits on the
- * watch_events_waitq for work to do (queued on watch_events list). When it
- * wakes up it acquires the xenwatch_mutex before reading the list and
- * carrying out work.
- */
-static pid_t xenwatch_pid;
-struct sx xenwatch_mutex;
-static int watch_events_waitq;
-
-#define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0]))
-
-static int
-xs_get_error(const char *errorstring)
-{
- unsigned int i;
-
- for (i = 0; i < xsd_error_count; i++) {
- if (!strcmp(errorstring, xsd_errors[i].errstring))
- return (xsd_errors[i].errnum);
- }
- log(LOG_WARNING, "XENBUS xen store gave: unknown error %s",
- errorstring);
- return (EINVAL);
-}
-
-extern void kdb_backtrace(void);
-
-static int
-xs_read_reply(enum xsd_sockmsg_type *type, unsigned int *len, void **result)
-{
- struct xs_stored_msg *msg;
- char *body;
- int error;
-
- mtx_lock(&xs_state.reply_lock);
-
- while (TAILQ_EMPTY(&xs_state.reply_list)) {
- while (TAILQ_EMPTY(&xs_state.reply_list)) {
- error = mtx_sleep(&xs_state.reply_waitq,
- &xs_state.reply_lock,
- PCATCH, "xswait", hz/10);
- if (error && error != EWOULDBLOCK) {
- mtx_unlock(&xs_state.reply_lock);
- return (error);
- }
- }
- }
-
- msg = TAILQ_FIRST(&xs_state.reply_list);
- TAILQ_REMOVE(&xs_state.reply_list, msg, list);
-
- mtx_unlock(&xs_state.reply_lock);
-
- *type = msg->hdr.type;
- if (len)
- *len = msg->hdr.len;
- body = msg->u.reply.body;
-
- free(msg, M_DEVBUF);
- *result = body;
- return (0);
-}
-
-#if 0
-/* Emergency write. UNUSED*/
-void xenbus_debug_write(const char *str, unsigned int count)
-{
- struct xsd_sockmsg msg = { 0 };
-
- msg.type = XS_DEBUG;
- msg.len = sizeof("print") + count + 1;
-
- sx_xlock(&xs_state.request_mutex);
- xb_write(&msg, sizeof(msg));
- xb_write("print", sizeof("print"));
- xb_write(str, count);
- xb_write("", 1);
- sx_xunlock(&xs_state.request_mutex);
-}
-
-#endif
-
-int
-xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
-{
- struct xsd_sockmsg req_msg = *msg;
- int error;
-
- if (req_msg.type == XS_TRANSACTION_START)
- sx_slock(&xs_state.suspend_mutex);
-
- sx_xlock(&xs_state.request_mutex);
-
- error = xb_write(msg, sizeof(*msg) + msg->len,
- &xs_state.request_mutex.lock_object);
- if (error) {
- msg->type = XS_ERROR;
- } else {
- error = xs_read_reply(&msg->type, &msg->len, result);
- }
-
- sx_xunlock(&xs_state.request_mutex);
-
- if ((msg->type == XS_TRANSACTION_END) ||
- ((req_msg.type == XS_TRANSACTION_START) &&
- (msg->type == XS_ERROR)))
- sx_sunlock(&xs_state.suspend_mutex);
-
- return (error);
-}
-
-/*
- * Send message to xs. The reply is returned in *result and should be
- * fred with free(*result, M_DEVBUF). Return zero on success or an
- * error code on failure.
- */
-static int
-xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type,
- const struct iovec *iovec, unsigned int num_vecs,
- unsigned int *len, void **result)
-{
- struct xsd_sockmsg msg;
- void *ret = NULL;
- unsigned int i;
- int error;
-
- msg.tx_id = t.id;
- msg.req_id = 0;
- msg.type = type;
- msg.len = 0;
- for (i = 0; i < num_vecs; i++)
- msg.len += iovec[i].iov_len;
-
- sx_xlock(&xs_state.request_mutex);
-
- error = xb_write(&msg, sizeof(msg),
- &xs_state.request_mutex.lock_object);
- if (error) {
- sx_xunlock(&xs_state.request_mutex);
- printf("xs_talkv failed %d\n", error);
- return (error);
- }
-
- for (i = 0; i < num_vecs; i++) {
- error = xb_write(iovec[i].iov_base, iovec[i].iov_len,
- &xs_state.request_mutex.lock_object);
- if (error) {
- sx_xunlock(&xs_state.request_mutex);
- printf("xs_talkv failed %d\n", error);
- return (error);
- }
- }
-
- error = xs_read_reply(&msg.type, len, &ret);
-
- sx_xunlock(&xs_state.request_mutex);
-
- if (error)
- return (error);
-
- if (msg.type == XS_ERROR) {
- error = xs_get_error(ret);
- free(ret, M_DEVBUF);
- return (error);
- }
-
-#if 0
- if ((xenwatch_running == 0) && (xenwatch_inline == 0)) {
- xenwatch_inline = 1;
- while (!TAILQ_EMPTY(&watch_events)
- && xenwatch_running == 0) {
-
- struct xs_stored_msg *wmsg = TAILQ_FIRST(&watch_events);
- TAILQ_REMOVE(&watch_events, wmsg, list);
-
- wmsg->u.watch.handle->callback(
- wmsg->u.watch.handle,
- (const char **)wmsg->u.watch.vec,
- wmsg->u.watch.vec_size);
- free(wmsg->u.watch.vec, M_DEVBUF);
- free(wmsg, M_DEVBUF);
- }
- xenwatch_inline = 0;
- }
-#endif
- KASSERT(msg.type == type, ("bad xenstore message type"));
-
- if (result)
- *result = ret;
- else
- free(ret, M_DEVBUF);
-
- return (0);
-}
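
The wire format assembled here is a fixed xsd_sockmsg header whose len field covers the payload vectors that follow it on the ring. A sketch of framing an XS_WRITE request under those rules (field names from xen/interface/io/xs_wire.h; the helper itself is invented for illustration):

static void
xs_frame_write(struct xenbus_transaction t, const char *path,
    const char *value, struct xsd_sockmsg *hdr, struct iovec iov[2])
{
	iov[0].iov_base = __DECONST(void *, path);
	iov[0].iov_len = strlen(path) + 1;	/* NUL separates path */
	iov[1].iov_base = __DECONST(void *, value);
	iov[1].iov_len = strlen(value);		/* value is unterminated */

	hdr->type = XS_WRITE;
	hdr->req_id = 0;
	hdr->tx_id = t.id;
	hdr->len = iov[0].iov_len + iov[1].iov_len;
}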
-
-/* Simplified version of xs_talkv: single message. */
-static int
-xs_single(struct xenbus_transaction t, enum xsd_sockmsg_type type,
- const char *string, unsigned int *len, void **result)
-{
- struct iovec iovec;
-
- iovec.iov_base = (void *)(uintptr_t) string;
- iovec.iov_len = strlen(string) + 1;
-
- return (xs_talkv(t, type, &iovec, 1, len, result));
-}
-
-static unsigned int
-count_strings(const char *strings, unsigned int len)
-{
- unsigned int num;
- const char *p;
-
- for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
- num++;
-
- return num;
-}
-
-/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
-static char *
-join(const char *dir, const char *name)
-{
- char *buffer;
-
- buffer = malloc(strlen(dir) + strlen("/") + strlen(name) + 1,
- M_DEVBUF, M_WAITOK);
-
- strcpy(buffer, dir);
- if (strcmp(name, "")) {
- strcat(buffer, "/");
- strcat(buffer, name);
- }
-
- return (buffer);
-}
-
-static char **
-split(char *strings, unsigned int len, unsigned int *num)
-{
- char *p, **ret;
-
- /* Count the strings. */
- *num = count_strings(strings, len) + 1;
-
- /* Transfer to one big alloc for easy freeing. */
- ret = malloc(*num * sizeof(char *) + len, M_DEVBUF, M_WAITOK);
- memcpy(&ret[*num], strings, len);
- free(strings, M_DEVBUF);
-
- strings = (char *)&ret[*num];
- for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
- ret[(*num)++] = p;
-
- ret[*num] = strings + len;
-
- return ret;
-}
-
-/*
- * Return the contents of a directory in *result which should be freed
- * with free(*result, M_DEVBUF).
- */
-int
-xenbus_directory(struct xenbus_transaction t, const char *dir,
- const char *node, unsigned int *num, char ***result)
-{
- char *strings, *path;
- unsigned int len = 0;
- int error;
-
- path = join(dir, node);
- error = xs_single(t, XS_DIRECTORY, path, &len, (void **) &strings);
- free(path, M_DEVBUF);
- if (error)
- return (error);
-
- *result = split(strings, len, num);
- return (0);
-}
-
-/*
- * Check if a path exists. Return 1 if it does.
- */
-int
-xenbus_exists(struct xenbus_transaction t, const char *dir, const char *node)
-{
- char **d;
- int error, dir_n;
-
- error = xenbus_directory(t, dir, node, &dir_n, &d);
- if (error)
- return (0);
- free(d, M_DEVBUF);
- return (1);
-}
-
-/*
- * Get the value of a single file. Returns the contents in *result
- * which should be freed with free(*result, M_DEVBUF) after use.
- * The length of the value in bytes is returned in *len.
- */
-int
-xenbus_read(struct xenbus_transaction t, const char *dir, const char *node,
- unsigned int *len, void **result)
-{
- char *path;
- void *ret;
- int error;
-
- path = join(dir, node);
- error = xs_single(t, XS_READ, path, len, &ret);
- free(path, M_DEVBUF);
- if (error)
- return (error);
- *result = ret;
- return (0);
-}
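
For reference, typical consumption of this helper looks like the fragment below; the node name is illustrative.

char *val;
unsigned int len;
int error;

error = xenbus_read(XBT_NIL, xenbus_get_otherend_path(dev),
    "feature-barrier", &len, (void **)&val);
if (error == 0) {
	/* val holds len bytes plus a terminating NUL. */
	free(val, M_DEVBUF);
}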
-
-/*
- * Write the value of a single file. Returns error on failure.
- */
-int
-xenbus_write(struct xenbus_transaction t, const char *dir, const char *node,
- const char *string)
-{
- char *path;
- struct iovec iovec[2];
- int error;
-
- path = join(dir, node);
-
- iovec[0].iov_base = (void *)(uintptr_t) path;
- iovec[0].iov_len = strlen(path) + 1;
- iovec[1].iov_base = (void *)(uintptr_t) string;
- iovec[1].iov_len = strlen(string);
-
- error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL);
- free(path, M_DEVBUF);
-
- return (error);
-}
-
-/*
- * Create a new directory.
- */
-int
-xenbus_mkdir(struct xenbus_transaction t, const char *dir, const char *node)
-{
- char *path;
- int ret;
-
- path = join(dir, node);
- ret = xs_single(t, XS_MKDIR, path, NULL, NULL);
- free(path, M_DEVBUF);
-
- return (ret);
-}
-
-/*
- * Destroy a file or directory (directories must be empty).
- */
-int
-xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
-{
- char *path;
- int ret;
-
- path = join(dir, node);
- ret = xs_single(t, XS_RM, path, NULL, NULL);
- free(path, M_DEVBUF);
-
- return (ret);
-}
-
-/*
- * Start a transaction: changes by others will not be seen during this
- * transaction, and changes will not be visible to others until end.
- */
-int
-xenbus_transaction_start(struct xenbus_transaction *t)
-{
- char *id_str;
- int error;
-
- sx_slock(&xs_state.suspend_mutex);
- error = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL,
- (void **) &id_str);
- if (error) {
- sx_sunlock(&xs_state.suspend_mutex);
- return (error);
- }
-
- t->id = strtoul(id_str, NULL, 0);
- free(id_str, M_DEVBUF);
-
- return (0);
-}
-
-/*
- * End a transaction. If abandon is true, transaction is discarded
- * instead of committed.
- */
-int xenbus_transaction_end(struct xenbus_transaction t, int abort)
-{
- char abortstr[2];
- int error;
-
- if (abort)
- strcpy(abortstr, "F");
- else
- strcpy(abortstr, "T");
-
- error = xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL);
-
- sx_sunlock(&xs_state.suspend_mutex);
-
- return (error);
-}
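
These two helpers are conventionally paired in a retry loop, since XenStore reports a conflicting transaction as EAGAIN; the node and value written below are illustrative.

struct xenbus_transaction xbt;
int error;

do {
	error = xenbus_transaction_start(&xbt);
	if (error != 0)
		break;
	error = xenbus_printf(xbt, xenbus_get_node(dev),
	    "ring-ref", "%u", ring_ref);
	if (error != 0) {
		xenbus_transaction_end(xbt, 1);	/* abort */
		break;
	}
	error = xenbus_transaction_end(xbt, 0);	/* commit */
} while (error == EAGAIN);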
-
-/* Single read and scanf: returns zero or errno. */
-int
-xenbus_scanf(struct xenbus_transaction t,
- const char *dir, const char *node, int *scancountp, const char *fmt, ...)
-{
- va_list ap;
- int error, ns;
- char *val;
-
- error = xenbus_read(t, dir, node, NULL, (void **) &val);
- if (error)
- return (error);
-
- va_start(ap, fmt);
- ns = vsscanf(val, fmt, ap);
- va_end(ap);
- free(val, M_DEVBUF);
- /* Distinctive errno. */
- if (ns == 0)
- return (ERANGE);
- if (scancountp)
- *scancountp = ns;
- return (0);
-}
-
-/* Single printf and write: returns zero or errno. */
-int
-xenbus_printf(struct xenbus_transaction t,
- const char *dir, const char *node, const char *fmt, ...)
-{
- va_list ap;
- int error, ret;
-#define PRINTF_BUFFER_SIZE 4096
- char *printf_buffer;
-
- printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_DEVBUF, M_WAITOK);
-
- va_start(ap, fmt);
- ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
- va_end(ap);
-
- KASSERT(ret <= PRINTF_BUFFER_SIZE-1, ("xenbus_printf: message too large"));
- error = xenbus_write(t, dir, node, printf_buffer);
-
- free(printf_buffer, M_DEVBUF);
-
- return (error);
-}
-
-/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
-int
-xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
-{
- va_list ap;
- const char *name;
- int error, i;
-
- for (i = 0; i < 10000; i++)
- HYPERVISOR_yield();
-
- va_start(ap, dir);
- error = 0;
- while (error == 0 && (name = va_arg(ap, char *)) != NULL) {
- const char *fmt = va_arg(ap, char *);
- void *result = va_arg(ap, void *);
- char *p;
-
- error = xenbus_read(t, dir, name, NULL, (void **) &p);
- if (error)
- break;
-
- if (fmt) {
- if (sscanf(p, fmt, result) == 0)
- error = EINVAL;
- free(p, M_DEVBUF);
- } else
- *(char **)result = p;
- }
- va_end(ap);
-
- return (error);
-}
-
-static int
-xs_watch(const char *path, const char *token)
-{
- struct iovec iov[2];
-
- iov[0].iov_base = (void *)(uintptr_t) path;
- iov[0].iov_len = strlen(path) + 1;
- iov[1].iov_base = (void *)(uintptr_t) token;
- iov[1].iov_len = strlen(token) + 1;
-
- return (xs_talkv(XBT_NIL, XS_WATCH, iov, 2, NULL, NULL));
-}
-
-static int
-xs_unwatch(const char *path, const char *token)
-{
- struct iovec iov[2];
-
- iov[0].iov_base = (void *)(uintptr_t) path;
- iov[0].iov_len = strlen(path) + 1;
- iov[1].iov_base = (void *)(uintptr_t) token;
- iov[1].iov_len = strlen(token) + 1;
-
- return (xs_talkv(XBT_NIL, XS_UNWATCH, iov, 2, NULL, NULL));
-}
-
-static struct xenbus_watch *
-find_watch(const char *token)
-{
- struct xenbus_watch *i, *cmp;
-
- cmp = (void *)strtoul(token, NULL, 16);
-
- LIST_FOREACH(i, &watches, list)
- if (i == cmp)
- return (i);
-
- return (NULL);
-}
-
-/* Register callback to watch this node. */
-int
-register_xenbus_watch(struct xenbus_watch *watch)
-{
- /* Pointer in ascii is the token. */
- char token[sizeof(watch) * 2 + 1];
- int error;
-
- sprintf(token, "%lX", (long)watch);
-
- sx_slock(&xs_state.suspend_mutex);
-
- mtx_lock(&watches_lock);
- KASSERT(find_watch(token) == NULL, ("watch already registered"));
- LIST_INSERT_HEAD(&watches, watch, list);
- mtx_unlock(&watches_lock);
-
- error = xs_watch(watch->node, token);
-
- /* Ignore errors due to multiple registration. */
- if (error == EEXIST) {
- mtx_lock(&watches_lock);
- LIST_REMOVE(watch, list);
- mtx_unlock(&watches_lock);
- }
-
- sx_sunlock(&xs_state.suspend_mutex);
-
- return (error);
-}
-
-void
-unregister_xenbus_watch(struct xenbus_watch *watch)
-{
- struct xs_stored_msg *msg, *tmp;
- char token[sizeof(watch) * 2 + 1];
- int error;
-
- sprintf(token, "%lX", (long)watch);
-
- sx_slock(&xs_state.suspend_mutex);
-
- mtx_lock(&watches_lock);
- KASSERT(find_watch(token), ("watch not registered"));
- LIST_REMOVE(watch, list);
- mtx_unlock(&watches_lock);
-
- error = xs_unwatch(watch->node, token);
- if (error)
- log(LOG_WARNING, "XENBUS Failed to release watch %s: %i\n",
- watch->node, error);
-
- sx_sunlock(&xs_state.suspend_mutex);
-
- /* Cancel pending watch events. */
- mtx_lock(&watch_events_lock);
- TAILQ_FOREACH_SAFE(msg, &watch_events, list, tmp) {
- if (msg->u.watch.handle != watch)
- continue;
- TAILQ_REMOVE(&watch_events, msg, list);
- free(msg->u.watch.vec, M_DEVBUF);
- free(msg, M_DEVBUF);
- }
- mtx_unlock(&watch_events_lock);
-
- /* Flush any currently-executing callback, unless we are it. :-) */
- if (curproc->p_pid != xenwatch_pid) {
- sx_xlock(&xenwatch_mutex);
- sx_xunlock(&xenwatch_mutex);
- }
-}
-
-void
-xs_suspend(void)
-{
-
- sx_xlock(&xs_state.suspend_mutex);
- sx_xlock(&xs_state.request_mutex);
-}
-
-void
-xs_resume(void)
-{
- struct xenbus_watch *watch;
- char token[sizeof(watch) * 2 + 1];
-
- sx_xunlock(&xs_state.request_mutex);
-
- /* No need for watches_lock: the suspend_mutex is sufficient. */
- LIST_FOREACH(watch, &watches, list) {
- sprintf(token, "%lX", (long)watch);
- xs_watch(watch->node, token);
- }
-
- sx_xunlock(&xs_state.suspend_mutex);
-}
-
-static void
-xenwatch_thread(void *unused)
-{
- struct xs_stored_msg *msg;
-
- for (;;) {
-
- mtx_lock(&watch_events_lock);
- while (TAILQ_EMPTY(&watch_events))
- mtx_sleep(&watch_events_waitq,
- &watch_events_lock,
- PWAIT | PCATCH, "waitev", hz/10);
-
- mtx_unlock(&watch_events_lock);
- sx_xlock(&xenwatch_mutex);
-
- mtx_lock(&watch_events_lock);
- msg = TAILQ_FIRST(&watch_events);
- if (msg)
- TAILQ_REMOVE(&watch_events, msg, list);
- mtx_unlock(&watch_events_lock);
-
- if (msg != NULL) {
- /*
- * XXX There are messages coming in with a NULL callback.
- * XXX This deserves further investigation; the workaround
- * XXX here simply prevents the kernel from panic'ing
- * XXX on startup.
- */
- if (msg->u.watch.handle->callback != NULL)
- msg->u.watch.handle->callback(
- msg->u.watch.handle,
- (const char **)msg->u.watch.vec,
- msg->u.watch.vec_size);
- free(msg->u.watch.vec, M_DEVBUF);
- free(msg, M_DEVBUF);
- }
-
- sx_xunlock(&xenwatch_mutex);
- }
-}
-
-static int
-xs_process_msg(enum xsd_sockmsg_type *type)
-{
- struct xs_stored_msg *msg;
- char *body;
- int error;
-
- msg = malloc(sizeof(*msg), M_DEVBUF, M_WAITOK);
- mtx_lock(&xs_state.reply_lock);
- error = xb_read(&msg->hdr, sizeof(msg->hdr),
- &xs_state.reply_lock.lock_object);
- mtx_unlock(&xs_state.reply_lock);
- if (error) {
- free(msg, M_DEVBUF);
- return (error);
- }
-
- body = malloc(msg->hdr.len + 1, M_DEVBUF, M_WAITOK);
- mtx_lock(&xs_state.reply_lock);
- error = xb_read(body, msg->hdr.len,
- &xs_state.reply_lock.lock_object);
- mtx_unlock(&xs_state.reply_lock);
- if (error) {
- free(body, M_DEVBUF);
- free(msg, M_DEVBUF);
- return (error);
- }
- body[msg->hdr.len] = '\0';
-
- *type = msg->hdr.type;
- if (msg->hdr.type == XS_WATCH_EVENT) {
- msg->u.watch.vec = split(body, msg->hdr.len,
- &msg->u.watch.vec_size);
-
- mtx_lock(&watches_lock);
- msg->u.watch.handle = find_watch(
- msg->u.watch.vec[XS_WATCH_TOKEN]);
- if (msg->u.watch.handle != NULL) {
- mtx_lock(&watch_events_lock);
- TAILQ_INSERT_TAIL(&watch_events, msg, list);
- wakeup(&watch_events_waitq);
- mtx_unlock(&watch_events_lock);
- } else {
- free(msg->u.watch.vec, M_DEVBUF);
- free(msg, M_DEVBUF);
- }
- mtx_unlock(&watches_lock);
- } else {
- msg->u.reply.body = body;
- mtx_lock(&xs_state.reply_lock);
- TAILQ_INSERT_TAIL(&xs_state.reply_list, msg, list);
- wakeup(&xs_state.reply_waitq);
- mtx_unlock(&xs_state.reply_lock);
- }
-
- return 0;
-}
-
-static void
-xenbus_thread(void *unused)
-{
- int error;
- enum xsd_sockmsg_type type;
- xenbus_running = 1;
-
- for (;;) {
- error = xs_process_msg(&type);
- if (error)
- printf("XENBUS error %d while reading message\n",
- error);
- }
-}
-
-#ifdef XENHVM
-static unsigned long xen_store_mfn;
-char *xen_store;
-
-static inline unsigned long
-hvm_get_parameter(int index)
-{
- struct xen_hvm_param xhv;
- int error;
-
- xhv.domid = DOMID_SELF;
- xhv.index = index;
- error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
- if (error) {
- printf("hvm_get_parameter: failed to get %d, error %d\n",
- index, error);
- return (0);
- }
- return (xhv.value);
-}
-
-#endif
-
-int
-xs_init(void)
-{
- int error;
- struct proc *p;
-
-#ifdef XENHVM
- xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
- xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
- xen_store = pmap_mapdev(xen_store_mfn * PAGE_SIZE, PAGE_SIZE);
-#else
- xen_store_evtchn = xen_start_info->store_evtchn;
-#endif
-
- TAILQ_INIT(&xs_state.reply_list);
- TAILQ_INIT(&watch_events);
- sx_init(&xenwatch_mutex, "xenwatch");
-
-
- mtx_init(&xs_state.reply_lock, "state reply", NULL, MTX_DEF);
- sx_init(&xs_state.request_mutex, "xenstore request");
- sx_init(&xs_state.suspend_mutex, "xenstore suspend");
-
-
-#if 0
- mtx_init(&xs_state.suspend_mutex, "xenstore suspend", NULL, MTX_DEF);
- sema_init(&xs_state.request_mutex, 1, "xenstore request");
- sema_init(&xenwatch_mutex, 1, "xenwatch");
-#endif
- mtx_init(&watches_lock, "watches", NULL, MTX_DEF);
- mtx_init(&watch_events_lock, "watch events", NULL, MTX_DEF);
-
- /* Initialize the shared memory rings to talk to xenstored */
- error = xb_init_comms();
- if (error)
- return (error);
-
- xenwatch_running = 1;
- error = kproc_create(xenwatch_thread, NULL, &p,
- RFHIGHPID, 0, "xenwatch");
- if (error)
- return (error);
- xenwatch_pid = p->p_pid;
-
- error = kproc_create(xenbus_thread, NULL, NULL,
- RFHIGHPID, 0, "xenbus");
-
- return (error);
-}
diff --git a/sys/xen/xenbus/xenbusb.c b/sys/xen/xenbus/xenbusb.c
new file mode 100644
index 0000000..49facb6
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb.c
@@ -0,0 +1,878 @@
+/******************************************************************************
+ * Copyright (C) 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb.c
+ *
+ * \brief Shared support functions for managing the NewBus busses that contain
+ * Xen front and back end device instances.
+ *
+ * The NewBus implementation of XenBus attaches a xenbusb_front and xenbusb_back
+ * child bus to the xenstore device. This strategy allows the small differences
+ * in the handling of XenBus operations for front and back devices to be
+ * expressed as overrides in xenbusb_front/back.c. Front and back specific device
+ * classes are also provided so device drivers can register for the devices they
+ * can handle without the need to filter within their probe routines. The
+ * net result is a device hierarchy that might look like this:
+ *
+ * xenstore0/
+ * xenbusb_front0/
+ * xn0
+ * xbd0
+ * xbd1
+ * xenbusb_back0/
+ * xbbd0
+ * xnb0
+ * xnb1
+ */
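
A front-end driver slots into this hierarchy by naming xenbusb_front as its parent bus; the sketch below is illustrative (compare netfront's real registration), and every name except the bus name is invented.

struct xf_softc {
	device_t xf_dev;
};

static int xf_probe(device_t);
static int xf_attach(device_t);
static int xf_otherend_changed(device_t, enum xenbus_state);

static device_method_t xf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		xf_probe),
	DEVMETHOD(device_attach,	xf_attach),

	/* XenBus interface */
	DEVMETHOD(xenbus_otherend_changed, xf_otherend_changed),

	{ 0, 0 }
};

static driver_t xf_driver = {
	"xf", xf_methods, sizeof(struct xf_softc),
};
static devclass_t xf_devclass;

DRIVER_MODULE(xf, xenbusb_front, xf_driver, xf_devclass, 0, 0);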
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenbus/xenbusb.h>
+#include <xen/xenbus/xenbusvar.h>
+
+/*------------------------- Private Functions --------------------------------*/
+/**
+ * \brief Deallocate XenBus device instance variables.
+ *
+ * \param ivars The instance variable block to free.
+ */
+static void
+xenbusb_free_child_ivars(struct xenbus_device_ivars *ivars)
+{
+ if (ivars->xd_otherend_watch.node != NULL) {
+ xs_unregister_watch(&ivars->xd_otherend_watch);
+ free(ivars->xd_otherend_watch.node, M_XENBUS);
+ ivars->xd_otherend_watch.node = NULL;
+ }
+
+ if (ivars->xd_node != NULL) {
+ free(ivars->xd_node, M_XENBUS);
+ ivars->xd_node = NULL;
+ }
+
+ if (ivars->xd_type != NULL) {
+ free(ivars->xd_type, M_XENBUS);
+ ivars->xd_type = NULL;
+ }
+
+ if (ivars->xd_otherend_path != NULL) {
+ free(ivars->xd_otherend_path, M_XENBUS);
+ ivars->xd_otherend_path = NULL;
+ }
+
+ free(ivars, M_XENBUS);
+}
+
+/**
+ * XenBus watch callback registered against the "state" XenStore
+ * node of the other-end of a split device connection.
+ *
+ * This callback is invoked whenever the state of a device instance's
+ * peer changes.
+ *
+ * \param watch The xs_watch object used to register this callback
+ * function.
+ * \param vec An array of pointers to NUL terminated strings containing
+ * watch event data. The vector should be indexed via the
+ * xs_watch_type enum in xs_wire.h.
+ * \param vec_size The number of elements in vec.
+ */
+static void
+xenbusb_otherend_changed(struct xs_watch *watch, const char **vec,
+ unsigned int vec_size __unused)
+{
+ struct xenbus_device_ivars *ivars;
+ device_t dev;
+ enum xenbus_state newstate;
+
+ ivars = (struct xenbus_device_ivars *) watch;
+ dev = ivars->xd_dev;
+
+ if (!ivars->xd_otherend_path
+ || strncmp(ivars->xd_otherend_path, vec[XS_WATCH_PATH],
+ strlen(ivars->xd_otherend_path)))
+ return;
+
+ newstate = xenbus_read_driver_state(ivars->xd_otherend_path);
+ XENBUS_OTHEREND_CHANGED(dev, newstate);
+}
+
+/**
+ * Search our internal record of configured devices (not the XenStore)
+ * to determine if the XenBus device indicated by \a node is known to
+ * the system.
+ *
+ * \param dev The XenBus bus instance to search for device children.
+ * \param node The XenStore node path for the device to find.
+ *
+ * \return The device_t of the found device if any, or NULL.
+ *
+ * \note device_t is a pointer type, so it can be compared against
+ * NULL for validity.
+ */
+static device_t
+xenbusb_device_exists(device_t dev, const char *node)
+{
+ device_t *kids;
+ device_t result;
+ struct xenbus_device_ivars *ivars;
+ int i, count;
+
+ if (device_get_children(dev, &kids, &count))
+		return (NULL);
+
+ result = NULL;
+ for (i = 0; i < count; i++) {
+ ivars = device_get_ivars(kids[i]);
+ if (!strcmp(ivars->xd_node, node)) {
+ result = kids[i];
+ break;
+ }
+ }
+ free(kids, M_TEMP);
+
+ return (result);
+}
+
+static void
+xenbusb_delete_child(device_t dev, device_t child)
+{
+ struct xenbus_device_ivars *ivars;
+
+ ivars = device_get_ivars(child);
+
+ /*
+ * We no longer care about the otherend of the
+ * connection. Cancel the watch now so that we
+ * don't try to handle an event for a partially
+ * detached child.
+ */
+ if (ivars->xd_otherend_watch.node != NULL)
+ xs_unregister_watch(&ivars->xd_otherend_watch);
+
+ device_delete_child(dev, child);
+ xenbusb_free_child_ivars(ivars);
+}
+
+/**
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param child The NewBus device representing a child of dev's XenBus bus.
+ */
+static void
+xenbusb_verify_device(device_t dev, device_t child)
+{
+ if (xs_exists(XST_NIL, xenbus_get_node(child), "") == 0) {
+
+ /*
+		 * Device tree has been removed from the XenStore.
+ * Tear down the device.
+ */
+ xenbusb_delete_child(dev, child);
+ }
+}
+
+/**
+ * \brief Enumerate the devices on a XenBus bus and register them with
+ * the NewBus device tree.
+ *
+ * xenbusb_enumerate_bus() will create entries (in state DS_NOTPRESENT)
+ * for nodes that appear in the XenStore, but will not invoke probe/attach
+ * operations on drivers. Probe/Attach processing must be separately
+ * performed via an invocation of xenbusb_probe_children(). This is usually
+ * done via the xbs_probe_children task.
+ *
+ * \param xbs XenBus Bus device softc of the owner of the bus to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_enumerate_bus(struct xenbusb_softc *xbs)
+{
+ const char **types;
+ u_int type_idx;
+ u_int type_count;
+ int error;
+
+ error = xs_directory(XST_NIL, xbs->xbs_node, "", &type_count, &types);
+ if (error)
+ return (error);
+
+ for (type_idx = 0; type_idx < type_count; type_idx++)
+ XENBUSB_ENUMERATE_TYPE(xbs->xbs_dev, types[type_idx]);
+
+ free(types, M_XENSTORE);
+
+ return (0);
+}
+
+/**
+ * Handler for all generic XenBus device sysctl nodes.
+ */
+static int
+xenbusb_device_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ device_t dev;
+ const char *value;
+
+ dev = (device_t)arg1;
+ switch (arg2) {
+ case XENBUS_IVAR_NODE:
+ value = xenbus_get_node(dev);
+ break;
+ case XENBUS_IVAR_TYPE:
+ value = xenbus_get_type(dev);
+ break;
+ case XENBUS_IVAR_STATE:
+ value = xenbus_strstate(xenbus_get_state(dev));
+ break;
+ case XENBUS_IVAR_OTHEREND_ID:
+ return (sysctl_handle_int(oidp, NULL,
+ xenbus_get_otherend_id(dev),
+ req));
+ /* NOTREACHED */
+ case XENBUS_IVAR_OTHEREND_PATH:
+ value = xenbus_get_otherend_path(dev);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (SYSCTL_OUT(req, value, strlen(value)));
+}
+
+/**
+ * Create read-only sysctl nodes for xenbusb device ivar data.
+ *
+ * \param dev The XenBus device instance to register with sysctl.
+ */
+static void
+xenbusb_device_sysctl_init(device_t dev)
+{
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *tree;
+
+ ctx = device_get_sysctl_ctx(dev);
+ tree = device_get_sysctl_tree(dev);
+
+ SYSCTL_ADD_PROC(ctx,
+ SYSCTL_CHILDREN(tree),
+ OID_AUTO,
+ "xenstore_path",
+ CTLFLAG_RD,
+ dev,
+ XENBUS_IVAR_NODE,
+ xenbusb_device_sysctl_handler,
+ "A",
+ "XenStore path to device");
+
+ SYSCTL_ADD_PROC(ctx,
+ SYSCTL_CHILDREN(tree),
+ OID_AUTO,
+ "xenbus_dev_type",
+ CTLFLAG_RD,
+ dev,
+ XENBUS_IVAR_TYPE,
+ xenbusb_device_sysctl_handler,
+ "A",
+ "XenBus device type");
+
+ SYSCTL_ADD_PROC(ctx,
+ SYSCTL_CHILDREN(tree),
+ OID_AUTO,
+ "xenbus_connection_state",
+ CTLFLAG_RD,
+ dev,
+ XENBUS_IVAR_STATE,
+ xenbusb_device_sysctl_handler,
+ "A",
+ "XenBus state of peer connection");
+
+ SYSCTL_ADD_PROC(ctx,
+ SYSCTL_CHILDREN(tree),
+ OID_AUTO,
+ "xenbus_peer_domid",
+ CTLFLAG_RD,
+ dev,
+ XENBUS_IVAR_OTHEREND_ID,
+ xenbusb_device_sysctl_handler,
+ "I",
+ "Xen domain ID of peer");
+
+ SYSCTL_ADD_PROC(ctx,
+ SYSCTL_CHILDREN(tree),
+ OID_AUTO,
+ "xenstore_peer_path",
+ CTLFLAG_RD,
+ dev,
+ XENBUS_IVAR_OTHEREND_PATH,
+ xenbusb_device_sysctl_handler,
+ "A",
+ "XenStore path to peer device");
+}
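+
+/*
+ * With the nodes registered above, each XenBus child exposes read-only
+ * entries under its NewBus sysctl tree.  For a hypothetical frontend
+ * instance xn0 in a guest domain (all values illustrative):
+ *
+ *	dev.xn.0.xenstore_path: device/vif/0
+ *	dev.xn.0.xenbus_dev_type: vif
+ *	dev.xn.0.xenbus_connection_state: Connected
+ *	dev.xn.0.xenbus_peer_domid: 0
+ *	dev.xn.0.xenstore_peer_path: backend/vif/5/0
+ */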
+
+/**
+ * \brief Verify the existence of attached device instances and perform
+ * probe/attach processing for newly arrived devices.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_probe_children(device_t dev)
+{
+ device_t *kids;
+ struct xenbus_device_ivars *ivars;
+ int i, count;
+
+ if (device_get_children(dev, &kids, &count) == 0) {
+ for (i = 0; i < count; i++) {
+ if (device_get_state(kids[i]) != DS_NOTPRESENT) {
+ /*
+ * We already know about this one.
+ * Make sure it's still here.
+ */
+ xenbusb_verify_device(dev, kids[i]);
+ continue;
+ }
+
+ if (device_probe_and_attach(kids[i])) {
+ /*
+ * Transition device to the closed state
+ * so the world knows that attachment will
+ * not occur.
+ */
+ xenbus_set_state(kids[i], XenbusStateClosed);
+
+ /*
+ * Remove our record of this device.
+ * So long as it remains in the closed
+ * state in the XenStore, we will not find
+ * it again. The state will only change
+ * if the control domain actively reconfigures
+ * this device.
+ */
+ xenbusb_delete_child(dev, kids[i]);
+
+ continue;
+ }
+ /*
+			 * Augment the default NewBus-provided dynamic sysctl
+ * variables with the standard ivar contents of
+ * XenBus devices.
+ */
+ xenbusb_device_sysctl_init(kids[i]);
+
+ /*
+ * Now that we have a driver managing this device
+ * that can receive otherend state change events,
+ * hook up a watch for them.
+ */
+ ivars = device_get_ivars(kids[i]);
+ xs_register_watch(&ivars->xd_otherend_watch);
+ }
+ free(kids, M_TEMP);
+ }
+
+ return (0);
+}
+
+/**
+ * \brief Task callback function to perform XenBus probe operations
+ * from a known safe context.
+ *
+ * \param arg The NewBus device_t representing the bus instance
+ * on which to perform probe processing.
+ * \param pending The number of times this task was queued before it could
+ * be run.
+ */
+static void
+xenbusb_probe_children_cb(void *arg, int pending __unused)
+{
+ device_t dev = (device_t)arg;
+
+ /*
+	 * Hold Giant until the Giant-free NewBus changes are committed.
+ */
+ mtx_lock(&Giant);
+ xenbusb_probe_children(dev);
+ mtx_unlock(&Giant);
+}
+
+/**
+ * \brief XenStore watch callback for the root node of the XenStore
+ * subtree representing a XenBus.
+ *
+ * This callback performs, or delegates to the xbs_probe_children task,
+ * all processing necessary to handle dynamic device arrival and departure
+ * events from a XenBus.
+ *
+ * \param watch The XenStore watch object associated with this callback.
+ * \param vec The XenStore watch event data.
+ * \param len The number of fields in the event data stream.
+ */
+static void
+xenbusb_devices_changed(struct xs_watch *watch, const char **vec,
+ unsigned int len)
+{
+ struct xenbusb_softc *xbs;
+ device_t dev;
+ char *node;
+ char *bus;
+ char *type;
+ char *id;
+ char *p;
+ u_int component;
+
+ xbs = (struct xenbusb_softc *)watch;
+ dev = xbs->xbs_dev;
+
+ if (len <= XS_WATCH_PATH) {
+ device_printf(dev, "xenbusb_devices_changed: "
+ "Short Event Data.\n");
+ return;
+ }
+
+ node = strdup(vec[XS_WATCH_PATH], M_XENBUS);
+ p = strchr(node, '/');
+ if (p == NULL)
+ goto out;
+ bus = node;
+ *p = 0;
+ type = p + 1;
+
+ p = strchr(type, '/');
+ if (p == NULL)
+ goto out;
+ *p++ = 0;
+
+ /*
+ * Extract the device ID. A device ID has one or more path
+ * components separated by the '/' character.
+ *
+ * e.g. "<frontend vm id>/<frontend dev id>" for backend devices.
+ */
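+	/*
+	 * Worked example (path hypothetical): for the event path
+	 * "backend/vbd/0/51712" on a bus with xbs_id_components == 2,
+	 * the parsing above and below yields bus = "backend",
+	 * type = "vbd", and id = "0/51712".
+	 */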
+ id = p;
+ for (component = 0; component < xbs->xbs_id_components; component++) {
+ p = strchr(p, '/');
+ if (p == NULL)
+ break;
+ p++;
+ }
+ if (p != NULL)
+ *p = 0;
+
+ if (*id != 0 && component >= xbs->xbs_id_components - 1) {
+ xenbusb_add_device(xbs->xbs_dev, type, id);
+ taskqueue_enqueue(taskqueue_thread, &xbs->xbs_probe_children);
+ }
+out:
+ free(node, M_XENBUS);
+}
+
+/**
+ * \brief Interrupt configuration hook callback associated with xbs_attach_ch.
+ *
+ * Since interrupts are always functional at the time of XenBus configuration,
+ * there is nothing to be done when the callback occurs. This hook is only
+ * registered to hold up boot processing while XenBus devices come online.
+ *
+ * \param arg Unused configuration hook callback argument.
+ */
+static void
+xenbusb_nop_confighook_cb(void *arg __unused)
+{
+}
+
+/**
+ * \brief Decrement the number of XenBus child devices in the
+ * connecting state by one and release the xbs_attach_ch
+ * interrupt configuration hook if the connecting count
+ * drops to zero.
+ *
+ * \param xbs XenBus Bus device softc of the owner of the bus to enumerate.
+ */
+static void
+xenbusb_release_confighook(struct xenbusb_softc *xbs)
+{
+ mtx_lock(&xbs->xbs_lock);
+ KASSERT(xbs->xbs_connecting_children > 0,
+ ("Connecting device count error\n"));
+ xbs->xbs_connecting_children--;
+ if (xbs->xbs_connecting_children == 0
+ && (xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) {
+ xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE;
+ mtx_unlock(&xbs->xbs_lock);
+ config_intrhook_disestablish(&xbs->xbs_attach_ch);
+ } else {
+ mtx_unlock(&xbs->xbs_lock);
+ }
+}
+
+/*--------------------------- Public Functions -------------------------------*/
+/*--------- API comments for these methods can be found in xenbusb.h ---------*/
+void
+xenbusb_identify(driver_t *driver, device_t parent)
+{
+ /*
+ * A single instance of each bus type for which we have a driver
+ * is always present in a system operating under Xen.
+ */
+ BUS_ADD_CHILD(parent, 0, driver->name, 0);
+}
+
+int
+xenbusb_add_device(device_t dev, const char *type, const char *id)
+{
+ struct xenbusb_softc *xbs;
+ struct sbuf *devpath_sbuf;
+ char *devpath;
+ struct xenbus_device_ivars *ivars;
+ int error;
+
+ xbs = device_get_softc(dev);
+ devpath_sbuf = sbuf_new_auto();
+ sbuf_printf(devpath_sbuf, "%s/%s/%s", xbs->xbs_node, type, id);
+ sbuf_finish(devpath_sbuf);
+ devpath = sbuf_data(devpath_sbuf);
+
+ ivars = malloc(sizeof(*ivars), M_XENBUS, M_ZERO|M_WAITOK);
+ error = ENXIO;
+
+ if (xs_exists(XST_NIL, devpath, "") != 0) {
+ device_t child;
+ enum xenbus_state state;
+ char *statepath;
+
+ child = xenbusb_device_exists(dev, devpath);
+ if (child != NULL) {
+ /*
+ * We are already tracking this node
+ */
+ error = 0;
+ goto out;
+ }
+
+ state = xenbus_read_driver_state(devpath);
+ if (state != XenbusStateInitialising) {
+ /*
+ * Device is not new, so ignore it. This can
+ * happen if a device is going away after
+ * switching to Closed.
+ */
+ printf("xenbusb_add_device: Device %s ignored. "
+ "State %d\n", devpath, state);
+ error = 0;
+ goto out;
+ }
+
+ sx_init(&ivars->xd_lock, "xdlock");
+ ivars->xd_flags = XDF_CONNECTING;
+ ivars->xd_node = strdup(devpath, M_XENBUS);
+ ivars->xd_type = strdup(type, M_XENBUS);
+ ivars->xd_state = XenbusStateInitialising;
+
+ error = XENBUSB_GET_OTHEREND_NODE(dev, ivars);
+ if (error) {
+ printf("xenbus_update_device: %s no otherend id\n",
+ devpath);
+ goto out;
+ }
+
+ statepath = malloc(strlen(ivars->xd_otherend_path)
+ + strlen("/state") + 1, M_XENBUS, M_WAITOK);
+ sprintf(statepath, "%s/state", ivars->xd_otherend_path);
+
+ ivars->xd_otherend_watch.node = statepath;
+ ivars->xd_otherend_watch.callback = xenbusb_otherend_changed;
+
+ mtx_lock(&xbs->xbs_lock);
+ xbs->xbs_connecting_children++;
+ mtx_unlock(&xbs->xbs_lock);
+
+ child = device_add_child(dev, NULL, -1);
+ ivars->xd_dev = child;
+ device_set_ivars(child, ivars);
+ }
+
+out:
+ sbuf_delete(devpath_sbuf);
+ if (error != 0)
+ xenbusb_free_child_ivars(ivars);
+
+ return (error);
+}
+
+int
+xenbusb_attach(device_t dev, char *bus_node, u_int id_components)
+{
+ struct xenbusb_softc *xbs;
+
+ xbs = device_get_softc(dev);
+ mtx_init(&xbs->xbs_lock, "xenbusb softc lock", NULL, MTX_DEF);
+ xbs->xbs_node = bus_node;
+ xbs->xbs_id_components = id_components;
+ xbs->xbs_dev = dev;
+
+ /*
+ * Since XenBus busses are attached to the XenStore, and
+ * the XenStore does not probe children until after interrupt
+ * services are available, this config hook is used solely
+ * to ensure that the remainder of the boot process (e.g.
+ * mount root) is deferred until child devices are adequately
+ * probed. We unblock the boot process as soon as the
+ * connecting child count in our softc goes to 0.
+ */
+ xbs->xbs_attach_ch.ich_func = xenbusb_nop_confighook_cb;
+ xbs->xbs_attach_ch.ich_arg = dev;
+ config_intrhook_establish(&xbs->xbs_attach_ch);
+ xbs->xbs_flags |= XBS_ATTACH_CH_ACTIVE;
+ xbs->xbs_connecting_children = 1;
+
+ /*
+	 * The subtree for this bus type may not yet exist,
+ * causing initial enumeration to fail. We still
+ * want to return success from our attach though
+ * so that we are ready to handle devices for this
+ * bus when they are dynamically attached to us
+ * by a Xen management action.
+ */
+ (void)xenbusb_enumerate_bus(xbs);
+ xenbusb_probe_children(dev);
+
+ xbs->xbs_device_watch.node = bus_node;
+ xbs->xbs_device_watch.callback = xenbusb_devices_changed;
+
+ TASK_INIT(&xbs->xbs_probe_children, 0, xenbusb_probe_children_cb, dev);
+
+ xs_register_watch(&xbs->xbs_device_watch);
+
+ xenbusb_release_confighook(xbs);
+
+ return (0);
+}
+
+int
+xenbusb_resume(device_t dev)
+{
+ device_t *kids;
+ struct xenbus_device_ivars *ivars;
+ int i, count, error;
+ char *statepath;
+
+ /*
+ * We must re-examine each device and find the new path for
+ * its backend.
+ */
+ if (device_get_children(dev, &kids, &count) == 0) {
+ for (i = 0; i < count; i++) {
+ if (device_get_state(kids[i]) == DS_NOTPRESENT)
+ continue;
+
+ ivars = device_get_ivars(kids[i]);
+
+ xs_unregister_watch(&ivars->xd_otherend_watch);
+ ivars->xd_state = XenbusStateInitialising;
+
+ /*
+ * Find the new backend details and
+ * re-register our watch.
+ */
+ error = XENBUSB_GET_OTHEREND_NODE(dev, ivars);
+ if (error)
+ return (error);
+
+ DEVICE_RESUME(kids[i]);
+
+ statepath = malloc(strlen(ivars->xd_otherend_path)
+ + strlen("/state") + 1, M_XENBUS, M_WAITOK);
+ sprintf(statepath, "%s/state", ivars->xd_otherend_path);
+
+ free(ivars->xd_otherend_watch.node, M_XENBUS);
+ ivars->xd_otherend_watch.node = statepath;
+ xs_register_watch(&ivars->xd_otherend_watch);
+
+#if 0
+ /*
+ * Can't do this yet since we are running in
+ * the xenwatch thread and if we sleep here,
+ * we will stop delivering watch notifications
+ * and the device will never come back online.
+ */
+ sx_xlock(&ivars->xd_lock);
+ while (ivars->xd_state != XenbusStateClosed
+ && ivars->xd_state != XenbusStateConnected)
+ sx_sleep(&ivars->xd_state, &ivars->xd_lock,
+ 0, "xdresume", 0);
+ sx_xunlock(&ivars->xd_lock);
+#endif
+ }
+ free(kids, M_TEMP);
+ }
+
+ return (0);
+}
+
+int
+xenbusb_print_child(device_t dev, device_t child)
+{
+ struct xenbus_device_ivars *ivars = device_get_ivars(child);
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += printf(" at %s", ivars->xd_node);
+ retval += bus_print_child_footer(dev, child);
+
+ return (retval);
+}
+
+int
+xenbusb_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+ struct xenbus_device_ivars *ivars = device_get_ivars(child);
+
+ switch (index) {
+ case XENBUS_IVAR_NODE:
+ *result = (uintptr_t) ivars->xd_node;
+ return (0);
+
+ case XENBUS_IVAR_TYPE:
+ *result = (uintptr_t) ivars->xd_type;
+ return (0);
+
+ case XENBUS_IVAR_STATE:
+ *result = (uintptr_t) ivars->xd_state;
+ return (0);
+
+ case XENBUS_IVAR_OTHEREND_ID:
+ *result = (uintptr_t) ivars->xd_otherend_id;
+ return (0);
+
+ case XENBUS_IVAR_OTHEREND_PATH:
+ *result = (uintptr_t) ivars->xd_otherend_path;
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+int
+xenbusb_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
+{
+ struct xenbus_device_ivars *ivars = device_get_ivars(child);
+ enum xenbus_state newstate;
+ int currstate;
+
+ switch (index) {
+ case XENBUS_IVAR_STATE:
+ {
+ int error;
+
+ newstate = (enum xenbus_state) value;
+ sx_xlock(&ivars->xd_lock);
+ if (ivars->xd_state == newstate) {
+ error = 0;
+ goto out;
+ }
+
+ error = xs_scanf(XST_NIL, ivars->xd_node, "state",
+ NULL, "%d", &currstate);
+ if (error)
+ goto out;
+
+ do {
+ error = xs_printf(XST_NIL, ivars->xd_node, "state",
+ "%d", newstate);
+ } while (error == EAGAIN);
+ if (error) {
+ /*
+ * Avoid looping through xenbus_dev_fatal()
+ * which calls xenbus_write_ivar to set the
+ * state to closing.
+ */
+ if (newstate != XenbusStateClosing)
+ xenbus_dev_fatal(dev, error,
+ "writing new state");
+ goto out;
+ }
+ ivars->xd_state = newstate;
+
+ if ((ivars->xd_flags & XDF_CONNECTING) != 0
+ && (newstate == XenbusStateClosed
+ || newstate == XenbusStateConnected)) {
+ struct xenbusb_softc *xbs;
+
+ ivars->xd_flags &= ~XDF_CONNECTING;
+ xbs = device_get_softc(dev);
+ xenbusb_release_confighook(xbs);
+ }
+
+ wakeup(&ivars->xd_state);
+ out:
+ sx_xunlock(&ivars->xd_lock);
+ return (error);
+ }
+
+ case XENBUS_IVAR_NODE:
+ case XENBUS_IVAR_TYPE:
+ case XENBUS_IVAR_OTHEREND_ID:
+ case XENBUS_IVAR_OTHEREND_PATH:
+ /*
+ * These variables are read-only.
+ */
+ return (EINVAL);
+ }
+
+ return (ENOENT);
+}
diff --git a/sys/xen/xenbus/xenbusb.h b/sys/xen/xenbus/xenbusb.h
new file mode 100644
index 0000000..75abb98
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb.h
@@ -0,0 +1,272 @@
+/*-
+ * Core definitions and data structures shareable across OS platforms.
+ *
+ * Copyright (c) 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * substantially similar to the "NO WARRANTY" disclaimer below
+ * ("Disclaimer") and any redistribution must be conditioned upon
+ * including a substantially similar Disclaimer requirement for further
+ * binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+#ifndef _XEN_XENBUS_XENBUSB_H
+#define _XEN_XENBUS_XENBUSB_H
+
+/**
+ * \file xenbusb.h
+ *
+ * Data structures and function declarations for use in implementing
+ * bus attachments (e.g. frontend and backend device busses) for XenBus.
+ */
+#include "xenbusb_if.h"
+
+/**
+ * Enumeration of state flag values for the xbs_flags field of
+ * the xenbusb_softc structure.
+ */
+typedef enum {
+	/** The xbs_attach_ch config hook is registered and active. */
+ XBS_ATTACH_CH_ACTIVE = 0x01
+} xenbusb_softc_flag;
+
+/**
+ * \brief Container for all state needed to manage a XenBus bus
+ * attachment.
+ */
+struct xenbusb_softc {
+ /**
+ * XenStore watch used to monitor the subtree of the
+ * XenStore where devices for this bus attachment arrive
+ * and depart.
+ *
+ * \note This field must be the first in the softc structure
+ * so that a simple cast can be used to retrieve the
+ * softc from within a XenStore watch event callback.
+ */
+ struct xs_watch xbs_device_watch;
+
+ /** Mutex used to protect fields of the xenbusb_softc. */
+ struct mtx xbs_lock;
+
+ /** State flags. */
+ xenbusb_softc_flag xbs_flags;
+
+ /**
+ * A dedicated task for processing child arrival and
+ * departure events.
+ */
+ struct task xbs_probe_children;
+
+ /**
+ * Config Hook used to block boot processing until
+ * XenBus devices complete their connection processing
+ * with other VMs.
+ */
+ struct intr_config_hook xbs_attach_ch;
+
+ /**
+ * The number of children for this bus that are still
+ * in the connecting (to other VMs) state. This variable
+ * is used to determine when to release xbs_attach_ch.
+ */
+ u_int xbs_connecting_children;
+
+ /** The NewBus device_t for this bus attachment. */
+ device_t xbs_dev;
+
+ /**
+ * The VM relative path to the XenStore subtree this
+ * bus attachment manages.
+ */
+ const char *xbs_node;
+
+ /**
+ * The number of path components (strings separated by the '/'
+ * character) that make up the device ID on this bus.
+ */
+ u_int xbs_id_components;
+};
+
+/**
+ * Enumeration of state flag values for the xd_flags field of
+ * the xenbus_device_ivars structure.
+ */
+typedef enum {
+
+ /**
+ * This device is contributing to the xbs_connecting_children
+ * count of its parent bus.
+ */
+ XDF_CONNECTING = 0x01
+} xenbus_dev_flag;
+
+/** Instance variables for devices on a XenBus bus. */
+struct xenbus_device_ivars {
+ /**
+ * XenStore watch used to monitor the subtree of the
+	 * XenStore where information about the other end of
+	 * the split Xen device this instance represents is published.
+ *
+ * \note This field must be the first in the instance
+ * variable structure so that a simple cast can be
+ * used to retrieve ivar data from within a XenStore
+ * watch event callback.
+ */
+ struct xs_watch xd_otherend_watch;
+
+ /** Sleepable lock used to protect instance data. */
+ struct sx xd_lock;
+
+ /** State flags. */
+ xenbus_dev_flag xd_flags;
+
+ /** The NewBus device_t for this XenBus device instance. */
+ device_t xd_dev;
+
+ /**
+ * The VM relative path to the XenStore subtree representing
+ * this VMs half of this device.
+ */
+ char *xd_node;
+
+ /** XenBus device type ("vbd", "vif", etc.). */
+ char *xd_type;
+
+ /**
+ * Cached version of <xd_node>/state node in the XenStore.
+ */
+ enum xenbus_state xd_state;
+
+ /** The VM identifier of the other end of this split device. */
+ int xd_otherend_id;
+
+ /**
+ * The path to the subtree of the XenStore where information
+	 * about the other end of this split device instance is published.
+ */
+ char *xd_otherend_path;
+};
+
+/**
+ * \brief Identify instances of this device type in the system.
+ *
+ * \param driver The driver performing this identify action.
+ * \param parent The NewBus parent device for any devices this method adds.
+ */
+void xenbusb_identify(driver_t *driver, device_t parent);
+
+/**
+ * \brief Perform common XenBus bus attach processing.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param bus_node The XenStore path to the XenStore subtree for
+ * this XenBus bus.
+ * \param id_components The number of '/' separated path components that
+ * make up a unique device ID on this XenBus bus.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Initializes the softc for this bus, installs an interrupt-driven
+ * configuration hook to block boot processing until XenBus devices fully
+ * configure, performs an initial probe/attach of the bus, and registers
+ * a XenStore watch so we are notified when the bus topology changes.
+ */
+int xenbusb_attach(device_t dev, char *bus_node, u_int id_components);
+
+/**
+ * \brief Perform common XenBus bus resume handling.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xenbusb_resume(device_t dev);
+
+/**
+ * \brief Pretty-prints information about a child of a XenBus bus.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param child The NewBus device representing a child of dev's XenBus bus.
+ *
+ * \return The number of characters printed.
+ */
+int xenbusb_print_child(device_t dev, device_t child);
+
+/**
+ * \brief Common XenBus child instance variable read access method.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param child The NewBus device representing a child of dev's XenBus bus.
+ * \param index The index of the instance variable to access.
+ * \param result The value of the instance variable accessed.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xenbusb_read_ivar(device_t dev, device_t child, int index,
+ uintptr_t *result);
+
+/**
+ * \brief Common XenBus child instance variable write access method.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param child The NewBus device representing a child of dev's XenBus bus.
+ * \param index The index of the instance variable to access.
+ * \param value The new value to set in the instance variable accessed.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xenbusb_write_ivar(device_t dev, device_t child, int index,
+ uintptr_t value);
+
+/**
+ * \brief Attempt to add a XenBus device instance to this XenBus bus.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param type The device type being added (e.g. "vbd", "vif").
+ * \param id The device ID for this device.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure. Failure indicates that either the
+ * path to this device no longer exists or insufficient
+ * information exists in the XenStore to create a new
+ * device.
+ *
+ * If successful, this routine will add a device_t with instance
+ * variable storage to the NewBus device topology. Probe/Attach
+ * processing is not performed by this routine, but must be scheduled
+ * via the xbs_probe_children task. This separation of responsibilities
+ * is required to avoid hanging up the XenStore event delivery thread
+ * with our probe/attach work in the event a device is added via
+ * a callback from the XenStore.
+ */
+int xenbusb_add_device(device_t dev, const char *type, const char *id);
+
+#endif /* _XEN_XENBUS_XENBUSB_H */
diff --git a/sys/xen/xenbus/xenbusb_back.c b/sys/xen/xenbus/xenbusb_back.c
new file mode 100644
index 0000000..32bbc04
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb_back.c
@@ -0,0 +1,295 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2009, 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb_back.c
+ *
+ * XenBus management of the NewBus bus containing the backend instances of
+ * Xen split devices.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenbus/xenbusb.h>
+
+/*------------------ Private Device Attachment Functions --------------------*/
+/**
+ * \brief Probe for the existence of the XenBus back bus.
+ *
+ * \param dev NewBus device_t for this XenBus back bus instance.
+ *
+ * \return Always returns 0 indicating success.
+ */
+static int
+xenbusb_back_probe(device_t dev)
+{
+ device_set_desc(dev, "Xen Backend Devices");
+
+ return (0);
+}
+
+/**
+ * \brief Attach the XenBus back bus.
+ *
+ * \param dev NewBus device_t for this XenBus back bus instance.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_back_attach(device_t dev)
+{
+ struct xenbusb_softc *xbs;
+ int error;
+
+ xbs = device_get_softc(dev);
+ error = xenbusb_attach(dev, "backend", /*id_components*/2);
+
+ /*
+ * Backend devices operate to serve other domains,
+ * so there is no need to hold up boot processing
+ * while connections to foreign domains are made.
+ */
+ mtx_lock(&xbs->xbs_lock);
+ if ((xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) {
+ xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE;
+ mtx_unlock(&xbs->xbs_lock);
+ config_intrhook_disestablish(&xbs->xbs_attach_ch);
+ } else {
+ mtx_unlock(&xbs->xbs_lock);
+ }
+
+ return (error);
+}
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param dev NewBus device_t for this XenBus backend bus instance.
+ * \param type String indicating the device sub-tree (e.g. "vfb", "vif")
+ * to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Devices that are found are entered into the NewBus hierarchy via
+ * xenbusb_add_device(). xenbusb_add_device() ignores duplicate devices,
+ * so it can be called unconditionally
+ * for any device found in the XenStore.
+ *
+ * The backend XenStore hierarchy has the following format:
+ *
+ * backend/<device type>/<frontend vm id>/<device id>
+ */
+static int
+xenbusb_back_enumerate_type(device_t dev, const char *type)
+{
+ struct xenbusb_softc *xbs;
+ const char **vms;
+ u_int vm_idx;
+ u_int vm_count;
+ int error;
+
+ xbs = device_get_softc(dev);
+ error = xs_directory(XST_NIL, xbs->xbs_node, type, &vm_count, &vms);
+ if (error)
+ return (error);
+ for (vm_idx = 0; vm_idx < vm_count; vm_idx++) {
+ struct sbuf *vm_path;
+ const char *vm;
+ const char **devs;
+ u_int dev_idx;
+ u_int dev_count;
+
+ vm = vms[vm_idx];
+
+ vm_path = xs_join(type, vm);
+ error = xs_directory(XST_NIL, xbs->xbs_node, sbuf_data(vm_path),
+ &dev_count, &devs);
+ sbuf_delete(vm_path);
+ if (error)
+ break;
+
+ for (dev_idx = 0; dev_idx < dev_count; dev_idx++) {
+ const char *dev_num;
+ struct sbuf *id;
+
+ dev_num = devs[dev_idx];
+ id = xs_join(vm, dev_num);
+ xenbusb_add_device(dev, type, sbuf_data(id));
+ sbuf_delete(id);
+ }
+ free(devs, M_XENSTORE);
+ }
+
+ free(vms, M_XENSTORE);
+
+ return (0);
+}
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ * a split device whose local end is represented by ivars.
+ *
+ * \param dev NewBus device_t for this XenBus backend bus instance.
+ * \param ivars Instance variables from the XenBus child device for
+ * which to perform this function.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables will be updated.
+ */
+static int
+xenbusb_back_get_otherend_node(device_t dev, struct xenbus_device_ivars *ivars)
+{
+ char *otherend_path;
+ int error;
+
+ if (ivars->xd_otherend_path != NULL) {
+ free(ivars->xd_otherend_path, M_XENBUS);
+ ivars->xd_otherend_path = NULL;
+ }
+
+ error = xs_gather(XST_NIL, ivars->xd_node,
+ "frontend-id", "%i", &ivars->xd_otherend_id,
+ "frontend", NULL, &otherend_path,
+ NULL);
+
+ if (error == 0) {
+ ivars->xd_otherend_path = strdup(otherend_path, M_XENBUS);
+ free(otherend_path, M_XENSTORE);
+ }
+ return (error);
+}
+
+/**
+ * \brief Backend XenBus child instance variable write access method.
+ *
+ * \param dev The NewBus device representing this XenBus bus.
+ * \param child The NewBus device representing a child of dev's XenBus bus.
+ * \param index The index of the instance variable to access.
+ * \param value The new value to set in the instance variable accessed.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * xenbusb_back overrides this method so that it can trap state transitions
+ * of local backend devices and clean up their XenStore entries as necessary
+ * during device instance teardown.
+ */
+static int
+xenbusb_back_write_ivar(device_t dev, device_t child, int index,
+ uintptr_t value)
+{
+ int error;
+
+ error = xenbusb_write_ivar(dev, child, index, value);
+
+ if (index == XENBUS_IVAR_STATE
+ && (enum xenbus_state)value == XenbusStateClosed
+ && xenbus_dev_is_online(child) == 0) {
+
+ /*
+ * Cleanup the hotplug entry in the XenStore if
+ * present. The control domain expects any userland
+ * component associated with this device to destroy
+ * this node in order to signify it is safe to
+ * teardown the device. However, not all backends
+ * rely on userland components, and those that
+ * do should either use a communication channel
+ * other than the XenStore, or ensure the hotplug
+ * data is already cleaned up.
+ *
+ * This removal ensures that no matter what path
+ * is taken to mark a back-end closed, the control
+ * domain will understand that it is closed.
+ */
+ xs_rm(XST_NIL, xenbus_get_node(child), "hotplug-status");
+ }
+
+ return (error);
+}
+
+/*-------------------- Private Device Attachment Data -----------------------*/
+static device_method_t xenbusb_back_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, xenbusb_identify),
+ DEVMETHOD(device_probe, xenbusb_back_probe),
+ DEVMETHOD(device_attach, xenbusb_back_attach),
+ DEVMETHOD(device_detach, bus_generic_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus Interface */
+ DEVMETHOD(bus_print_child, xenbusb_print_child),
+ DEVMETHOD(bus_read_ivar, xenbusb_read_ivar),
+ DEVMETHOD(bus_write_ivar, xenbusb_back_write_ivar),
+ DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource),
+ DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+
+ /* XenBus Bus Interface */
+ DEVMETHOD(xenbusb_enumerate_type, xenbusb_back_enumerate_type),
+ DEVMETHOD(xenbusb_get_otherend_node, xenbusb_back_get_otherend_node),
+ { 0, 0 }
+};
+
+DEFINE_CLASS_0(xenbusb_back, xenbusb_back_driver, xenbusb_back_methods,
+ sizeof(struct xenbusb_softc));
+devclass_t xenbusb_back_devclass;
+
+DRIVER_MODULE(xenbusb_back, xenstore, xenbusb_back_driver,
+ xenbusb_back_devclass, 0, 0);
diff --git a/sys/xen/xenbus/xenbusb_front.c b/sys/xen/xenbus/xenbusb_front.c
new file mode 100644
index 0000000..0bc06a4
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb_front.c
@@ -0,0 +1,195 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2009, 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb_front.c
+ *
+ * XenBus management of the NewBus bus containing the frontend instances of
+ * Xen split devices.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenbus/xenbusb.h>
+
+/*------------------ Private Device Attachment Functions --------------------*/
+/**
+ * \brief Probe for the existence of the XenBus front bus.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ *
+ * \return Always returns 0 indicating success.
+ */
+static int
+xenbusb_front_probe(device_t dev)
+{
+ device_set_desc(dev, "Xen Frontend Devices");
+
+ return (0);
+}
+
+/**
+ * \brief Attach the XenBus front bus.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_front_attach(device_t dev)
+{
+ return (xenbusb_attach(dev, "device", /*id_components*/1));
+}
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ * \param type String indicating the device sub-tree (e.g. "vfb", "vif")
+ * to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Devices that are found are entered into the NewBus hierarchy via
+ * xenbusb_add_device(). xenbusb_add_device() ignores duplicate devices,
+ * so it can be called unconditionally
+ * for any device found in the XenStore.
+ */
+static int
+xenbusb_front_enumerate_type(device_t dev, const char *type)
+{
+ struct xenbusb_softc *xbs;
+ const char **dir;
+ unsigned int i, count;
+ int error;
+
+ xbs = device_get_softc(dev);
+ error = xs_directory(XST_NIL, xbs->xbs_node, type, &count, &dir);
+ if (error)
+ return (error);
+ for (i = 0; i < count; i++)
+ xenbusb_add_device(dev, type, dir[i]);
+
+ free(dir, M_XENSTORE);
+
+ return (0);
+}
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ * a split device whose local end is represented by ivars.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables will be updated.
+ *
+ * \param dev NewBus device_t for this XenBus front bus instance.
+ * \param ivars Instance variables from the XenBus child device for
+ * which to perform this function.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xenbusb_front_get_otherend_node(device_t dev, struct xenbus_device_ivars *ivars)
+{
+ char *otherend_path;
+ int error;
+
+ if (ivars->xd_otherend_path != NULL) {
+ free(ivars->xd_otherend_path, M_XENBUS);
+ ivars->xd_otherend_path = NULL;
+ }
+
+ error = xs_gather(XST_NIL, ivars->xd_node,
+ "backend-id", "%i", &ivars->xd_otherend_id,
+ "backend", NULL, &otherend_path,
+ NULL);
+
+ if (error == 0) {
+ ivars->xd_otherend_path = strdup(otherend_path, M_XENBUS);
+ free(otherend_path, M_XENSTORE);
+ }
+ return (error);
+}
+
+/*-------------------- Private Device Attachment Data -----------------------*/
+static device_method_t xenbusb_front_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, xenbusb_identify),
+ DEVMETHOD(device_probe, xenbusb_front_probe),
+ DEVMETHOD(device_attach, xenbusb_front_attach),
+ DEVMETHOD(device_detach, bus_generic_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus Interface */
+ DEVMETHOD(bus_print_child, xenbusb_print_child),
+ DEVMETHOD(bus_read_ivar, xenbusb_read_ivar),
+ DEVMETHOD(bus_write_ivar, xenbusb_write_ivar),
+ DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource),
+ DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+
+ /* XenBus Bus Interface */
+ DEVMETHOD(xenbusb_enumerate_type, xenbusb_front_enumerate_type),
+ DEVMETHOD(xenbusb_get_otherend_node, xenbusb_front_get_otherend_node),
+ { 0, 0 }
+};
+
+DEFINE_CLASS_0(xenbusb_front, xenbusb_front_driver, xenbusb_front_methods,
+ sizeof(struct xenbusb_softc));
+devclass_t xenbusb_front_devclass;
+
+DRIVER_MODULE(xenbusb_front, xenstore, xenbusb_front_driver,
+ xenbusb_front_devclass, 0, 0);
diff --git a/sys/xen/xenbus/xenbusb_if.m b/sys/xen/xenbus/xenbusb_if.m
new file mode 100644
index 0000000..a32e3f6
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb_if.m
@@ -0,0 +1,78 @@
+#-
+# Copyright (c) 2010 Spectra Logic Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions, and the following disclaimer,
+# without modification.
+# 2. Redistributions in binary form must reproduce at minimum a disclaimer
+# substantially similar to the "NO WARRANTY" disclaimer below
+# ("Disclaimer") and any redistribution must be conditioned upon
+# including a substantially similar Disclaimer requirement for further
+# binary redistribution.
+#
+# NO WARRANTY
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGES.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+HEADER {
+struct xenbus_device_ivars;
+}
+
+INTERFACE xenbusb;
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param _dev NewBus device_t for this XenBus (front/back) bus instance.
+ * \param _type String indicating the device sub-tree (e.g. "vfb", "vif")
+ * to enumerate.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * Devices that are found should be entered into the NewBus hierarchy via
+ * xenbusb_add_device(). xenbusb_add_device() ignores duplicate devices,
+ * so it can be called unconditionally
+ * for any device found in the XenStore.
+ */
+METHOD int enumerate_type {
+ device_t _dev;
+ const char *_type;
+};
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ * a split device whose local end is represented by ivars.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables must be updated.
+ *
+ * \param _dev NewBus device_t for this XenBus (front/back) bus instance.
+ * \param _ivars Instance variables from the XenBus child device for
+ * which to perform this function.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+METHOD int get_otherend_node {
+ device_t _dev;
+ struct xenbus_device_ivars *_ivars;
+};
diff --git a/sys/xen/xenbus/xenbusvar.h b/sys/xen/xenbus/xenbusvar.h
index 6511664..55d7f29 100644
--- a/sys/xen/xenbus/xenbusvar.h
+++ b/sys/xen/xenbus/xenbusvar.h
@@ -1,8 +1,4 @@
/******************************************************************************
- * xenbus.h
- *
- * Talks to Xen Store to figure out what devices we have.
- *
* Copyright (C) 2005 Rusty Russell, IBM Corporation
* Copyright (C) 2005 XenSource Ltd.
*
@@ -30,46 +26,64 @@
* $FreeBSD$
*/
+/**
+ * \file xenbusvar.h
+ *
+ * \brief Data structures and function declarations for use by device
+ * drivers operating on the XenBus.
+ */
+
#ifndef _XEN_XENBUS_XENBUSVAR_H
#define _XEN_XENBUS_XENBUSVAR_H
#include <sys/queue.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
#include <machine/xen/xen-os.h>
+
+#include <xen/interface/grant_table.h>
#include <xen/interface/io/xenbus.h>
#include <xen/interface/io/xs_wire.h>
+#include <xen/xenstore/xenstorevar.h>
+
#include "xenbus_if.h"
+/* XenBus allocations including XenStore data returned to clients. */
+MALLOC_DECLARE(M_XENBUS);
+
enum {
- /*
+ /**
* Path of this device node.
*/
XENBUS_IVAR_NODE,
- /*
+ /**
* The device type (e.g. vif, vbd).
*/
XENBUS_IVAR_TYPE,
- /*
+ /**
* The state of this device (not the otherend's state).
*/
XENBUS_IVAR_STATE,
- /*
+ /**
* Domain ID of the other end device.
*/
XENBUS_IVAR_OTHEREND_ID,
- /*
+ /**
* Path of the other end device.
*/
XENBUS_IVAR_OTHEREND_PATH
};
-/*
+/**
* Simplified accessors for xenbus devices
*/
#define XENBUS_ACCESSOR(var, ivar, type) \
@@ -81,179 +95,184 @@ XENBUS_ACCESSOR(state, STATE, enum xenbus_state)
XENBUS_ACCESSOR(otherend_id, OTHEREND_ID, int)
XENBUS_ACCESSOR(otherend_path, OTHEREND_PATH, const char *)
-/* Register callback to watch this node. */
-struct xenbus_watch
-{
- LIST_ENTRY(xenbus_watch) list;
-
- /* Path being watched. */
- char *node;
-
- /* Callback (executed in a process context with no locks held). */
- void (*callback)(struct xenbus_watch *,
- const char **vec, unsigned int len);
-};
-
-typedef int (*xenstore_event_handler_t)(void *);
-
-struct xenbus_transaction
-{
- uint32_t id;
-};
-
-#define XBT_NIL ((struct xenbus_transaction) { 0 })
-
-int xenbus_directory(struct xenbus_transaction t, const char *dir,
- const char *node, unsigned int *num, char ***result);
-int xenbus_read(struct xenbus_transaction t, const char *dir,
- const char *node, unsigned int *len, void **result);
-int xenbus_write(struct xenbus_transaction t, const char *dir,
- const char *node, const char *string);
-int xenbus_mkdir(struct xenbus_transaction t, const char *dir,
- const char *node);
-int xenbus_exists(struct xenbus_transaction t, const char *dir,
- const char *node);
-int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
-int xenbus_transaction_start(struct xenbus_transaction *t);
-int xenbus_transaction_end(struct xenbus_transaction t, int abort);
-
-/*
- * Single read and scanf: returns errno or zero. If scancountp is
- * non-null, then number of items scanned is returned in *scanncountp.
- */
-int xenbus_scanf(struct xenbus_transaction t,
- const char *dir, const char *node, int *scancountp, const char *fmt, ...)
- __attribute__((format(scanf, 5, 6)));
-
-/* Single printf and write: returns errno or 0. */
-int xenbus_printf(struct xenbus_transaction t,
- const char *dir, const char *node, const char *fmt, ...)
- __attribute__((format(printf, 4, 5)));
-
-/*
- * Generic read function: NULL-terminated triples of name,
- * sprintf-style type string, and pointer. Returns 0 or errno.
+/**
+ * Return the state of a XenBus device.
+ *
+ * \param path The root XenStore path for the device.
+ *
+ * \return The current state of the device or XenbusStateClosed if no
+ * state can be read.
*/
-int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
-
-/* notifer routines for when the xenstore comes up */
-int register_xenstore_notifier(xenstore_event_handler_t func, void *arg, int priority);
-#if 0
-void unregister_xenstore_notifier();
-#endif
-int register_xenbus_watch(struct xenbus_watch *watch);
-void unregister_xenbus_watch(struct xenbus_watch *watch);
-void xs_suspend(void);
-void xs_resume(void);
-
-/* Used by xenbus_dev to borrow kernel's store connection. */
-int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result);
-
-#if 0
-
-#define XENBUS_IS_ERR_READ(str) ({ \
- if (!IS_ERR(str) && strlen(str) == 0) { \
- free(str, M_DEVBUF); \
- str = ERR_PTR(-ERANGE); \
- } \
- IS_ERR(str); \
-})
-
-#endif
-
-#define XENBUS_EXIST_ERR(err) ((err) == ENOENT || (err) == ERANGE)
-
+XenbusState xenbus_read_driver_state(const char *path);
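+/*
+ * Illustrative call: the xenbusb bus code samples the state of a
+ * device's peer in exactly this way from its otherend watch callback:
+ *
+ *	newstate = xenbus_read_driver_state(xenbus_get_otherend_path(dev));
+ */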
/**
- * Register a watch on the given path, using the given xenbus_watch structure
- * for storage, and the given callback function as the callback. Return 0 on
- * success, or errno on error. On success, the given path will be saved as
- * watch->node, and remains the caller's to free. On error, watch->node will
- * be NULL, the device will switch to XenbusStateClosing, and the error will
- * be saved in the store.
+ * Initialize and register a watch on the given path (client-supplied storage).
+ *
+ * \param dev The XenBus device requesting the watch service.
+ * \param path The XenStore path of the object to be watched. The
+ * storage for this string must be stable for the lifetime
+ * of the watch.
+ * \param watch The watch object to use for this request. This object
+ * must be stable for the lifetime of the watch.
+ * \param callback The function to call when XenStore objects at or below
+ * path are modified.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note On error, the device 'dev' will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for dev in the XenStore.
*/
int xenbus_watch_path(device_t dev, char *path,
- struct xenbus_watch *watch,
- void (*callback)(struct xenbus_watch *,
- const char **, unsigned int));
-
+ struct xs_watch *watch,
+ xs_watch_cb_t *callback);
/**
- * Register a watch on the given path/path2, using the given xenbus_watch
- * structure for storage, and the given callback function as the callback.
- * Return 0 on success, or errno on error. On success, the watched path
- * (path/path2) will be saved as watch->node, and becomes the caller's to
- * kfree(). On error, watch->node will be NULL, so the caller has nothing to
- * free, the device will switch to XenbusStateClosing, and the error will be
- * saved in the store.
+ * Initialize and register a watch at path/path2 in the XenStore.
+ *
+ * \param dev The XenBus device requesting the watch service.
+ * \param path The base XenStore path of the object to be watched.
+ * \param path2 The tail XenStore path of the object to be watched.
+ * \param watch The watch object to use for this request. This object
+ * must be stable for the lifetime of the watch.
+ * \param callback The function to call when XenStore objects at or below
+ * path are modified.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
+ *
+ * Similar to xenbus_watch_path, however the storage for the path to the
+ * watched object is allocated from the heap and filled with "path '/' path2".
+ * Should a call to this function succeed, it is the caller's responsibility
+ * to free watch->node using the M_XENBUS malloc type.
*/
int xenbus_watch_path2(device_t dev, const char *path,
- const char *path2, struct xenbus_watch *watch,
- void (*callback)(struct xenbus_watch *,
- const char **, unsigned int));
-
+ const char *path2, struct xs_watch *watch,
+ xs_watch_cb_t *callback);
/**
- * Advertise in the store a change of the given driver to the given new_state.
- * which case this is performed inside its own transaction. Return 0 on
- * success, or errno on error. On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
+ * Grant access to the given ring_mfn to the peer of the given device.
+ *
+ * \param dev The device granting access to the ring page.
+ * \param ring_mfn The guest machine page number of the page to grant
+ * peer access rights.
+ * \param refp[out] The grant reference for the page.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * A successful call to xenbus_grant_ring should be paired with a call
+ * to gnttab_end_foreign_access() when foreign access to this page is no
+ * longer required.
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
*/
-int xenbus_switch_state(device_t dev,
- XenbusState new_state);
-
+int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp);
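+/*
+ * Usage sketch; "sc" and "ring_mfn" are hypothetical ("ring_mfn" being
+ * the machine frame number of an already-allocated shared ring page):
+ *
+ *	error = xenbus_grant_ring(dev, ring_mfn, &sc->ring_ref);
+ *	...
+ *	gnttab_end_foreign_access(sc->ring_ref, ...);
+ */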
/**
- * Grant access to the given ring_mfn to the peer of the given device.
- * Return 0 on success, or errno on error. On error, the device will
- * switch to XenbusStateClosing, and the error will be saved in the
- * store. The grant ring reference is returned in *refp.
+ * Allocate an event channel for the given XenBus device.
+ *
+ * \param dev The device for which to allocate the event channel.
+ * \param port[out] The port identifier for the allocated event channel.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * A successfully allocated event channel should be freed using
+ * xenbus_free_evtchn().
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
*/
-int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, int *refp);
-
+int xenbus_alloc_evtchn(device_t dev, evtchn_port_t *port);
/**
- * Allocate an event channel for the given xenbus_device, assigning the newly
- * created local port to *port. Return 0 on success, or errno on error. On
- * error, the device will switch to XenbusStateClosing, and the error will be
- * saved in the store.
+ * Free an existing event channel.
+ *
+ * \param dev The device which allocated this event channel.
+ * \param port The port identifier for the event channel to free.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note On error, \a dev will be switched to the XenbusStateClosing
+ * state and the returned error is saved in the per-device error node
+ * for \a dev in the XenStore.
*/
-int xenbus_alloc_evtchn(device_t dev, int *port);
-
+int xenbus_free_evtchn(device_t dev, evtchn_port_t port);
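+/*
+ * The allocation and free calls pair as follows (error handling and
+ * interrupt binding elided):
+ *
+ *	evtchn_port_t port;
+ *
+ *	error = xenbus_alloc_evtchn(dev, &port);
+ *	...
+ *	xenbus_free_evtchn(dev, port);
+ */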
/**
- * Free an existing event channel. Returns 0 on success or errno on error.
+ * Record the given errno, along with the given printf-style formatted
+ * message, in dev's device-specific error node in the XenStore.
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string followed by a variable number of
+ * printf arguments.
*/
-int xenbus_free_evtchn(device_t dev, int port);
-
+void xenbus_dev_error(device_t dev, int err, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
/**
- * Return the state of the driver rooted at the given store path, or
- * XenbusStateClosed if no state can be read.
+ * va_list version of xenbus_dev_error().
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string.
+ * \param ap Va_list of printf arguments.
*/
-XenbusState xenbus_read_driver_state(const char *path);
+void xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap)
+ __attribute__((format(printf, 3, 0)));
-
-/***
- * Report the given negative errno into the store, along with the given
- * formatted message.
+/**
+ * Equivalent to xenbus_dev_error(), followed by
+ * xenbus_set_state(dev, XenbusStateClosing).
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string followed by a variable number of
+ * printf arguments.
*/
-void xenbus_dev_error(device_t dev, int err, const char *fmt,
- ...);
-
+void xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
-/***
- * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
- * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
- * closedown of this driver and its peer.
+/**
+ * va_list version of xenbus_dev_fatal().
+ *
+ * \param dev The device which encountered the error.
+ * \param err The errno value corresponding to the error.
+ * \param fmt Printf format string.
+ * \param ap Va_list of printf arguments.
*/
-void xenbus_dev_fatal(device_t dev, int err, const char *fmt,
- ...);
-
-int xenbus_dev_init(void);
+void xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list)
+ __attribute__((format(printf, 3, 0)));
+/**
+ * Convert a member of the xenbus_state enum into an ASCII string.
+ *
+ * \param state The XenBus state to look up.
+ *
+ * \return A string representing state or, for unrecognized states,
+ * the string "Unknown".
+ */
const char *xenbus_strstate(enum xenbus_state state);
+
+/**
+ * Return the value of a XenBus device's "online" node within the XenStore.
+ *
+ * \param dev The XenBus device to query.
+ *
+ * \return The value of the "online" node for the device. If the node
+ * does not exist, 0 (offline) is returned.
+ */
int xenbus_dev_is_online(device_t dev);
-int xenbus_frontend_closed(device_t dev);
#endif /* _XEN_XENBUS_XENBUSVAR_H */
diff --git a/sys/xen/xenstore/xenstore.c b/sys/xen/xenstore/xenstore.c
new file mode 100644
index 0000000..76dfb5a
--- /dev/null
+++ b/sys/xen/xenstore/xenstore.c
@@ -0,0 +1,1654 @@
+/******************************************************************************
+ * xenstore.c
+ *
+ * Low-level kernel interface to the XenStore.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2009,2010 Spectra Logic Corporation
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/syslog.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/hvm/params.h>
+
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenstore/xenstore_internal.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+/**
+ * \file xenstore.c
+ * \brief XenStore interface
+ *
+ * The XenStore interface is a simple storage system that is a means of
+ * communicating state and configuration data between the Xen Domain 0
+ * and the various guest domains. All configuration data, other than
+ * a small amount of essential information required during the early
+ * boot process of launching a Xen-aware guest, is managed using the
+ * XenStore.
+ *
+ * The XenStore is ASCII string based, and has a structure and semantics
+ * similar to a filesystem. There are files and directories, and a
+ * directory may contain files or other directories. The depth of the
+ * hierarchy is limited only by the XenStore's maximum path length.
+ *
+ * The communication channel between the XenStore service and other
+ * domains is via two, guest specific, ring buffers in a shared memory
+ * area. One ring buffer is used for communicating in each direction.
+ * The grant table references for this shared memory are given to the
+ * guest either via the xen_start_info structure for a fully para-
+ * virtualized guest, or via HVM hypercalls for a hardware virtualized
+ * guest.
+ *
+ * The XenStore communication relies on an event channel and thus
+ * interrupts. For this reason, the attachment of the XenStore
+ * relies on an interrupt driven configuration hook to hold off
+ * boot processing until communication with the XenStore service
+ * can be established.
+ *
+ * Several Xen services depend on the XenStore, most notably the
+ * XenBus used to discover and manage Xen devices. These services
+ * are implemented as NewBus child attachments to a bus exported
+ * by this XenStore driver.
+ */
+
+static struct xs_watch *find_watch(const char *token);
+
+MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results");
+
+/**
+ * Pointer to shared memory communication structures allowing us
+ * to communicate with the XenStore service.
+ *
+ * When operating in full PV mode, this pointer is set early in kernel
+ * startup from within xen_machdep.c. In HVM mode, we use hypercalls
+ * to get the guest frame number for the shared page and then map it
+ * into kva. See xs_init() for details.
+ */
+struct xenstore_domain_interface *xen_store;
+
+/*-------------------------- Private Data Structures ------------------------*/
+
+/**
+ * Structure capturing messages received from the XenStore service.
+ */
+struct xs_stored_msg {
+ TAILQ_ENTRY(xs_stored_msg) list;
+
+ struct xsd_sockmsg hdr;
+
+ union {
+ /* Queued replies. */
+ struct {
+ char *body;
+ } reply;
+
+ /* Queued watch events. */
+ struct {
+ struct xs_watch *handle;
+ const char **vec;
+ u_int vec_size;
+ } watch;
+ } u;
+};
+TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg);
+
+/**
+ * Container for all XenStore related state.
+ */
+struct xs_softc {
+ /** Newbus device for the XenStore. */
+ device_t xs_dev;
+
+ /**
+ * Lock serializing access to ring producer/consumer
+ * indexes. Use of this lock guarantees that wakeups
+ * of blocking readers/writers are not missed due to
+ * races with the XenStore service.
+ */
+ struct mtx ring_lock;
+
+ /**
+ * Mutex used to ensure exclusive access to the outgoing
+ * communication ring. We use a lock type that can be
+ * held while sleeping so that xs_write() can block waiting
+ * for space in the ring to free up, without allowing another
+ * writer to come in and corrupt a partial message write.
+ */
+ struct sx request_mutex;
+
+ /**
+ * A list of replies to our requests.
+ *
+ * The reply list is filled by xs_rcv_thread(). It
+ * is consumed by the context that issued the request
+ * to which a reply is made. The requester blocks in
+ * xs_read_reply().
+ *
+ * \note Only one requesting context can be active at a time.
+ * This is guaranteed by the request_mutex and ensures
+ * that the requester sees replies matching the order
+ * of its requests.
+ */
+ struct xs_stored_msg_list reply_list;
+
+ /** Lock protecting the reply list. */
+ struct mtx reply_lock;
+
+ /**
+ * List of registered watches.
+ */
+ struct xs_watch_list registered_watches;
+
+ /** Lock protecting the registered watches list. */
+ struct mtx registered_watches_lock;
+
+ /**
+ * List of pending watch callback events.
+ */
+ struct xs_stored_msg_list watch_events;
+
+ /** Lock protecting the watch callback list. */
+ struct mtx watch_events_lock;
+
+ /**
+ * Sleepable lock used to prevent VM suspension while a
+ * XenStore transaction is outstanding.
+ *
+ * Each active transaction holds a shared lock on the
+ * suspend mutex. Our suspend method blocks waiting
+ * to acquire an exclusive lock. This guarantees that
+ * suspend processing will only proceed once all active
+ * transactions have been retired.
+ */
+ struct sx suspend_mutex;
+
+ /**
+ * The process ID of the xenwatch thread.
+ */
+ pid_t xenwatch_pid;
+
+ /**
+ * Sleepable mutex used to gate the execution of XenStore
+ * watch event callbacks.
+ *
+ * xenwatch_thread holds an exclusive lock on this mutex
+ * while delivering event callbacks, and xs_unregister_watch()
+ * acquires an exclusive lock of this mutex to guarantee that no
+ * callbacks of the just-unregistered watch are pending
+ * before returning to its caller.
+ */
+ struct sx xenwatch_mutex;
+
+#ifdef XENHVM
+ /**
+ * The HVM guest pseudo-physical frame number. This is Xen's mapping
+ * of the true machine frame number into our "physical address space".
+ */
+ unsigned long gpfn;
+#endif
+
+ /**
+ * The event channel for communicating with the
+ * XenStore service.
+ */
+ int evtchn;
+
+ /** Interrupt number for our event channel. */
+ u_int irq;
+
+ /**
+ * Interrupt driven config hook allowing us to defer
+ * attaching children until interrupts (and thus communication
+ * with the XenStore service) are available.
+ */
+ struct intr_config_hook xs_attachcb;
+};
+
+/*-------------------------------- Global Data ------------------------------*/
+static struct xs_softc xs;
+
+/*------------------------- Private Utility Functions -----------------------*/
+
+/**
+ * Count and optionally record pointers to a number of NUL terminated
+ * strings in a buffer.
+ *
+ * \param strings A pointer to a contiguous buffer of NUL terminated strings.
+ * \param dest An array to store pointers to each string found in strings.
+ * \param len The length of the buffer pointed to by strings.
+ *
+ * \return A count of the number of strings found.
+ */
+static u_int
+extract_strings(const char *strings, const char **dest, u_int len)
+{
+ u_int num;
+ const char *p;
+
+ for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) {
+ if (dest != NULL)
+ *dest++ = p;
+ num++;
+ }
+
+ return (num);
+}
+
+/**
+ * Convert a contiguous buffer containing a series of NUL terminated
+ * strings into an array of pointers to strings.
+ *
+ * The returned pointer references the array of string pointers which
+ * is followed by the storage for the string data. It is the client's
+ * responsibility to free this storage.
+ *
+ * The storage addressed by strings is free'd prior to split returning.
+ *
+ * \param strings A pointer to a contiguous buffer of NUL terminated strings.
+ * \param len The length of the buffer pointed to by strings.
+ * \param num The number of strings found and returned in the strings
+ * array.
+ *
+ * \return An array of pointers to the strings found in the input buffer.
+ */
+static const char **
+split(char *strings, u_int len, u_int *num)
+{
+ const char **ret;
+
+ /* Protect against unterminated buffers. */
+ strings[len - 1] = '\0';
+
+ /* Count the strings. */
+ *num = extract_strings(strings, /*dest*/NULL, len);
+
+ /* Transfer to one big alloc for easy freeing by the caller. */
+ ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK);
+ memcpy(&ret[*num], strings, len);
+ free(strings, M_XENSTORE);
+
+ /* Extract pointers to newly allocated array. */
+ strings = (char *)&ret[*num];
+ (void)extract_strings(strings, /*dest*/ret, len);
+
+ return (ret);
+}
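Because the string pointers and string storage share one allocation, a
caller's cleanup reduces to a single free; a sketch (hypothetical caller of
this static helper):

	static void
	example_split_usage(char *strings, u_int len)
	{
		const char **names;
		u_int count, i;

		names = split(strings, len, &count);	/* consumes 'strings' */
		for (i = 0; i < count; i++)
			printf("entry %u: %s\n", i, names[i]);
		free(names, M_XENSTORE);		/* one free releases all */
	}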
+
+/*------------------------- Public Utility Functions -------------------------*/
+/*------- API comments for these methods can be found in xenstorevar.h -------*/
+struct sbuf *
+xs_join(const char *dir, const char *name)
+{
+ struct sbuf *sb;
+
+ sb = sbuf_new_auto();
+ sbuf_cat(sb, dir);
+ if (name[0] != '\0') {
+ sbuf_putc(sb, '/');
+ sbuf_cat(sb, name);
+ }
+ sbuf_finish(sb);
+
+ return (sb);
+}
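For example, xs_join("device/vbd", "0") yields the sbuf "device/vbd/0",
while an empty name yields just the directory; the caller releases the
result with sbuf_delete().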
+
+/*-------------------- Low Level Communication Management --------------------*/
+/**
+ * Interrupt handler for the XenStore event channel.
+ *
+ * XenStore reads and writes block on "xen_store" for buffer
+ * space. Wakeup any blocking operations when the XenStore
+ * service has modified the queues.
+ */
+static void
+xs_intr(void *arg __unused)
+{
+
+ /*
+ * Hold ring lock across wakeup so that clients
+ * cannot miss a wakeup.
+ */
+ mtx_lock(&xs.ring_lock);
+ wakeup(xen_store);
+ mtx_unlock(&xs.ring_lock);
+}
+
+/**
+ * Verify that the indexes for a ring are valid.
+ *
+ * The difference between the producer and consumer cannot
+ * exceed the size of the ring.
+ *
+ * \param cons The consumer index for the ring to test.
+ * \param prod The producer index for the ring to test.
+ *
+ * \retval 1 If indexes are in range.
+ * \retval 0 If the indexes are out of range.
+ */
+static int
+xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
+{
+
+ return ((prod - cons) <= XENSTORE_RING_SIZE);
+}
+
+/**
+ * Return a pointer to, and the length of, the contiguous
+ * free region available for output in a ring buffer.
+ *
+ * \param cons The consumer index for the ring.
+ * \param prod The producer index for the ring.
+ * \param buf The base address of the ring's storage.
+ * \param len The amount of contiguous storage available.
+ *
+ * \return A pointer to the start location of the free region.
+ */
+static void *
+xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
+ char *buf, uint32_t *len)
+{
+
+ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
+ if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
+ *len = XENSTORE_RING_SIZE - (prod - cons);
+ return (buf + MASK_XENSTORE_IDX(prod));
+}
+
+/**
+ * Return a pointer to, and the length of, the contiguous
+ * data available to read from a ring buffer.
+ *
+ * \param cons The consumer index for the ring.
+ * \param prod The producer index for the ring.
+ * \param buf The base address of the ring's storage.
+ * \param len The amount of contiguous data available to read.
+ *
+ * \return A pointer to the start location of the available data.
+ */
+static const void *
+xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
+ const char *buf, uint32_t *len)
+{
+
+ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
+ if ((prod - cons) < *len)
+ *len = prod - cons;
+ return (buf + MASK_XENSTORE_IDX(cons));
+}
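As a worked example with XENSTORE_RING_SIZE == 1024: for cons == 1000 and
prod == 1050, the output routine reports the 974 free bytes starting at
offset MASK_XENSTORE_IDX(1050) == 26 (998 contiguous bytes remain before
the wrap, but only 974 of them are free), while the input routine reports
only the 24 unread bytes from offset 1000 up to the wrap; the remaining 26
bytes are picked up by the next call.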
+
+/**
+ * Transmit data to the XenStore service.
+ *
+ * \param tdata A pointer to the contiguous data to send.
+ * \param len The amount of data to send.
+ *
+ * \return On success 0, otherwise an errno value indicating the
+ * cause of failure.
+ *
+ * \invariant Called from thread context.
+ * \invariant The buffer pointed to by tdata is at least len bytes
+ * in length.
+ * \invariant xs.request_mutex exclusively locked.
+ */
+static int
+xs_write_store(const void *tdata, unsigned len)
+{
+ XENSTORE_RING_IDX cons, prod;
+ const char *data = (const char *)tdata;
+ int error;
+
+ sx_assert(&xs.request_mutex, SX_XLOCKED);
+ while (len != 0) {
+ void *dst;
+ u_int avail;
+
+ /* Hold lock so we can't miss wakeups should we block. */
+ mtx_lock(&xs.ring_lock);
+ cons = xen_store->req_cons;
+ prod = xen_store->req_prod;
+ if ((prod - cons) == XENSTORE_RING_SIZE) {
+ /*
+ * Output ring is full. Wait for a ring event.
+ *
+ * Note that the events from both queues
+ * are combined, so being woken does not
+ * guarantee that data exist in the read
+ * ring.
+ *
+ * To simplify error recovery and the retry,
+ * we specify PDROP so our lock is *not* held
+ * when msleep returns.
+ */
+ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
+ "xbwrite", /*timeout*/0);
+ if (error && error != EWOULDBLOCK)
+ return (error);
+
+ /* Try again. */
+ continue;
+ }
+ mtx_unlock(&xs.ring_lock);
+
+ /* Verify queue sanity. */
+ if (!xs_check_indexes(cons, prod)) {
+ xen_store->req_cons = xen_store->req_prod = 0;
+ return (EIO);
+ }
+
+ dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail);
+ if (avail > len)
+ avail = len;
+
+ memcpy(dst, data, avail);
+ data += avail;
+ len -= avail;
+
+ /*
+ * The store to the producer index, which indicates
+ * to the other side that new data has arrived, must
+ * be visible only after our copy of the data into the
+ * ring has completed.
+ */
+ wmb();
+ xen_store->req_prod += avail;
+
+ /*
+ * notify_remote_via_evtchn implies mb(). The other side
+ * will see the change to req_prod at the time of the
+ * interrupt.
+ */
+ notify_remote_via_evtchn(xs.evtchn);
+ }
+
+ return (0);
+}
+
+/**
+ * Receive data from the XenStore service.
+ *
+ * \param tdata A pointer to the contiguous buffer to receive the data.
+ * \param len The amount of data to receive.
+ *
+ * \return On success 0, otherwise an errno value indicating the
+ * cause of failure.
+ *
+ * \invariant Called from thread context.
+ * \invariant The buffer pointed to by tdata is at least len bytes
+ * in length.
+ *
+ * \note xs_read does not perform any internal locking to guarantee
+ * serial access to the incoming ring buffer. However, there
+ * is only one context processing reads: xs_rcv_thread().
+ */
+static int
+xs_read_store(void *tdata, unsigned len)
+{
+ XENSTORE_RING_IDX cons, prod;
+ char *data = (char *)tdata;
+ int error;
+
+ while (len != 0) {
+ u_int avail;
+ const char *src;
+
+ /* Hold lock so we can't miss wakeups should we block. */
+ mtx_lock(&xs.ring_lock);
+ cons = xen_store->rsp_cons;
+ prod = xen_store->rsp_prod;
+ if (cons == prod) {
+ /*
+ * Nothing to read. Wait for a ring event.
+ *
+ * Note that the events from both queues
+ * are combined, so being woken does not
+ * guarantee that data exist in the read
+ * ring.
+ *
+ * To simplify error recovery and the retry,
+ * we specify PDROP so our lock is *not* held
+ * when msleep returns.
+ */
+ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
+ "xbread", /*timout*/0);
+ if (error && error != EWOULDBLOCK)
+ return (error);
+ continue;
+ }
+ mtx_unlock(&xs.ring_lock);
+
+ /* Verify queue sanity. */
+ if (!xs_check_indexes(cons, prod)) {
+ xen_store->rsp_cons = xen_store->rsp_prod = 0;
+ return (EIO);
+ }
+
+ src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail);
+ if (avail > len)
+ avail = len;
+
+ /*
+ * Ensure the data we read is consistent with the indexes
+ * we read above.
+ */
+ rmb();
+
+ memcpy(data, src, avail);
+ data += avail;
+ len -= avail;
+
+ /*
+ * Ensure that the producer of this ring does not see
+ * the ring space as free until after we have copied it
+ * out.
+ */
+ mb();
+ xen_store->rsp_cons += avail;
+
+ /*
+ * notify_remote_via_evtchn implies mb(). The producer
+ * will see the updated consumer index when the event
+ * is delivered.
+ */
+ notify_remote_via_evtchn(xs.evtchn);
+ }
+
+ return (0);
+}
+
+/*----------------------- Received Message Processing ------------------------*/
+/**
+ * Block reading the next message from the XenStore service and
+ * process the result.
+ *
+ * \param type The returned type of the XenStore message received.
+ *
+ * \return 0 on success. Otherwise an errno value indicating the
+ * type of failure encountered.
+ */
+static int
+xs_process_msg(enum xsd_sockmsg_type *type)
+{
+ struct xs_stored_msg *msg;
+ char *body;
+ int error;
+
+ msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK);
+ error = xs_read_store(&msg->hdr, sizeof(msg->hdr));
+ if (error) {
+ free(msg, M_XENSTORE);
+ return (error);
+ }
+
+ body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK);
+ error = xs_read_store(body, msg->hdr.len);
+ if (error) {
+ free(body, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ return (error);
+ }
+ body[msg->hdr.len] = '\0';
+
+ *type = msg->hdr.type;
+ if (msg->hdr.type == XS_WATCH_EVENT) {
+ msg->u.watch.vec = split(body, msg->hdr.len,
+ &msg->u.watch.vec_size);
+
+ mtx_lock(&xs.registered_watches_lock);
+ msg->u.watch.handle = find_watch(
+ msg->u.watch.vec[XS_WATCH_TOKEN]);
+ if (msg->u.watch.handle != NULL) {
+ mtx_lock(&xs.watch_events_lock);
+ TAILQ_INSERT_TAIL(&xs.watch_events, msg, list);
+ wakeup(&xs.watch_events);
+ mtx_unlock(&xs.watch_events_lock);
+ } else {
+ free(msg->u.watch.vec, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ }
+ mtx_unlock(&xs.registered_watches_lock);
+ } else {
+ msg->u.reply.body = body;
+ mtx_lock(&xs.reply_lock);
+ TAILQ_INSERT_TAIL(&xs.reply_list, msg, list);
+ wakeup(&xs.reply_list);
+ mtx_unlock(&xs.reply_lock);
+ }
+
+ return (0);
+}
+
+/**
+ * Thread body of the XenStore receive thread.
+ *
+ * This thread blocks waiting for data from the XenStore service
+ * and processes any received messages.
+ */
+static void
+xs_rcv_thread(void *arg __unused)
+{
+ int error;
+ enum xsd_sockmsg_type type;
+
+ for (;;) {
+ error = xs_process_msg(&type);
+ if (error)
+ printf("XENSTORE error %d while reading message\n",
+ error);
+ }
+}
+
+/*---------------- XenStore Message Request/Reply Processing -----------------*/
+/**
+ * Filter invoked before transmitting any message to the XenStore service.
+ *
+ * The role of the filter may expand, but it currently serves to manage
+ * the interactions of messages with transaction state.
+ *
+ * \param request_msg_type The message type for the request.
+ */
+static inline void
+xs_request_filter(uint32_t request_msg_type)
+{
+ if (request_msg_type == XS_TRANSACTION_START)
+ sx_slock(&xs.suspend_mutex);
+}
+
+/**
+ * Filter invoked after transmitting any message to the XenStore service.
+ *
+ * The role of the filter may expand, but it currently serves to manage
+ * the interactions of messages with transaction state.
+ *
+ * \param request_msg_type The message type for the original request.
+ * \param reply_msg_type The message type for any received reply.
+ * \param request_reply_error The error status from the attempt to send
+ * the request or retrieve the reply.
+ */
+static inline void
+xs_reply_filter(uint32_t request_msg_type,
+ uint32_t reply_msg_type, int request_reply_error)
+{
+ /*
+ * The count of active transactions drops if we attempted
+ * to end a transaction (even if that attempt failed),
+ * if we receive a transaction end acknowledgement,
+ * or if our attempt to begin a transaction fails.
+ */
+ if (request_msg_type == XS_TRANSACTION_END
+ || (request_reply_error == 0 && reply_msg_type == XS_TRANSACTION_END)
+ || (request_msg_type == XS_TRANSACTION_START
+ && (request_reply_error != 0 || reply_msg_type == XS_ERROR)))
+ sx_sunlock(&xs.suspend_mutex);
+}
+
+#define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0]))
+
+/**
+ * Convert a XenStore error string into an errno number.
+ *
+ * \param errorstring The error string to convert.
+ *
+ * \return The errno best matching the input string.
+ *
+ * \note Unknown error strings are converted to EINVAL.
+ */
+static int
+xs_get_error(const char *errorstring)
+{
+ u_int i;
+
+ for (i = 0; i < xsd_error_count; i++) {
+ if (!strcmp(errorstring, xsd_errors[i].errstring))
+ return (xsd_errors[i].errnum);
+ }
+ log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s",
+ errorstring);
+ return (EINVAL);
+}
+
+/**
+ * Block waiting for a reply to a message request.
+ *
+ * \param type The returned type of the reply.
+ * \param len The returned body length of the reply.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating the
+ * cause of failure.
+ */
+static int
+xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result)
+{
+ struct xs_stored_msg *msg;
+ char *body;
+ int error;
+
+ mtx_lock(&xs.reply_lock);
+ while (TAILQ_EMPTY(&xs.reply_list)) {
+ error = mtx_sleep(&xs.reply_list, &xs.reply_lock,
+ PCATCH, "xswait", hz/10);
+ if (error && error != EWOULDBLOCK) {
+ mtx_unlock(&xs.reply_lock);
+ return (error);
+ }
+ }
+ msg = TAILQ_FIRST(&xs.reply_list);
+ TAILQ_REMOVE(&xs.reply_list, msg, list);
+ mtx_unlock(&xs.reply_lock);
+
+ *type = msg->hdr.type;
+ if (len)
+ *len = msg->hdr.len;
+ body = msg->u.reply.body;
+
+ free(msg, M_XENSTORE);
+ *result = body;
+ return (0);
+}
+
+/**
+ * Pass-thru interface for XenStore access by userland processes
+ * via the XenStore device.
+ *
+ * Reply type and length data are returned by overwriting these
+ * fields in the passed in request message.
+ *
+ * \param msg A properly formatted message to transmit to
+ * the XenStore service.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating the cause
+ * of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ * must be free'd by the caller with 'free(result, M_XENSTORE)'.
+ */
+int
+xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
+{
+ uint32_t request_type;
+ int error;
+
+ request_type = msg->type;
+ xs_request_filter(request_type);
+
+ sx_xlock(&xs.request_mutex);
+ if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0)
+ error = xs_read_reply(&msg->type, &msg->len, result);
+ sx_xunlock(&xs.request_mutex);
+
+ xs_reply_filter(request_type, msg->type, error);
+
+ return (error);
+}
+
+/**
+ * Send a message with an optionally multi-part body to the XenStore service.
+ *
+ * \param t The transaction to use for this request.
+ * \param request_type The type of message to send.
+ * \param iovec Pointers to the body sections of the request.
+ * \param num_vecs The number of body sections in the request.
+ * \param len The returned length of the reply.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating
+ * the cause of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ * must be free'd by the caller with 'free(*result, M_XENSTORE)'.
+ */
+static int
+xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type,
+ const struct iovec *iovec, u_int num_vecs, u_int *len, void **result)
+{
+ struct xsd_sockmsg msg;
+ void *ret = NULL;
+ u_int i;
+ int error;
+
+ msg.tx_id = t.id;
+ msg.req_id = 0;
+ msg.type = request_type;
+ msg.len = 0;
+ for (i = 0; i < num_vecs; i++)
+ msg.len += iovec[i].iov_len;
+
+ xs_request_filter(request_type);
+
+ sx_xlock(&xs.request_mutex);
+ error = xs_write_store(&msg, sizeof(msg));
+ if (error) {
+ printf("xs_talkv failed %d\n", error);
+ goto error_lock_held;
+ }
+
+ for (i = 0; i < num_vecs; i++) {
+ error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len);
+ if (error) {
+ printf("xs_talkv failed %d\n", error);
+ goto error_lock_held;
+ }
+ }
+
+ error = xs_read_reply(&msg.type, len, &ret);
+
+error_lock_held:
+ sx_xunlock(&xs.request_mutex);
+ xs_reply_filter(request_type, msg.type, error);
+ if (error)
+ return (error);
+
+ if (msg.type == XS_ERROR) {
+ error = xs_get_error(ret);
+ free(ret, M_XENSTORE);
+ return (error);
+ }
+
+ /* Reply is either error or an echo of our request message type. */
+ KASSERT(msg.type == request_type, ("bad xenstore message type"));
+
+ if (result)
+ *result = ret;
+ else
+ free(ret, M_XENSTORE);
+
+ return (0);
+}
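Concretely, an XS_READ of the path "name" issued outside a transaction is
transmitted as a struct xsd_sockmsg with type == XS_READ, req_id == 0,
tx_id == 0, and len == 5, followed immediately by the five body bytes:
"name" plus its NUL terminator.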
+
+/**
+ * Wrapper for xs_talkv allowing easy transmission of a message with
+ * a single, contiguous, message body.
+ *
+ * \param t The transaction to use for this request.
+ * \param request_type The type of message to send.
+ * \param body The body of the request.
+ * \param len The returned length of the reply.
+ * \param result The returned body of the reply.
+ *
+ * \return 0 on success. Otherwise an errno indicating
+ * the cause of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ * must be free'd by the caller with 'free(*result, M_XENSTORE)'.
+ */
+static int
+xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type,
+ const char *body, u_int *len, void **result)
+{
+ struct iovec iovec;
+
+ iovec.iov_base = (void *)(uintptr_t)body;
+ iovec.iov_len = strlen(body) + 1;
+
+ return (xs_talkv(t, request_type, &iovec, 1, len, result));
+}
+
+/*------------------------- XenStore Watch Support ---------------------------*/
+/**
+ * Transmit a watch request to the XenStore service.
+ *
+ * \param path The path in the XenStore to watch.
+ * \param token A unique identifier for this watch.
+ *
+ * \return 0 on success. Otherwise an errno indicating the
+ * cause of failure.
+ */
+static int
+xs_watch(const char *path, const char *token)
+{
+ struct iovec iov[2];
+
+ iov[0].iov_base = (void *)(uintptr_t) path;
+ iov[0].iov_len = strlen(path) + 1;
+ iov[1].iov_base = (void *)(uintptr_t) token;
+ iov[1].iov_len = strlen(token) + 1;
+
+ return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL));
+}
+
+/**
+ * Transmit an unwatch request to the XenStore service.
+ *
+ * \param path The path in the XenStore to stop watching.
+ * \param token The unique identifier for the watch to remove.
+ *
+ * \return 0 on success. Otherwise an errno indicating the
+ * cause of failure.
+ */
+static int
+xs_unwatch(const char *path, const char *token)
+{
+ struct iovec iov[2];
+
+ iov[0].iov_base = (void *)(uintptr_t) path;
+ iov[0].iov_len = strlen(path) + 1;
+ iov[1].iov_base = (void *)(uintptr_t) token;
+ iov[1].iov_len = strlen(token) + 1;
+
+ return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL));
+}
+
+/**
+ * Convert from watch token (unique identifier) to the associated
+ * internal tracking structure for this watch.
+ *
+ * \param token The unique identifier for the watch to find.
+ *
+ * \return A pointer to the found watch structure or NULL.
+ */
+static struct xs_watch *
+find_watch(const char *token)
+{
+ struct xs_watch *i, *cmp;
+
+ cmp = (void *)strtoul(token, NULL, 16);
+
+ LIST_FOREACH(i, &xs.registered_watches, list)
+ if (i == cmp)
+ return (i);
+
+ return (NULL);
+}
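The token is simply the watch pointer rendered in hex, so the registration
and lookup sides must agree exactly; a sketch of the round trip (valid only
while the watch is on the registered list and registered_watches_lock is
held):

	static void
	example_token_roundtrip(struct xs_watch *watch)
	{
		char token[sizeof(watch) * 2 + 1];

		/* Encode the watch pointer exactly as xs_register_watch() does. */
		sprintf(token, "%lX", (long)watch);

		/* find_watch() recovers the pointer with strtoul(..., 16). */
		KASSERT(find_watch(token) == watch,
		    ("token failed to round-trip"));
	}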
+
+/**
+ * Thread body of the XenStore watch event dispatch thread.
+ */
+static void
+xenwatch_thread(void *unused)
+{
+ struct xs_stored_msg *msg;
+
+ for (;;) {
+
+ mtx_lock(&xs.watch_events_lock);
+ while (TAILQ_EMPTY(&xs.watch_events))
+ mtx_sleep(&xs.watch_events,
+ &xs.watch_events_lock,
+ PWAIT | PCATCH, "waitev", hz/10);
+
+ mtx_unlock(&xs.watch_events_lock);
+ sx_xlock(&xs.xenwatch_mutex);
+
+ mtx_lock(&xs.watch_events_lock);
+ msg = TAILQ_FIRST(&xs.watch_events);
+ if (msg)
+ TAILQ_REMOVE(&xs.watch_events, msg, list);
+ mtx_unlock(&xs.watch_events_lock);
+
+ if (msg != NULL) {
+ /*
+ * XXX There are messages coming in with a NULL
+ * XXX callback. This deserves further investigation;
+ * XXX the workaround here simply prevents the kernel
+ * XXX from panic'ing on startup.
+ */
+ if (msg->u.watch.handle->callback != NULL)
+ msg->u.watch.handle->callback(
+ msg->u.watch.handle,
+ (const char **)msg->u.watch.vec,
+ msg->u.watch.vec_size);
+ free(msg->u.watch.vec, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ }
+
+ sx_xunlock(&xs.xenwatch_mutex);
+ }
+}
+
+/*----------- XenStore Configuration, Initialization, and Control ------------*/
+/**
+ * Setup communication channels with the XenStore service.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+static int
+xs_init_comms(void)
+{
+ int error;
+
+ if (xen_store->rsp_prod != xen_store->rsp_cons) {
+ log(LOG_WARNING, "XENSTORE response ring is not quiescent "
+ "(%08x:%08x): fixing up\n",
+ xen_store->rsp_cons, xen_store->rsp_prod);
+ xen_store->rsp_cons = xen_store->rsp_prod;
+ }
+
+ if (xs.irq)
+ unbind_from_irqhandler(xs.irq);
+
+ error = bind_caller_port_to_irqhandler(xs.evtchn, "xenstore",
+ xs_intr, NULL, INTR_TYPE_NET, &xs.irq);
+ if (error) {
+ log(LOG_WARNING, "XENSTORE request irq failed %i\n", error);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*------------------ Private Device Attachment Functions --------------------*/
+static void
+xs_identify(driver_t *driver, device_t parent)
+{
+
+ BUS_ADD_CHILD(parent, 0, "xenstore", 0);
+}
+
+/**
+ * Probe for the existence of the XenStore.
+ *
+ * \param dev
+ */
+static int
+xs_probe(device_t dev)
+{
+ /*
+ * We are either operating within a PV kernel or being probed
+ * as the child of the successfully attached xenpci device.
+ * Thus we are in a Xen environment and there will be a XenStore.
+ * Unconditionally return success.
+ */
+ device_set_desc(dev, "XenStore");
+ printf("xs_probe: Probe returns 0\n");
+ return (0);
+}
+
+static void
+xs_attach_deferred(void *arg)
+{
+ xs_dev_init();
+
+ bus_generic_probe(xs.xs_dev);
+ bus_generic_attach(xs.xs_dev);
+
+ config_intrhook_disestablish(&xs.xs_attachcb);
+}
+
+/**
+ * Attach to the XenStore.
+ *
+ * This routine also prepares for the probe/attach of drivers that rely
+ * on the XenStore.
+ */
+static int
+xs_attach(device_t dev)
+{
+ int error;
+
+ /* Allow us to get device_t from softc and vice-versa. */
+ xs.xs_dev = dev;
+ device_set_softc(dev, &xs);
+
+ /*
+ * This seems to be a layering violation. The XenStore is just
+ * one of many clients of the Grant Table facility. It happens
+ * to be the first and a gating consumer to all other devices,
+ * so this does work. A better place would be in the PV support
+ * code for fully PV kernels and the xenpci driver for HVM kernels.
+ */
+ error = gnttab_init();
+ if (error != 0) {
+ log(LOG_WARNING,
+ "XENSTORE: Error initializing grant tables: %d\n", error);
+ return (ENXIO);
+ }
+
+ struct proc *p;
+
+ /* Initialize the interface to xenstore. */
+#ifdef XENHVM
+ xs.evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
+ xs.gpfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
+ xen_store = pmap_mapdev(xs.gpfn * PAGE_SIZE, PAGE_SIZE);
+#else
+ xs.evtchn = xen_start_info->store_evtchn;
+#endif
+
+ TAILQ_INIT(&xs.reply_list);
+ TAILQ_INIT(&xs.watch_events);
+
+ mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF);
+ mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF);
+ sx_init(&xs.xenwatch_mutex, "xenwatch");
+ sx_init(&xs.request_mutex, "xenstore request");
+ sx_init(&xs.suspend_mutex, "xenstore suspend");
+ mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF);
+ mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF);
+ xs.irq = 0;
+
+ /* Initialize the shared memory rings to talk to xenstored */
+ error = xs_init_comms();
+ if (error)
+ return (error);
+
+ error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID,
+ 0, "xenwatch");
+ if (error)
+ return (error);
+ xs.xenwatch_pid = p->p_pid;
+
+ error = kproc_create(xs_rcv_thread, NULL, NULL,
+ RFHIGHPID, 0, "xenstore_rcv");
+
+ xs.xs_attachcb.ich_func = xs_attach_deferred;
+ xs.xs_attachcb.ich_arg = NULL;
+ config_intrhook_establish(&xs.xs_attachcb);
+
+ return (error);
+}
+
+/**
+ * Prepare for suspension of this VM by halting XenStore access after
+ * all transactions and individual requests have completed.
+ */
+static int
+xs_suspend(device_t dev __unused)
+{
+
+ sx_xlock(&xs.suspend_mutex);
+ sx_xlock(&xs.request_mutex);
+
+ return (0);
+}
+
+/**
+ * Resume XenStore operations after this VM is resumed.
+ */
+static int
+xs_resume(device_t dev __unused)
+{
+ struct xs_watch *watch;
+ char token[sizeof(watch) * 2 + 1];
+
+ xs_init_comms();
+
+ sx_xunlock(&xs.request_mutex);
+
+ /*
+ * No need for registered_watches_lock: the suspend_mutex
+ * is sufficient.
+ */
+ LIST_FOREACH(watch, &xs.registered_watches, list) {
+ sprintf(token, "%lX", (long)watch);
+ xs_watch(watch->node, token);
+ }
+
+ sx_xunlock(&xs.suspend_mutex);
+
+ return (0);
+}
+
+/*-------------------- Private Device Attachment Data -----------------------*/
+static device_method_t xenstore_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, xs_identify),
+ DEVMETHOD(device_probe, xs_probe),
+ DEVMETHOD(device_attach, xs_attach),
+ DEVMETHOD(device_detach, bus_generic_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, xs_suspend),
+ DEVMETHOD(device_resume, xs_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_add_child, bus_generic_add_child),
+ DEVMETHOD(bus_print_child, bus_generic_print_child),
+ DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource),
+ DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+
+ { 0, 0 }
+};
+
+DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0);
+static devclass_t xenstore_devclass;
+
+#ifdef XENHVM
+DRIVER_MODULE(xenstore, xenpci, xenstore_driver, xenstore_devclass, 0, 0);
+#else
+DRIVER_MODULE(xenstore, nexus, xenstore_driver, xenstore_devclass, 0, 0);
+#endif
+
+/*------------------------------- Sysctl Data --------------------------------*/
+/* XXX Shouldn't the node be somewhere else? */
+SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
+SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, "");
+SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
+
+/*-------------------------------- Public API --------------------------------*/
+/*------- API comments for these methods can be found in xenstorevar.h -------*/
+int
+xs_directory(struct xs_transaction t, const char *dir, const char *node,
+ u_int *num, const char ***result)
+{
+ struct sbuf *path;
+ char *strings;
+ u_int len = 0;
+ int error;
+
+ path = xs_join(dir, node);
+ error = xs_single(t, XS_DIRECTORY, sbuf_data(path), &len,
+ (void **)&strings);
+ sbuf_delete(path);
+ if (error)
+ return (error);
+
+ *result = split(strings, len, num);
+
+ return (0);
+}
+
+int
+xs_exists(struct xs_transaction t, const char *dir, const char *node)
+{
+ const char **d;
+ u_int dir_n;
+ int error;
+
+ error = xs_directory(t, dir, node, &dir_n, &d);
+ if (error)
+ return (0);
+ free(d, M_XENSTORE);
+ return (1);
+}
+
+int
+xs_read(struct xs_transaction t, const char *dir, const char *node,
+ u_int *len, void **result)
+{
+ struct sbuf *path;
+ void *ret;
+ int error;
+
+ path = xs_join(dir, node);
+ error = xs_single(t, XS_READ, sbuf_data(path), len, &ret);
+ sbuf_delete(path);
+ if (error)
+ return (error);
+ *result = ret;
+ return (0);
+}
+
+int
+xs_write(struct xs_transaction t, const char *dir, const char *node,
+ const char *string)
+{
+ struct sbuf *path;
+ struct iovec iovec[2];
+ int error;
+
+ path = xs_join(dir, node);
+
+ iovec[0].iov_base = (void *)(uintptr_t) sbuf_data(path);
+ iovec[0].iov_len = sbuf_len(path) + 1;
+ iovec[1].iov_base = (void *)(uintptr_t) string;
+ iovec[1].iov_len = strlen(string);
+
+ error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL);
+ sbuf_delete(path);
+
+ return (error);
+}
+
+int
+xs_mkdir(struct xs_transaction t, const char *dir, const char *node)
+{
+ struct sbuf *path;
+ int ret;
+
+ path = xs_join(dir, node);
+ ret = xs_single(t, XS_MKDIR, sbuf_data(path), NULL, NULL);
+ sbuf_delete(path);
+
+ return (ret);
+}
+
+int
+xs_rm(struct xs_transaction t, const char *dir, const char *node)
+{
+ struct sbuf *path;
+ int ret;
+
+ path = xs_join(dir, node);
+ ret = xs_single(t, XS_RM, sbuf_data(path), NULL, NULL);
+ sbuf_delete(path);
+
+ return (ret);
+}
+
+int
+xs_rm_tree(struct xs_transaction xbt, const char *base, const char *node)
+{
+ struct xs_transaction local_xbt;
+ struct sbuf *root_path_sbuf;
+ struct sbuf *cur_path_sbuf;
+ char *root_path;
+ char *cur_path;
+ const char **dir;
+ int error;
+ int empty;
+
+retry:
+ root_path_sbuf = xs_join(base, node);
+ cur_path_sbuf = xs_join(base, node);
+ root_path = sbuf_data(root_path_sbuf);
+ cur_path = sbuf_data(cur_path_sbuf);
+ dir = NULL;
+ local_xbt.id = 0;
+
+ if (xbt.id == 0) {
+ error = xs_transaction_start(&local_xbt);
+ if (error != 0)
+ goto out;
+ xbt = local_xbt;
+ }
+
+ empty = 0;
+ while (1) {
+ u_int count;
+ u_int i;
+
+ error = xs_directory(xbt, cur_path, "", &count, &dir);
+ if (error)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ error = xs_rm(xbt, cur_path, dir[i]);
+ if (error == ENOTEMPTY) {
+ struct sbuf *push_dir;
+
+ /*
+ * Descend to clear out this sub directory.
+ * We'll return to cur_dir once push_dir
+ * is empty.
+ */
+ push_dir = xs_join(cur_path, dir[i]);
+ sbuf_delete(cur_path_sbuf);
+ cur_path_sbuf = push_dir;
+ cur_path = sbuf_data(cur_path_sbuf);
+ break;
+ } else if (error != 0) {
+ goto out;
+ }
+ }
+
+ free(dir, M_XENSTORE);
+ dir = NULL;
+
+ if (i == count) {
+ char *last_slash;
+
+ /* Directory is empty. It is now safe to remove. */
+ error = xs_rm(xbt, cur_path, "");
+ if (error != 0)
+ goto out;
+
+ if (!strcmp(cur_path, root_path))
+ break;
+
+ /* Return to processing the parent directory. */
+ last_slash = strrchr(cur_path, '/');
+ KASSERT(last_slash != NULL,
+ ("xs_rm_tree: mangled path %s", cur_path));
+ *last_slash = '\0';
+ }
+ }
+
+out:
+ sbuf_delete(cur_path_sbuf);
+ sbuf_delete(root_path_sbuf);
+ if (dir != NULL)
+ free(dir, M_XENSTORE);
+
+ if (local_xbt.id != 0) {
+ int terror;
+
+ terror = xs_transaction_end(local_xbt, /*abort*/error != 0);
+ xbt.id = 0;
+ if (terror == EAGAIN && error == 0)
+ goto retry;
+ }
+ return (error);
+}
+
+int
+xs_transaction_start(struct xs_transaction *t)
+{
+ char *id_str;
+ int error;
+
+ error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL,
+ (void **)&id_str);
+ if (error == 0) {
+ t->id = strtoul(id_str, NULL, 0);
+ free(id_str, M_XENSTORE);
+ }
+ return (error);
+}
+
+int
+xs_transaction_end(struct xs_transaction t, int abort)
+{
+ char abortstr[2];
+
+ if (abort)
+ strcpy(abortstr, "F");
+ else
+ strcpy(abortstr, "T");
+
+ return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL));
+}
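A sketch of the canonical retry loop built from these two calls
(hypothetical caller; the path and value are examples only):

	static int
	example_transaction(void)
	{
		struct xs_transaction xbt;
		int error;

		do {
			error = xs_transaction_start(&xbt);
			if (error != 0)
				return (error);

			error = xs_write(xbt, "device/vbd/0", "state", "3");
			if (error != 0) {
				(void)xs_transaction_end(xbt, /*abort*/1);
				return (error);
			}

			error = xs_transaction_end(xbt, /*abort*/0);
		} while (error == EAGAIN);	/* Store changed underneath us. */

		return (error);
	}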
+
+int
+xs_scanf(struct xs_transaction t, const char *dir, const char *node,
+ int *scancountp, const char *fmt, ...)
+{
+ va_list ap;
+ int error, ns;
+ char *val;
+
+ error = xs_read(t, dir, node, NULL, (void **) &val);
+ if (error)
+ return (error);
+
+ va_start(ap, fmt);
+ ns = vsscanf(val, fmt, ap);
+ va_end(ap);
+ free(val, M_XENSTORE);
+ /* Distinctive errno. */
+ if (ns == 0)
+ return (ERANGE);
+ if (scancountp)
+ *scancountp = ns;
+ return (0);
+}
+
+int
+xs_vprintf(struct xs_transaction t,
+ const char *dir, const char *node, const char *fmt, va_list ap)
+{
+ struct sbuf *sb;
+ int error;
+
+ sb = sbuf_new_auto();
+ sbuf_vprintf(sb, fmt, ap);
+ sbuf_finish(sb);
+ error = xs_write(t, dir, node, sbuf_data(sb));
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+int
+xs_printf(struct xs_transaction t, const char *dir, const char *node,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int error;
+
+ va_start(ap, fmt);
+ error = xs_vprintf(t, dir, node, fmt, ap);
+ va_end(ap);
+
+ return (error);
+}
+
+int
+xs_gather(struct xs_transaction t, const char *dir, ...)
+{
+ va_list ap;
+ const char *name;
+ int error;
+
+ va_start(ap, dir);
+ error = 0;
+ while (error == 0 && (name = va_arg(ap, char *)) != NULL) {
+ const char *fmt = va_arg(ap, char *);
+ void *result = va_arg(ap, void *);
+ char *p;
+
+ error = xs_read(t, dir, name, NULL, (void **) &p);
+ if (error)
+ break;
+
+ if (fmt) {
+ if (sscanf(p, fmt, result) == 0)
+ error = EINVAL;
+ free(p, M_XENSTORE);
+ } else
+ *(char **)result = p;
+ }
+ va_end(ap);
+
+ return (error);
+}
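A usage sketch (hypothetical caller; the path and node names are examples
only):

	static int
	example_gather(void)
	{
		unsigned int ring_ref;
		char *proto;
		int error;

		/*
		 * Arguments are (name, scanf format, result) triples
		 * terminated by a NULL name.  A NULL format returns the
		 * raw string, which the caller frees with M_XENSTORE.
		 */
		error = xs_gather(XST_NIL, "device/vbd/0",
		    "ring-ref", "%u", &ring_ref,
		    "protocol", NULL, &proto,
		    NULL);
		if (error == 0)
			free(proto, M_XENSTORE);
		return (error);
	}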
+
+int
+xs_register_watch(struct xs_watch *watch)
+{
+ /* Pointer in ascii is the token. */
+ char token[sizeof(watch) * 2 + 1];
+ int error;
+
+ sprintf(token, "%lX", (long)watch);
+
+ sx_slock(&xs.suspend_mutex);
+
+ mtx_lock(&xs.registered_watches_lock);
+ KASSERT(find_watch(token) == NULL, ("watch already registered"));
+ LIST_INSERT_HEAD(&xs.registered_watches, watch, list);
+ mtx_unlock(&xs.registered_watches_lock);
+
+ error = xs_watch(watch->node, token);
+
+ /* Ignore errors due to multiple registration. */
+ if (error == EEXIST)
+ error = 0;
+
+ if (error != 0) {
+ mtx_lock(&xs.registered_watches_lock);
+ LIST_REMOVE(watch, list);
+ mtx_unlock(&xs.registered_watches_lock);
+ }
+
+ sx_sunlock(&xs.suspend_mutex);
+
+ return (error);
+}
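A registration sketch (hypothetical; the watched path is an example, and
the watch and its node string must remain allocated until a matching
xs_unregister_watch()):

	static struct xs_watch example_watch;

	static void
	example_watch_cb(struct xs_watch *watch __unused, const char **vec,
	    unsigned int len __unused)
	{
		/* vec[XS_WATCH_PATH] names the node that changed. */
		printf("XenStore node %s changed\n", vec[XS_WATCH_PATH]);
	}

	static int
	example_register(void)
	{
		example_watch.node = strdup("device/vbd/0/state", M_XENSTORE);
		example_watch.callback = example_watch_cb;
		return (xs_register_watch(&example_watch));
	}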
+
+void
+xs_unregister_watch(struct xs_watch *watch)
+{
+ struct xs_stored_msg *msg, *tmp;
+ char token[sizeof(watch) * 2 + 1];
+ int error;
+
+ sprintf(token, "%lX", (long)watch);
+
+ sx_slock(&xs.suspend_mutex);
+
+ mtx_lock(&xs.registered_watches_lock);
+ if (find_watch(token) == NULL) {
+ mtx_unlock(&xs.registered_watches_lock);
+ sx_sunlock(&xs.suspend_mutex);
+ return;
+ }
+ LIST_REMOVE(watch, list);
+ mtx_unlock(&xs.registered_watches_lock);
+
+ error = xs_unwatch(watch->node, token);
+ if (error)
+ log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n",
+ watch->node, error);
+
+ sx_sunlock(&xs.suspend_mutex);
+
+ /* Cancel pending watch events. */
+ mtx_lock(&xs.watch_events_lock);
+ TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) {
+ if (msg->u.watch.handle != watch)
+ continue;
+ TAILQ_REMOVE(&xs.watch_events, msg, list);
+ free(msg->u.watch.vec, M_XENSTORE);
+ free(msg, M_XENSTORE);
+ }
+ mtx_unlock(&xs.watch_events_lock);
+
+ /* Flush any currently-executing callback, unless we are it. :-) */
+ if (curproc->p_pid != xs.xenwatch_pid) {
+ sx_xlock(&xs.xenwatch_mutex);
+ sx_xunlock(&xs.xenwatch_mutex);
+ }
+}
diff --git a/sys/xen/xenbus/xenbus_dev.c b/sys/xen/xenstore/xenstore_dev.c
index ac3f103..1fa4197 100644
--- a/sys/xen/xenbus/xenbus_dev.c
+++ b/sys/xen/xenstore/xenstore_dev.c
@@ -1,8 +1,8 @@
/*
- * xenbus_dev.c
+ * xenstore_dev.c
*
- * Driver giving user-space access to the kernel's xenbus connection
- * to xenstore.
+ * Driver giving user-space access to the kernel's connection to the
+ * XenStore service.
*
* Copyright (c) 2005, Christian Limpach
* Copyright (c) 2005, Rusty Russell, IBM Corporation
@@ -45,18 +45,19 @@ __FBSDID("$FreeBSD$");
#include <sys/conf.h>
#include <machine/xen/xen-os.h>
+
#include <xen/hypervisor.h>
-#include <xen/xenbus/xenbusvar.h>
-#include <xen/xenbus/xenbus_comms.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenstore/xenstore_internal.h>
-struct xenbus_dev_transaction {
- LIST_ENTRY(xenbus_dev_transaction) list;
- struct xenbus_transaction handle;
+struct xs_dev_transaction {
+ LIST_ENTRY(xs_dev_transaction) list;
+ struct xs_transaction handle;
};
-struct xenbus_dev_data {
+struct xs_dev_data {
/* In-progress transaction. */
- LIST_HEAD(xdd_list_head, xenbus_dev_transaction) transactions;
+ LIST_HEAD(xdd_list_head, xs_dev_transaction) transactions;
/* Partial request. */
unsigned int len;
@@ -72,13 +73,13 @@ struct xenbus_dev_data {
};
static int
-xenbus_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
+xs_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
int error;
- struct xenbus_dev_data *u = dev->si_drv1;
+ struct xs_dev_data *u = dev->si_drv1;
while (u->read_prod == u->read_cons) {
- error = tsleep(u, PCATCH, "xbdread", hz/10);
+ error = tsleep(u, PCATCH, "xsdread", hz/10);
if (error && error != EWOULDBLOCK)
return (error);
}
@@ -96,7 +97,7 @@ xenbus_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
}
static void
-queue_reply(struct xenbus_dev_data *u, char *data, unsigned int len)
+xs_queue_reply(struct xs_dev_data *u, char *data, unsigned int len)
{
int i;
@@ -110,11 +111,11 @@ queue_reply(struct xenbus_dev_data *u, char *data, unsigned int len)
}
static int
-xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
+xs_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
int error;
- struct xenbus_dev_data *u = dev->si_drv1;
- struct xenbus_dev_transaction *trans;
+ struct xs_dev_data *u = dev->si_drv1;
+ struct xs_dev_transaction *trans;
void *reply;
int len = uio->uio_resid;
@@ -141,10 +142,10 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
case XS_MKDIR:
case XS_RM:
case XS_SET_PERMS:
- error = xenbus_dev_request_and_reply(&u->u.msg, &reply);
+ error = xs_dev_request_and_reply(&u->u.msg, &reply);
if (!error) {
if (u->u.msg.type == XS_TRANSACTION_START) {
- trans = malloc(sizeof(*trans), M_DEVBUF,
+ trans = malloc(sizeof(*trans), M_XENSTORE,
M_WAITOK);
trans->handle.id = strtoul(reply, NULL, 0);
LIST_INSERT_HEAD(&u->transactions, trans, list);
@@ -156,11 +157,11 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
BUG_ON(&trans->list == &u->transactions);
#endif
LIST_REMOVE(trans, list);
- free(trans, M_DEVBUF);
+ free(trans, M_XENSTORE);
}
- queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
- queue_reply(u, (char *)reply, u->u.msg.len);
- free(reply, M_DEVBUF);
+ xs_queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
+ xs_queue_reply(u, (char *)reply, u->u.msg.len);
+ free(reply, M_XENSTORE);
}
break;
@@ -176,16 +177,14 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
}
static int
-xenbus_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+xs_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
- struct xenbus_dev_data *u;
+ struct xs_dev_data *u;
- if (xen_store_evtchn == 0)
- return (ENOENT);
#if 0 /* XXX figure out if equiv needed */
nonseekable_open(inode, filp);
#endif
- u = malloc(sizeof(*u), M_DEVBUF, M_WAITOK|M_ZERO);
+ u = malloc(sizeof(*u), M_XENSTORE, M_WAITOK|M_ZERO);
LIST_INIT(&u->transactions);
dev->si_drv1 = u;
@@ -193,37 +192,33 @@ xenbus_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
}
static int
-xenbus_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+xs_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
- struct xenbus_dev_data *u = dev->si_drv1;
- struct xenbus_dev_transaction *trans, *tmp;
+ struct xs_dev_data *u = dev->si_drv1;
+ struct xs_dev_transaction *trans, *tmp;
LIST_FOREACH_SAFE(trans, &u->transactions, list, tmp) {
- xenbus_transaction_end(trans->handle, 1);
+ xs_transaction_end(trans->handle, 1);
LIST_REMOVE(trans, list);
- free(trans, M_DEVBUF);
+ free(trans, M_XENSTORE);
}
- free(u, M_DEVBUF);
+ free(u, M_XENSTORE);
return (0);
}
-static struct cdevsw xenbus_dev_cdevsw = {
+static struct cdevsw xs_dev_cdevsw = {
.d_version = D_VERSION,
- .d_read = xenbus_dev_read,
- .d_write = xenbus_dev_write,
- .d_open = xenbus_dev_open,
- .d_close = xenbus_dev_close,
- .d_name = "xenbus_dev",
+ .d_read = xs_dev_read,
+ .d_write = xs_dev_write,
+ .d_open = xs_dev_open,
+ .d_close = xs_dev_close,
+ .d_name = "xs_dev",
};
-static int
-xenbus_dev_sysinit(void)
+void
+xs_dev_init(void)
{
- make_dev(&xenbus_dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0400,
- "xen/xenbus");
-
- return (0);
+ make_dev(&xs_dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0400,
+ "xen/xenstore");
}
-SYSINIT(xenbus_dev_sysinit, SI_SUB_DRIVERS, SI_ORDER_MIDDLE,
- xenbus_dev_sysinit, NULL);
diff --git a/sys/xen/xenstore/xenstore_internal.h b/sys/xen/xenstore/xenstore_internal.h
new file mode 100644
index 0000000..0398aef
--- /dev/null
+++ b/sys/xen/xenstore/xenstore_internal.h
@@ -0,0 +1,39 @@
+/*-
+ * Core definitions and data structures shareable across OS platforms.
+ *
+ * Copyright (c) 2010 Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * substantially similar to the "NO WARRANTY" disclaimer below
+ * ("Disclaimer") and any redistribution must be conditioned upon
+ * including a substantially similar Disclaimer requirement for further
+ * binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+
+/* Initialize support for userspace access to the XenStore. */
+void xs_dev_init(void);
+
+/* Used by the XenStore character device to borrow kernel's store connection. */
+int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result);
diff --git a/sys/xen/xenstore/xenstorevar.h b/sys/xen/xenstore/xenstorevar.h
new file mode 100644
index 0000000..df41e31
--- /dev/null
+++ b/sys/xen/xenstore/xenstorevar.h
@@ -0,0 +1,338 @@
+/******************************************************************************
+ * xenstorevar.h
+ *
+ * Method declarations and structures for accessing the XenStore.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ * Copyright (C) 2009,2010 Spectra Logic Corporation
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XEN_XENSTORE_XENSTOREVAR_H
+#define _XEN_XENSTORE_XENSTOREVAR_H
+
+#include <sys/queue.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen/xen-os.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/xenbus.h>
+#include <xen/interface/io/xs_wire.h>
+
+#include "xenbus_if.h"
+
+/* XenStore allocations including XenStore data returned to clients. */
+MALLOC_DECLARE(M_XENSTORE);
+
+struct xenstore_domain_interface;
+struct xs_watch;
+extern struct xenstore_domain_interface *xen_store;
+
+typedef void (xs_watch_cb_t)(struct xs_watch *,
+ const char **vec, unsigned int len);
+
+/* Register callback to watch subtree (node) in the XenStore. */
+struct xs_watch
+{
+ LIST_ENTRY(xs_watch) list;
+
+ /* Path being watched. */
+ char *node;
+
+ /* Callback (executed in a process context with no locks held). */
+ xs_watch_cb_t *callback;
+};
+LIST_HEAD(xs_watch_list, xs_watch);
+
+typedef int (*xs_event_handler_t)(void *);
+
+struct xs_transaction
+{
+ uint32_t id;
+};
+
+#define XST_NIL ((struct xs_transaction) { 0 })
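For a standalone operation, XST_NIL is passed wherever a transaction is
expected; a sketch (hypothetical path):

	static int
	example_read_domain_name(char **namep)
	{
		/* "name" conventionally holds this domain's name. */
		return (xs_read(XST_NIL, "name", "", NULL, (void **)namep));
	}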
+
+/**
+ * Fetch the contents of a directory in the XenStore.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ * \param num The returned number of directory entries.
+ * \param result An array of directory entry strings.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note The results buffer is malloced and should be free'd by the
+ * caller with 'free(*result, M_XENSTORE)'.
+ */
+int xs_directory(struct xs_transaction t, const char *dir,
+ const char *node, unsigned int *num, const char ***result);
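+
+/*
+ * Example (an illustrative sketch only; assumes 'dev' is a valid xenbus
+ * device_t):
+ *
+ *	const char **dirents;
+ *	unsigned int count, i;
+ *	int error;
+ *
+ *	error = xs_directory(XST_NIL, xenbus_get_node(dev), "",
+ *	    &count, &dirents);
+ *	if (error == 0) {
+ *		for (i = 0; i < count; i++)
+ *			printf("child: %s\n", dirents[i]);
+ *		free(dirents, M_XENSTORE);
+ *	}
+ */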
+
+/**
+ * Determine if a path exists in the XenStore.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ *
+ * \retval 1 The path exists.
+ * \retval 0 The path does not exist or an error occurred attempting
+ * to make that determination.
+ */
+int xs_exists(struct xs_transaction t, const char *dir, const char *node);
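+
+/*
+ * Example (sketch; the "feature-barrier" node is illustrative):
+ *
+ *	if (xs_exists(XST_NIL, xenbus_get_node(dev), "feature-barrier"))
+ *		...
+ */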
+
+/**
+ * Get the contents of a single "file". Returns the contents in
+ * *result which should be freed with free(*result, M_XENSTORE) after
+ * use. The length of the value in bytes is returned in *len.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the file to read.
+ * \param node The basename of the file to read.
+ * \param len The amount of data read.
+ * \param result The returned contents from this file.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ *
+ * \note The results buffer is malloced and should be free'd by the
+ * caller with 'free(*result, M_XENSTORE)'.
+ */
+int xs_read(struct xs_transaction t, const char *dir,
+ const char *node, unsigned int *len, void **result);
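+
+/*
+ * Example (sketch): read a string valued node and release the buffer.
+ *
+ *	void *result;
+ *	unsigned int len;
+ *	int error;
+ *
+ *	error = xs_read(XST_NIL, xenbus_get_node(dev), "device-type",
+ *	    &len, &result);
+ *	if (error == 0) {
+ *		printf("device-type: %s\n", (const char *)result);
+ *		free(result, M_XENSTORE);
+ *	}
+ */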
+
+/**
+ * Write to a single file.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the file to write.
+ * \param node The basename of the file to write.
+ * \param string The NUL terminated string of data to write.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_write(struct xs_transaction t, const char *dir,
+ const char *node, const char *string);
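+
+/*
+ * Example (sketch; the node name and value are illustrative):
+ *
+ *	error = xs_write(XST_NIL, xenbus_get_node(dev), "state", "4");
+ */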
+
+/**
+ * Create a new directory.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the directory to create.
+ * \param node The basename of the directory to create.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_mkdir(struct xs_transaction t, const char *dir,
+ const char *node);
+
+/**
+ * Remove a file or directory (directories must be empty).
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the directory to remove.
+ * \param node The basename of the directory to remove.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_rm(struct xs_transaction t, const char *dir, const char *node);
+
+/**
+ * Destroy a tree of files rooted at dir/node.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the directory to remove.
+ * \param node The basename of the directory to remove.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_rm_tree(struct xs_transaction t, const char *dir,
+ const char *node);
+
+/**
+ * Start a transaction.
+ *
+ * Changes by others will not be seen during the lifetime of this
+ * transaction, and changes will not be visible to others until it
+ * is committed (xs_transaction_end).
+ *
+ * \param t The returned transaction.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_transaction_start(struct xs_transaction *t);
+
+/**
+ * End a transaction.
+ *
+ * \param t The transaction to end/commit.
+ * \param abort If non-zero, the transaction is discarded
+ * instead of committed.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_transaction_end(struct xs_transaction t, int abort);
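+
+/*
+ * Example (a sketch of the usual retry idiom; 'dir', the node name, and
+ * 'ref_str' are illustrative). A conflicting concurrent update causes
+ * the commit to fail with EAGAIN, in which case the sequence should be
+ * retried.
+ *
+ *	struct xs_transaction xst;
+ *	int error;
+ *
+ *	do {
+ *		error = xs_transaction_start(&xst);
+ *		if (error != 0)
+ *			break;
+ *		error = xs_write(xst, dir, "ring-ref", ref_str);
+ *		if (error != 0) {
+ *			(void)xs_transaction_end(xst, 1);
+ *			break;
+ *		}
+ *		error = xs_transaction_end(xst, 0);
+ *	} while (error == EAGAIN);
+ */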
+
+/**
+ * Single file read and scanf parsing of the result.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ * \param scancountp The number of input values assigned (i.e. the result
+ * of scanf).
+ * \param fmt Scanf format string followed by a variable number of
+ * scanf input arguments.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure.
+ */
+int xs_scanf(struct xs_transaction t,
+ const char *dir, const char *node, int *scancountp, const char *fmt, ...)
+ __attribute__((format(scanf, 5, 6)));
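+
+/*
+ * Example (sketch): parse an unsigned integer from a node.
+ *
+ *	unsigned int ring_ref;
+ *	int assigned, error;
+ *
+ *	error = xs_scanf(XST_NIL, xenbus_get_node(dev), "ring-ref",
+ *	    &assigned, "%u", &ring_ref);
+ */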
+
+/**
+ * Printf formatted write to a XenStore file.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ * \param fmt Printf format string followed by a variable number of
+ * printf arguments.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of write failure.
+ */
+int xs_printf(struct xs_transaction t, const char *dir,
+ const char *node, const char *fmt, ...)
+ __attribute__((format(printf, 4, 5)));
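+
+/*
+ * Example (sketch; 'port' is an illustrative unsigned int):
+ *
+ *	error = xs_printf(XST_NIL, xenbus_get_node(dev),
+ *	    "event-channel", "%u", port);
+ */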
+
+/**
+ * va_list version of xs_printf().
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the path to read.
+ * \param node The basename of the path to read.
+ * \param fmt Printf format string.
+ * \param ap Va_list of printf arguments.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of write failure.
+ */
+int xs_vprintf(struct xs_transaction t, const char *dir,
+ const char *node, const char *fmt, va_list ap);
+
+/**
+ * Multi-file read within a single directory and scanf parsing of
+ * the results.
+ *
+ * \param t The XenStore transaction covering this request.
+ * \param dir The dirname of the paths to read.
+ * \param ... A variable number of argument triples specifying
+ * the file name, scanf-style format string, and
+ * output variable (pointer to storage of the results).
+ * The list of triples must be terminated
+ * with a final NULL argument. A NULL format string
+ * will cause the entire contents of the given file
+ * to be assigned as a NUL terminated, M_XENSTORE heap
+ * backed, string to the output parameter of that tuple.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of read failure.
+ *
+ * Example:
+ * char protocol_abi[64];
+ * uint32_t ring_ref;
+ * char *dev_type;
+ * int error;
+ *
+ * error = xs_gather(XST_NIL, xenbus_get_node(dev),
+ * "ring-ref", "%" PRIu32, &ring_ref,
+ * "protocol", "%63s", protocol_abi,
+ * "device-type", NULL, &dev_type,
+ * NULL);
+ *
+ * ...
+ *
+ * free(dev_type, M_XENSTORE);
+ */
+int xs_gather(struct xs_transaction t, const char *dir, ...);
+
+/**
+ * Register a XenStore watch.
+ *
+ * XenStore watches allow a client to be notified via a callback (embedded
+ * within the watch object) of changes to an object in the XenStore.
+ *
+ * \param watch An xs_watch struct with its node and callback fields
+ * properly initialized.
+ *
+ * \return On success, 0. Otherwise an errno value indicating the
+ * type of failure. EEXIST errors from the XenStore
+ * are suppressed, allowing multiple, physically different,
+ * xs_watch objects to watch the same path in the XenStore.
+ */
+int xs_register_watch(struct xs_watch *watch);
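+
+/*
+ * Example (a minimal sketch; the callback, watch object, and 'path' are
+ * illustrative). XS_WATCH_PATH indexes the changed path in the event
+ * string vector passed to the callback.
+ *
+ *	static void
+ *	my_watch_cb(struct xs_watch *watch, const char **vec,
+ *	    unsigned int len)
+ *	{
+ *		printf("%s changed\n", vec[XS_WATCH_PATH]);
+ *	}
+ *
+ *	static struct xs_watch my_watch;
+ *
+ *	my_watch.node = strdup(path, M_XENSTORE);
+ *	my_watch.callback = my_watch_cb;
+ *	error = xs_register_watch(&my_watch);
+ */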
+
+/**
+ * Unregister a XenStore watch.
+ *
+ * \param watch An xs_watch object previously used in a successful call
+ * to xs_register_watch().
+ *
+ * The xs_watch object's node field is not altered by this call.
+ * It is the caller's responsibility to properly dispose of both the
+ * watch object and the data pointed to by watch->node.
+ */
+void xs_unregister_watch(struct xs_watch *watch);
+
+/**
+ * Allocate and return an sbuf containing the XenStore path string
+ * <dir>/<name>. If name is the empty string, the returned sbuf contains
+ * the path string <dir>.
+ *
+ * \param dir The NUL terminated directory prefix for the new path.
+ * \param name The NUL terminated basename for the new path.
+ *
+ * \return A buffer containing the joined path.
+ */
+struct sbuf *xs_join(const char *dir, const char *name);
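+
+/*
+ * Example (sketch): build a path, use it, then release the sbuf.
+ *
+ *	struct sbuf *sb;
+ *
+ *	sb = xs_join(xenbus_get_node(dev), "ring-ref");
+ *	printf("full path: %s\n", sbuf_data(sb));
+ *	sbuf_delete(sb);
+ */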
+
+#endif /* _XEN_XENSTORE_XENSTOREVAR_H */