diff options
Diffstat (limited to 'drivers/xen')
28 files changed, 3953 insertions, 554 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index cab100a..07bec09 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -1,6 +1,8 @@ +menu "Xen driver support" + depends on XEN + config XEN_BALLOON bool "Xen memory balloon driver" - depends on XEN default y help The balloon driver allows the Xen domain to request more memory from @@ -20,7 +22,6 @@ config XEN_SCRUB_PAGES config XEN_DEV_EVTCHN tristate "Xen /dev/xen/evtchn device" - depends on XEN default y help The evtchn driver allows a userspace process to triger event @@ -28,9 +29,16 @@ config XEN_DEV_EVTCHN firing. If in doubt, say yes. +config XEN_BACKEND + bool "Backend driver support" + depends on XEN_DOM0 + default y + help + Support for backend device drivers that provide I/O services + to other virtual machines. + config XENFS tristate "Xen filesystem" - depends on XEN default y help The xen filesystem provides a way for domains to share @@ -53,11 +61,38 @@ config XEN_COMPAT_XENFS config XEN_SYS_HYPERVISOR bool "Create xen entries under /sys/hypervisor" - depends on XEN && SYSFS + depends on SYSFS select SYS_HYPERVISOR default y help Create entries under /sys/hypervisor describing the Xen hypervisor environment. When running native or in another virtual environment, /sys/hypervisor will still be present, - but will have no xen contents.
\ No newline at end of file + but will have no xen contents. + +config XEN_XENBUS_FRONTEND + tristate + +config XEN_GNTDEV + tristate "userspace grant access device driver" + depends on XEN + select MMU_NOTIFIER + help + Allows userspace processes to use grants. + +config XEN_PLATFORM_PCI + tristate "xen platform pci device driver" + depends on XEN_PVHVM && PCI + default m + help + Driver for the Xen PCI Platform device: it is responsible for + initializing xenbus and grant_table when running in a Xen HVM + domain. As a consequence this driver is required to run any Xen PV + frontend on Xen HVM. + +config SWIOTLB_XEN + def_bool y + depends on PCI + select SWIOTLB + +endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ec2a39b..5088cc2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,9 +1,22 @@ obj-y += grant-table.o features.o events.o manage.o obj-y += xenbus/ +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_features.o := $(nostackp) + +obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o -obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XENFS) += xenfs/ -obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o +obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_DOM0) += pci.o + +xen-evtchn-y := evtchn.o +xen-gntdev-y := gntdev.o + +xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index f5bbd9e..43f9f02 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -43,15 +43,19 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/sysdev.h> +#include <linux/gfp.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/uaccess.h> #include <asm/tlb.h> +#include <asm/e820.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> + +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> #include <xen/xenbus.h> @@ -66,8 +70,6 @@ struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ unsigned long current_pages; unsigned long target_pages; - /* We may hit the hard limit in Xen. If we do then we remember it. */ - unsigned long hard_limit; /* * Drivers may alter the memory reservation independently, but they * must inform the balloon driver so we avoid hitting the hard limit. @@ -84,23 +86,12 @@ static struct sys_device balloon_sysdev; static int register_balloon(struct sys_device *sysdev); -/* - * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. - */ -static DEFINE_SPINLOCK(balloon_lock); - static struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; -/* VM /proc information for memory */ -extern unsigned long totalram_pages; - #ifdef CONFIG_HIGHMEM -extern unsigned long totalhigh_pages; #define inc_totalhigh_pages() (totalhigh_pages++) #define dec_totalhigh_pages() (totalhigh_pages--) #else @@ -129,7 +120,7 @@ static void scrub_page(struct page *page) } /* balloon_append: add the given page to the balloon. */ -static void balloon_append(struct page *page) +static void __balloon_append(struct page *page) { /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { @@ -142,6 +133,12 @@ static void balloon_append(struct page *page) } } +static void balloon_append(struct page *page) +{ + __balloon_append(page); + totalram_pages--; +} + /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ static struct page *balloon_retrieve(void) { @@ -160,6 +157,8 @@ static struct page *balloon_retrieve(void) else balloon_stats.balloon_low--; + totalram_pages++; + return page; } @@ -185,7 +184,7 @@ static void balloon_alarm(unsigned long unused) static unsigned long current_target(void) { - unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit); + unsigned long target = balloon_stats.target_pages; target = min(target, balloon_stats.current_pages + @@ -197,7 +196,7 @@ static unsigned long current_target(void) static int increase_reservation(unsigned long nr_pages) { - unsigned long pfn, i, flags; + unsigned long pfn, i; struct page *page; long rc; struct xen_memory_reservation reservation = { @@ -209,35 +208,20 @@ static int increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - spin_lock_irqsave(&balloon_lock, flags); - page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { BUG_ON(page == NULL); - frame_list[i] = page_to_pfn(page);; + frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < nr_pages) { - if (rc > 0) { - int ret; - - /* We hit the Xen hard limit: reprobe. */ - reservation.nr_extents = rc; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - BUG_ON(ret != rc); - } - if (rc >= 0) - balloon_stats.hard_limit = (balloon_stats.current_pages + rc - - balloon_stats.driver_pages); + if (rc < 0) goto out; - } - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rc; i++) { page = balloon_retrieve(); BUG_ON(page == NULL); @@ -263,18 +247,15 @@ static int increase_reservation(unsigned long nr_pages) __free_page(page); } - balloon_stats.current_pages += nr_pages; - totalram_pages = balloon_stats.current_pages; + balloon_stats.current_pages += rc; out: - spin_unlock_irqrestore(&balloon_lock, flags); - - return 0; + return rc < 0 ? rc : rc != nr_pages; } static int decrease_reservation(unsigned long nr_pages) { - unsigned long pfn, i, flags; + unsigned long pfn, i; struct page *page; int need_sleep = 0; int ret; @@ -312,8 +293,6 @@ static int decrease_reservation(unsigned long nr_pages) kmap_flush_unused(); flush_tlb_all(); - spin_lock_irqsave(&balloon_lock, flags); - /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); @@ -327,9 +306,6 @@ static int decrease_reservation(unsigned long nr_pages) BUG_ON(ret != nr_pages); balloon_stats.current_pages -= nr_pages; - totalram_pages = balloon_stats.current_pages; - - spin_unlock_irqrestore(&balloon_lock, flags); return need_sleep; } @@ -371,7 +347,6 @@ static void balloon_process(struct work_struct *work) static void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ - balloon_stats.hard_limit = ~0UL; balloon_stats.target_pages = target; schedule_work(&balloon_worker); } @@ -417,7 +392,7 @@ static struct notifier_block xenstore_notifier; static int __init balloon_init(void) { - unsigned long pfn; + unsigned long pfn, extra_pfn_end; struct page *page; if (!xen_pv_domain()) @@ -426,12 +401,10 @@ static int __init balloon_init(void) pr_info("xen_balloon: Initialising balloon driver.\n"); balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); - totalram_pages = balloon_stats.current_pages; balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; balloon_stats.driver_pages = 0UL; - balloon_stats.hard_limit = ~0UL; init_timer(&balloon_timer); balloon_timer.data = 0; @@ -439,11 +412,24 @@ static int __init balloon_init(void) register_balloon(&balloon_sysdev); - /* Initialise the balloon with excess memory space. */ - for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + /* + * Initialise the balloon with excess memory space. We need + * to make sure we don't add memory which doesn't exist or + * logically exist. The E820 map can be trimmed to be smaller + * than the amount of physical memory due to the mem= command + * line parameter. And if this is a 32-bit non-HIGHMEM kernel + * on a system with memory which requires highmem to access, + * don't try to use it. + */ + extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()), + (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size)); + for (pfn = PFN_UP(xen_extra_mem_start); + pfn < extra_pfn_end; + pfn++) { page = pfn_to_page(pfn); - if (!PageReserved(page)) - balloon_append(page); + /* totalram_pages doesn't include the boot-time + balloon extension, so don't subtract from it. */ + __balloon_append(page); } target_watch.callback = watch_target; @@ -476,9 +462,6 @@ module_exit(balloon_exit); BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(hard_limit_kb, - (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n", - (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0); BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, @@ -548,7 +531,6 @@ static struct attribute *balloon_info_attrs[] = { &attr_current_kb.attr, &attr_low_kb.attr, &attr_high_kb.attr, - &attr_hard_limit_kb.attr, &attr_driver_kb.attr, NULL }; diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c new file mode 100644 index 0000000..ba6eda4 --- /dev/null +++ b/drivers/xen/biomerge.c @@ -0,0 +1,13 @@ +#include <linux/bio.h> +#include <linux/io.h> +#include <xen/page.h> + +bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2) +{ + unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page)); + unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page)); + + return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && + ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); +} diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index bdfd584..14e2d99 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,5 +1,6 @@ #include <linux/notifier.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <asm/xen/hypervisor.h> @@ -86,7 +87,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier, for_each_possible_cpu(cpu) { if (vcpu_online(cpu) == 0) { (void)cpu_down(cpu); - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); } } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index abad71b..7468147 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -16,7 +16,7 @@ * (typically dom0). * 2. VIRQs, typically used for timers. These are per-cpu events. * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. + * 4. PIRQs - Hardware interrupts. * * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ @@ -27,18 +27,28 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/irqnr.h> +#include <linux/pci.h> +#include <asm/desc.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/idle.h> +#include <asm/io_apic.h> #include <asm/sync_bitops.h> +#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/xen.h> +#include <xen/hvm.h> #include <xen/xen-ops.h> #include <xen/events.h> #include <xen/interface/xen.h> #include <xen/interface/event_channel.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/hvm/params.h> /* * This lock protects updates to the following mapping and reference-count @@ -47,10 +57,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. */ enum xen_irq_type { @@ -67,7 +77,8 @@ enum xen_irq_type { * event channel - irq->event channel mapping * cpu - cpu this event channel is bound to * index - type-specific information: - * PIRQ - vector, with MSB being "needs EIO" + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. * VIRQ - virq number * IPI - IPI vector * EVTCHN - @@ -82,21 +93,29 @@ struct irq_info unsigned short virq; enum ipi_vector ipi; struct { + unsigned short pirq; unsigned short gsi; - unsigned short vector; + unsigned char vector; + unsigned char flags; } pirq; } u; }; +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) -static struct irq_info irq_info[NR_IRQS]; +static struct irq_info *irq_info; +static int *pirq_to_irq; -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; +static int *evtchn_to_irq; struct cpu_evtchn_s { unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; }; -static struct cpu_evtchn_s *cpu_evtchn_mask_p; + +static __initdata struct cpu_evtchn_s init_evtchn_mask = { + .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, +}; +static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; + static inline unsigned long *cpu_evtchn_mask(int cpu) { return cpu_evtchn_mask_p[cpu].bits; @@ -106,6 +125,8 @@ static inline unsigned long *cpu_evtchn_mask(int cpu) #define VALID_EVTCHN(chn) ((chn) != 0) static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_percpu_chip; +static struct irq_chip xen_pirq_chip; /* Constructor for packed IRQ information. */ static struct irq_info mk_unbound_info(void) @@ -131,11 +152,12 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) .cpu = 0, .u.virq = virq }; } -static struct irq_info mk_pirq_info(unsigned short evtchn, +static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq, unsigned short gsi, unsigned short vector) { return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, - .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } }; + .cpu = 0, + .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } }; } /* @@ -148,6 +170,9 @@ static struct irq_info *info_for_irq(unsigned irq) static unsigned int evtchn_from_irq(unsigned irq) { + if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) + return 0; + return info_for_irq(irq)->evtchn; } @@ -177,6 +202,16 @@ static unsigned virq_from_irq(unsigned irq) return info->u.virq; } +static unsigned pirq_from_irq(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.pirq; +} + static unsigned gsi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -218,6 +253,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) return ret; } +static bool pirq_needs_eoi(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.flags & PIRQ_NEEDS_EOI; +} + static inline unsigned long active_evtchns(unsigned int cpu, struct shared_info *sh, unsigned int idx) @@ -236,17 +280,17 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); #endif - __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); - __set_bit(chn, cpu_evtchn_mask(cpu)); + clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); + set_bit(chn, cpu_evtchn_mask(cpu)); irq_info[irq].cpu = cpu; } static void init_evtchn_cpu_bindings(void) { + int i; #ifdef CONFIG_SMP struct irq_desc *desc; - int i; /* By default all event channels notify CPU#0. */ for_each_irq_desc(i, desc) { @@ -254,7 +298,10 @@ static void init_evtchn_cpu_bindings(void) } #endif - memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0))); + for_each_possible_cpu(i) + memset(cpu_evtchn_mask(i), + (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s)); + } static inline void clear_evtchn(int port) @@ -311,7 +358,7 @@ static void unmask_evtchn(int port) struct evtchn_unmask unmask = { .port = port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); } else { - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); sync_clear_bit(port, &s->evtchn_mask[0]); @@ -329,27 +376,435 @@ static void unmask_evtchn(int port) put_cpu(); } -static int find_unbound_irq(void) +static int get_nr_hw_irqs(void) { - int irq; - struct irq_desc *desc; + int ret = 1; - for (irq = 0; irq < nr_irqs; irq++) - if (irq_info[irq].type == IRQT_UNBOUND) +#ifdef CONFIG_X86_IO_APIC + ret = get_nr_irqs_gsi(); +#endif + + return ret; +} + +static int find_unbound_pirq(int type) +{ + int rc, i; + struct physdev_get_free_pirq op_get_free_pirq; + op_get_free_pirq.type = type; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); + if (!rc) + return op_get_free_pirq.pirq; + + for (i = 0; i < nr_irqs; i++) { + if (pirq_to_irq[i] < 0) + return i; + } + return -1; +} + +static int find_unbound_irq(void) +{ + struct irq_data *data; + int irq, res; + int bottom = get_nr_hw_irqs(); + int top = nr_irqs-1; + + if (bottom == nr_irqs) + goto no_irqs; + + /* This loop starts from the top of IRQ space and goes down. + * We need this b/c if we have a PCI device in a Xen PV guest + * we do not have an IO-APIC (though the backend might have them) + * mapped in. To not have a collision of physical IRQs with the Xen + * event channels start at the top of the IRQ space for virtual IRQs. + */ + for (irq = top; irq > bottom; irq--) { + data = irq_get_irq_data(irq); + /* only 15->0 have init'd desc; handle irq > 16 */ + if (!data) + break; + if (data->chip == &no_irq_chip) break; + if (data->chip != &xen_dynamic_chip) + continue; + if (irq_info[irq].type == IRQT_UNBOUND) + return irq; + } + + if (irq == bottom) + goto no_irqs; - if (irq == nr_irqs) - panic("No available IRQ to bind to: increase nr_irqs!\n"); + res = irq_alloc_desc_at(irq, -1); - desc = irq_to_desc_alloc_node(irq, 0); - if (WARN_ON(desc == NULL)) + if (WARN_ON(res != irq)) return -1; - dynamic_irq_init(irq); + return irq; + +no_irqs: + panic("No available IRQ to bind to: increase nr_irqs!\n"); +} + +static bool identity_mapped_irq(unsigned irq) +{ + /* identity map all the hardware irqs */ + return irq < get_nr_hw_irqs(); +} + +static void pirq_unmask_notify(int irq) +{ + struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; + + if (unlikely(pirq_needs_eoi(irq))) { + int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void pirq_query_unmask(int irq) +{ + struct physdev_irq_status_query irq_status; + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + irq_status.irq = pirq_from_irq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + irq_status.flags = 0; + + info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; + if (irq_status.flags & XENIRQSTAT_needs_eoi) + info->u.pirq.flags |= PIRQ_NEEDS_EOI; +} + +static bool probing_irq(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc && desc->action == NULL; +} + +static unsigned int startup_pirq(unsigned int irq) +{ + struct evtchn_bind_pirq bind_pirq; + struct irq_info *info = info_for_irq(irq); + int evtchn = evtchn_from_irq(irq); + int rc; + + BUG_ON(info->type != IRQT_PIRQ); + + if (VALID_EVTCHN(evtchn)) + goto out; + + bind_pirq.pirq = pirq_from_irq(irq); + /* NB. We are happy to share unless we are probing. */ + bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? + BIND_PIRQ__WILL_SHARE : 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); + if (rc != 0) { + if (!probing_irq(irq)) + printk(KERN_INFO "Failed to obtain physical IRQ %d\n", + irq); + return 0; + } + evtchn = bind_pirq.port; + + pirq_query_unmask(irq); + + evtchn_to_irq[evtchn] = irq; + bind_evtchn_to_cpu(evtchn, 0); + info->evtchn = evtchn; + +out: + unmask_evtchn(evtchn); + pirq_unmask_notify(irq); + + return 0; +} + +static void shutdown_pirq(unsigned int irq) +{ + struct evtchn_close close; + struct irq_info *info = info_for_irq(irq); + int evtchn = evtchn_from_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + if (!VALID_EVTCHN(evtchn)) + return; + + mask_evtchn(evtchn); + + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + info->evtchn = 0; +} + +static void enable_pirq(unsigned int irq) +{ + startup_pirq(irq); +} + +static void disable_pirq(unsigned int irq) +{ +} + +static void ack_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + mask_evtchn(evtchn); + clear_evtchn(evtchn); + } +} + +static void end_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_desc *desc = irq_to_desc(irq); + + if (WARN_ON(!desc)) + return; + + if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == + (IRQ_DISABLED|IRQ_PENDING)) { + shutdown_pirq(irq); + } else if (VALID_EVTCHN(evtchn)) { + unmask_evtchn(evtchn); + pirq_unmask_notify(irq); + } +} + +static int find_irq_by_gsi(unsigned gsi) +{ + int irq; + + for (irq = 0; irq < nr_irqs; irq++) { + struct irq_info *info = info_for_irq(irq); + + if (info == NULL || info->type != IRQT_PIRQ) + continue; + + if (gsi_from_irq(irq) == gsi) + return irq; + } + + return -1; +} + +int xen_allocate_pirq(unsigned gsi, int shareable, char *name) +{ + return xen_map_pirq_gsi(gsi, gsi, shareable, name); +} + +/* xen_map_pirq_gsi might allocate irqs from the top down, as a + * consequence don't assume that the irq number returned has a low value + * or can be used as a pirq number unless you know otherwise. + * + * One notable exception is when xen_map_pirq_gsi is called passing an + * hardware gsi as argument, in that case the irq number returned + * matches the gsi number passed as second argument. + * + * Note: We don't assign an event channel until the irq actually started + * up. Return an existing irq if we've already got one for the gsi. + */ +int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) +{ + int irq = 0; + struct physdev_irq irq_op; + + spin_lock(&irq_mapping_update_lock); + + if ((pirq > nr_irqs) || (gsi > nr_irqs)) { + printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n", + pirq > nr_irqs ? "pirq" :"", + gsi > nr_irqs ? "gsi" : ""); + goto out; + } + + irq = find_irq_by_gsi(gsi); + if (irq != -1) { + printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", + irq, gsi); + goto out; /* XXX need refcount? */ + } + + /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore + * we are using the !xen_initial_domain() to drop in the function.*/ + if (identity_mapped_irq(gsi) || (!xen_initial_domain() && + xen_pv_domain())) { + irq = gsi; + irq_alloc_desc_at(irq, -1); + } else + irq = find_unbound_irq(); + + set_irq_chip_and_handler_name(irq, &xen_pirq_chip, + handle_level_irq, name); + + irq_op.irq = irq; + irq_op.vector = 0; + + /* Only the privileged domain can do this. For non-priv, the pcifront + * driver provides a PCI bus that does the call to do exactly + * this in the priv domain. */ + if (xen_initial_domain() && + HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + irq_free_desc(irq); + irq = -ENOSPC; + goto out; + } + + irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector); + irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0; + pirq_to_irq[pirq] = irq; + +out: + spin_unlock(&irq_mapping_update_lock); return irq; } +#ifdef CONFIG_PCI_MSI +#include <linux/msi.h> +#include "../pci/msi.h" + +void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) +{ + spin_lock(&irq_mapping_update_lock); + + if (alloc & XEN_ALLOC_IRQ) { + *irq = find_unbound_irq(); + if (*irq == -1) + goto out; + } + + if (alloc & XEN_ALLOC_PIRQ) { + *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); + if (*pirq == -1) + goto out; + } + + set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, + handle_level_irq, name); + + irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); + pirq_to_irq[*pirq] = *irq; + +out: + spin_unlock(&irq_mapping_update_lock); +} + +int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) +{ + int irq = -1; + struct physdev_map_pirq map_irq; + int rc; + int pos; + u32 table_offset, bir; + + memset(&map_irq, 0, sizeof(map_irq)); + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; + + if (type == PCI_CAP_ID_MSIX) { + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + + pci_read_config_dword(dev, msix_table_offset_reg(pos), + &table_offset); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + + map_irq.table_base = pci_resource_start(dev, bir); + map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + } + + spin_lock(&irq_mapping_update_lock); + + irq = find_unbound_irq(); + + if (irq == -1) + goto out; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + + irq_free_desc(irq); + + irq = -1; + goto out; + } + irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); + + set_irq_chip_and_handler_name(irq, &xen_pirq_chip, + handle_level_irq, + (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); + +out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} +#endif + +int xen_destroy_irq(int irq) +{ + struct irq_desc *desc; + struct physdev_unmap_pirq unmap_irq; + struct irq_info *info = info_for_irq(irq); + int rc = -ENOENT; + + spin_lock(&irq_mapping_update_lock); + + desc = irq_to_desc(irq); + if (!desc) + goto out; + + if (xen_initial_domain()) { + unmap_irq.pirq = info->u.pirq.pirq; + unmap_irq.domid = DOMID_SELF; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); + if (rc) { + printk(KERN_WARNING "unmap irq failed %d\n", rc); + goto out; + } + pirq_to_irq[info->u.pirq.pirq] = -1; + } + irq_info[irq] = mk_unbound_info(); + + irq_free_desc(irq); + +out: + spin_unlock(&irq_mapping_update_lock); + return rc; +} + +int xen_vector_from_irq(unsigned irq) +{ + return vector_from_irq(irq); +} + +int xen_gsi_from_irq(unsigned irq) +{ + return gsi_from_irq(irq); +} + +int xen_irq_from_pirq(unsigned pirq) +{ + return pirq_to_irq[pirq]; +} + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; @@ -362,7 +817,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq = find_unbound_irq(); set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); + handle_fasteoi_irq, "event"); evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_evtchn_info(evtchn); @@ -388,8 +843,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) if (irq < 0) goto out; - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); + set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "ipi"); bind_ipi.vcpu = cpu; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, @@ -410,7 +865,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) } -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; int evtchn, irq; @@ -420,6 +875,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { + irq = find_unbound_irq(); + + set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "virq"); + bind_virq.virq = virq; bind_virq.vcpu = cpu; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, @@ -427,11 +887,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) BUG(); evtchn = bind_virq.port; - irq = find_unbound_irq(); - - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_virq_info(evtchn, virq); @@ -474,9 +929,12 @@ static void unbind_from_irq(unsigned int irq) bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; + } + + if (irq_info[irq].type != IRQT_UNBOUND) { irq_info[irq] = mk_unbound_info(); - dynamic_irq_cleanup(irq); + irq_free_desc(irq); } spin_unlock(&irq_mapping_update_lock); @@ -532,6 +990,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; + irqflags |= IRQF_NO_SUSPEND; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -559,41 +1018,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) { struct shared_info *sh = HYPERVISOR_shared_info; int cpu = smp_processor_id(); + unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); int i; unsigned long flags; static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; spin_lock_irqsave(&debug_lock, flags); - printk("vcpu %d\n ", cpu); + printk("\nvcpu %d\n ", cpu); for_each_online_cpu(i) { - struct vcpu_info *v = per_cpu(xen_vcpu, i); - printk("%d: masked=%d pending=%d event_sel %08lx\n ", i, - (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask, - v->evtchn_upcall_pending, - v->evtchn_pending_sel); - } - printk("pending:\n "); - for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nmasks:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nunmasked:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*lx%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) + printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + unsigned long pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } printk("\npending list:\n"); - for(i = 0; i < NR_EVENT_CHANNELS; i++) { + for (i = 0; i < NR_EVENT_CHANNELS; i++) { if (sync_test_bit(i, sh->evtchn_pending)) { - printk(" %d: event %d -> irq %d\n", + int word_idx = i / BITS_PER_LONG; + printk(" %d: event %d -> irq %d%s%s%s\n", cpu_from_evtchn(i), i, - evtchn_to_irq[i]); + evtchn_to_irq[i], + sync_test_bit(word_idx, &v->evtchn_pending_sel) + ? "" : " l2-clear", + !sync_test_bit(i, sh->evtchn_mask) + ? "" : " globally-masked", + sync_test_bit(i, cpu_evtchn) + ? "" : " locally-masked"); } } @@ -602,6 +1095,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -611,24 +1106,19 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -void xen_evtchn_do_upcall(struct pt_regs *regs) +static void __xen_evtchn_do_upcall(void) { int cpu = get_cpu(); - struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); unsigned count; - exit_idle(); - irq_enter(); - do { unsigned long pending_words; vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -645,24 +1135,48 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) int bit_idx = __ffs(pending_bits); int port = (word_idx * BITS_PER_LONG) + bit_idx; int irq = evtchn_to_irq[port]; + struct irq_desc *desc; - if (irq != -1) - handle_irq(irq, regs); + mask_evtchn(port); + clear_evtchn(port); + + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } } } BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; - } while(count != 1); + count = __this_cpu_read(xed_nesting_count); + __this_cpu_write(xed_nesting_count, 0); + } while (count != 1 || vcpu_info->evtchn_upcall_pending); out: + + put_cpu(); +} + +void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + exit_idle(); + irq_enter(); + + __xen_evtchn_do_upcall(); + irq_exit(); set_irq_regs(old_regs); +} - put_cpu(); +void xen_hvm_evtchn_do_upcall(void) +{ + __xen_evtchn_do_upcall(); } +EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); /* Rebind a new event channel to an existing irq. */ void rebind_evtchn_irq(int evtchn, int irq) @@ -699,7 +1213,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); - if (!VALID_EVTCHN(evtchn)) + /* events delivered via platform PCI interrupts are always + * routed to vcpu 0 */ + if (!VALID_EVTCHN(evtchn) || + (xen_hvm_domain() && !xen_have_vector_callback)) return -1; /* Send future instances of this interrupt to other vcpu. */ @@ -760,10 +1277,10 @@ static void ack_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); - move_native_irq(irq); + move_masked_irq(irq); if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); + unmask_evtchn(evtchn); } static int retrigger_dynirq(unsigned int irq) @@ -785,6 +1302,42 @@ static int retrigger_dynirq(unsigned int irq) return ret; } +static void restore_cpu_pirqs(void) +{ + int pirq, rc, irq, gsi; + struct physdev_map_pirq map_irq; + + for (pirq = 0; pirq < nr_irqs; pirq++) { + irq = pirq_to_irq[pirq]; + if (irq == -1) + continue; + + /* save/restore of PT devices doesn't work, so at this point the + * only devices present are GSI based emulated devices */ + gsi = gsi_from_irq(irq); + if (!gsi) + continue; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = pirq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); + irq_info[irq] = mk_unbound_info(); + pirq_to_irq[pirq] = -1; + continue; + } + + printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + + startup_pirq(irq); + } +} + static void restore_cpu_virqs(unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -808,9 +1361,6 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_virq_info(evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); - - /* Ready for use. */ - unmask_evtchn(evtchn); } } @@ -836,10 +1386,6 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_ipi_info(evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); - - /* Ready for use. */ - unmask_evtchn(evtchn); - } } @@ -851,7 +1397,7 @@ void xen_clear_irq_pending(int irq) if (VALID_EVTCHN(evtchn)) clear_evtchn(evtchn); } - +EXPORT_SYMBOL(xen_clear_irq_pending); void xen_set_irq_pending(int irq) { int evtchn = evtchn_from_irq(irq); @@ -871,9 +1417,9 @@ bool xen_test_irq_pending(int irq) return ret; } -/* Poll waiting for an irq to become pending. In the usual case, the - irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq(int irq) +/* Poll waiting for an irq to become pending with timeout. In the usual case, + * the irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq_timeout(int irq, u64 timeout) { evtchn_port_t evtchn = evtchn_from_irq(irq); @@ -881,17 +1427,25 @@ void xen_poll_irq(int irq) struct sched_poll poll; poll.nr_ports = 1; - poll.timeout = 0; + poll.timeout = timeout; set_xen_guest_handle(poll.ports, &evtchn); if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) BUG(); } } +EXPORT_SYMBOL(xen_poll_irq_timeout); +/* Poll waiting for an irq to become pending. In the usual case, the + * irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq) +{ + xen_poll_irq_timeout(irq, 0 /* no timeout */); +} void xen_irq_resume(void) { unsigned int cpu, irq, evtchn; + struct irq_desc *desc; init_evtchn_cpu_bindings(); @@ -910,6 +1464,25 @@ void xen_irq_resume(void) restore_cpu_virqs(cpu); restore_cpu_ipis(cpu); } + + /* + * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These + * are not handled by the IRQ core. + */ + for_each_irq_desc(irq, desc) { + if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) + continue; + if (desc->status & IRQ_DISABLED) + continue; + + evtchn = evtchn_from_irq(irq); + if (evtchn == -1) + continue; + + unmask_evtchn(evtchn); + } + + restore_cpu_pirqs(); } static struct irq_chip xen_dynamic_chip __read_mostly = { @@ -919,18 +1492,98 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .mask = disable_dynirq, .unmask = enable_dynirq, - .ack = ack_dynirq, + .eoi = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_pirq_chip __read_mostly = { + .name = "xen-pirq", + + .startup = startup_pirq, + .shutdown = shutdown_pirq, + + .enable = enable_pirq, + .unmask = enable_pirq, + + .disable = disable_pirq, + .mask = disable_pirq, + + .ack = ack_pirq, + .end = end_pirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, }; +static struct irq_chip xen_percpu_chip __read_mostly = { + .name = "xen-percpu", + + .disable = disable_dynirq, + .mask = disable_dynirq, + .unmask = enable_dynirq, + + .ack = ack_dynirq, +}; + +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + return HYPERVISOR_hvm_op(HVMOP_set_param, &a); +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + +#ifdef CONFIG_XEN_PVHVM +/* Vector callbacks are better than PCI interrupts to receive event + * channel notifications because we can receive vector callbacks on any + * vcpu and we don't need PCI support or APIC interactions. */ +void xen_callback_vector(void) +{ + int rc; + uint64_t callback_via; + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + rc = xen_set_callback_via(callback_via); + if (rc) { + printk(KERN_ERR "Request for Xen HVM callback vector" + " failed.\n"); + xen_have_vector_callback = 0; + return; + } + printk(KERN_INFO "Xen HVM callback vector for event delivery is " + "enabled\n"); + /* in the restore case the vector has already been allocated */ + if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) + alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + } +} +#else +void xen_callback_vector(void) {} +#endif + void __init xen_init_IRQ(void) { int i; cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), GFP_KERNEL); - BUG_ON(cpu_evtchn_mask_p == NULL); + irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); + + /* We are using nr_irqs as the maximum number of pirq available but + * that number is actually chosen by Xen and we don't know exactly + * what it is. Be careful choosing high pirq numbers. */ + pirq_to_irq = kcalloc(nr_irqs, sizeof(*pirq_to_irq), GFP_KERNEL); + for (i = 0; i < nr_irqs; i++) + pirq_to_irq[i] = -1; + + evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), + GFP_KERNEL); + for (i = 0; i < NR_EVENT_CHANNELS; i++) + evtchn_to_irq[i] = -1; init_evtchn_cpu_bindings(); @@ -938,5 +1591,15 @@ void __init xen_init_IRQ(void) for (i = 0; i < NR_EVENT_CHANNELS; i++) mask_evtchn(i); - irq_ctx_init(smp_processor_id()); + if (xen_hvm_domain()) { + xen_callback_vector(); + native_init_IRQ(); + /* pci_xen_hvm_init must be called after native_init_IRQ so that + * __acpi_register_gsi can point at the right function */ + pci_xen_hvm_init(); + } else { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + xen_setup_pirqs(); + } } diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index af03195..ef11daf 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -38,7 +38,6 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/fs.h> -#include <linux/errno.h> #include <linux/miscdevice.h> #include <linux/major.h> #include <linux/proc_fs.h> @@ -46,9 +45,10 @@ #include <linux/poll.h> #include <linux/irq.h> #include <linux/init.h> -#include <linux/gfp.h> #include <linux/mutex.h> #include <linux/cpu.h> + +#include <xen/xen.h> #include <xen/events.h> #include <xen/evtchn.h> #include <asm/xen/hypervisor.h> @@ -69,20 +69,51 @@ struct per_user_data { const char *name; }; -/* Who's bound to each port? */ -static struct per_user_data *port_user[NR_EVENT_CHANNELS]; +/* + * Who's bound to each port? This is logically an array of struct + * per_user_data *, but we encode the current enabled-state in bit 0. + */ +static unsigned long *port_user; static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ -irqreturn_t evtchn_interrupt(int irq, void *data) +static inline struct per_user_data *get_port_user(unsigned port) +{ + return (struct per_user_data *)(port_user[port] & ~1); +} + +static inline void set_port_user(unsigned port, struct per_user_data *u) +{ + port_user[port] = (unsigned long)u; +} + +static inline bool get_port_enabled(unsigned port) +{ + return port_user[port] & 1; +} + +static inline void set_port_enabled(unsigned port, bool enabled) +{ + if (enabled) + port_user[port] |= 1; + else + port_user[port] &= ~1; +} + +static irqreturn_t evtchn_interrupt(int irq, void *data) { unsigned int port = (unsigned long)data; struct per_user_data *u; spin_lock(&port_user_lock); - u = port_user[port]; + u = get_port_user(port); + + WARN(!get_port_enabled(port), + "Interrupt for port %d, but apparently not enabled; per-user %p\n", + port, u); disable_irq_nosync(irq); + set_port_enabled(port, false); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; @@ -92,9 +123,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data) kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); } - } else { + } else u->ring_overflow = 1; - } spin_unlock(&port_user_lock); @@ -198,9 +228,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, goto out; spin_lock_irq(&port_user_lock); - for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) - if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) - enable_irq(irq_from_evtchn(kbuf[i])); + + for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { + unsigned port = kbuf[i]; + + if (port < NR_EVENT_CHANNELS && + get_port_user(port) == u && + !get_port_enabled(port)) { + set_port_enabled(port, true); + enable_irq(irq_from_evtchn(port)); + } + } + spin_unlock_irq(&port_user_lock); rc = count; @@ -222,8 +261,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) */ - BUG_ON(port_user[port] != NULL); - port_user[port] = u; + BUG_ON(get_port_user(port) != NULL); + set_port_user(port, u); + set_port_enabled(port, true); /* start enabled */ rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, u->name, (void *)(unsigned long)port); @@ -239,10 +279,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port) unbind_from_irqhandler(irq, (void *)(unsigned long)port); - /* make sure we unbind the irq handler before clearing the port */ - barrier(); - - port_user[port] = NULL; + set_port_user(port, NULL); } static long evtchn_ioctl(struct file *file, @@ -333,15 +370,17 @@ static long evtchn_ioctl(struct file *file, spin_lock_irq(&port_user_lock); rc = -ENOTCONN; - if (port_user[unbind.port] != u) { + if (get_port_user(unbind.port) != u) { spin_unlock_irq(&port_user_lock); break; } - evtchn_unbind_from_user(u, unbind.port); + disable_irq(irq_from_evtchn(unbind.port)); spin_unlock_irq(&port_user_lock); + evtchn_unbind_from_user(u, unbind.port); + rc = 0; break; } @@ -355,7 +394,7 @@ static long evtchn_ioctl(struct file *file, if (notify.port >= NR_EVENT_CHANNELS) { rc = -EINVAL; - } else if (port_user[notify.port] != u) { + } else if (get_port_user(notify.port) != u) { rc = -ENOTCONN; } else { notify_remote_via_evtchn(notify.port); @@ -431,7 +470,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) filp->private_data = u; - return 0; + return nonseekable_open(inode, filp);; } static int evtchn_release(struct inode *inode, struct file *filp) @@ -444,14 +483,21 @@ static int evtchn_release(struct inode *inode, struct file *filp) free_page((unsigned long)u->ring); for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (port_user[i] != u) + if (get_port_user(i) != u) continue; - evtchn_unbind_from_user(port_user[i], i); + disable_irq(irq_from_evtchn(i)); } spin_unlock_irq(&port_user_lock); + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + if (get_port_user(i) != u) + continue; + + evtchn_unbind_from_user(get_port_user(i), i); + } + kfree(u->name); kfree(u); @@ -467,11 +513,12 @@ static const struct file_operations evtchn_fops = { .fasync = evtchn_fasync, .open = evtchn_open, .release = evtchn_release, + .llseek = no_llseek, }; static struct miscdevice evtchn_miscdev = { .minor = MISC_DYNAMIC_MINOR, - .name = "evtchn", + .name = "xen/evtchn", .fops = &evtchn_fops, }; static int __init evtchn_init(void) @@ -481,8 +528,11 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; + port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); + if (port_user == NULL) + return -ENOMEM; + spin_lock_init(&port_user_lock); - memset(port_user, 0, sizeof(port_user)); /* Create '/dev/misc/evtchn'. */ err = misc_register(&evtchn_miscdev); @@ -498,6 +548,9 @@ static int __init evtchn_init(void) static void __exit evtchn_cleanup(void) { + kfree(port_user); + port_user = NULL; + misc_deregister(&evtchn_miscdev); } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c new file mode 100644 index 0000000..1e31cdc --- /dev/null +++ b/drivers/xen/gntdev.c @@ -0,0 +1,665 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * (c) 2009 Gerd Hoffmann <kraxel@redhat.com> + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmu_notifier.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> + +#include <xen/xen.h> +#include <xen/grant_table.h> +#include <xen/gntdev.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " + "Gerd Hoffmann <kraxel@redhat.com>"); +MODULE_DESCRIPTION("User-space granted page access driver"); + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at " + "once by a gntdev instance"); + +struct gntdev_priv { + struct list_head maps; + uint32_t used; + uint32_t limit; + /* lock protects maps from concurrent changes */ + spinlock_t lock; + struct mm_struct *mm; + struct mmu_notifier mn; +}; + +struct grant_map { + struct list_head next; + struct gntdev_priv *priv; + struct vm_area_struct *vma; + int index; + int count; + int flags; + int is_mapped; + struct ioctl_gntdev_grant_ref *grants; + struct gnttab_map_grant_ref *map_ops; + struct gnttab_unmap_grant_ref *unmap_ops; + struct page **pages; +}; + +/* ------------------------------------------------------------------ */ + +static void gntdev_print_maps(struct gntdev_priv *priv, + char *text, int text_index) +{ +#ifdef DEBUG + struct grant_map *map; + + pr_debug("maps list (priv %p, usage %d/%d)\n", + priv, priv->used, priv->limit); + + list_for_each_entry(map, &priv->maps, next) + pr_debug(" index %2d, count %2d %s\n", + map->index, map->count, + map->index == text_index && text ? text : ""); +#endif +} + +static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +{ + struct grant_map *add; + int i; + + add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); + if (NULL == add) + return NULL; + + add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); + add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); + add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops || + NULL == add->pages) + goto err; + + for (i = 0; i < count; i++) { + add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (add->pages[i] == NULL) + goto err; + } + + add->index = 0; + add->count = count; + add->priv = priv; + + if (add->count + priv->used > priv->limit) + goto err; + + return add; + +err: + if (add->pages) + for (i = 0; i < count; i++) { + if (add->pages[i]) + __free_page(add->pages[i]); + } + kfree(add->pages); + kfree(add->grants); + kfree(add->map_ops); + kfree(add->unmap_ops); + kfree(add); + return NULL; +} + +static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (add->index + add->count < map->index) { + list_add_tail(&add->next, &map->next); + goto done; + } + add->index = map->index + map->count; + } + list_add_tail(&add->next, &priv->maps); + +done: + priv->used += add->count; + gntdev_print_maps(priv, "[new]", add->index); +} + +static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, + int index, int count) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (map->index != index) + continue; + if (map->count != count) + continue; + return map; + } + return NULL; +} + +static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, + unsigned long vaddr) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (vaddr < map->vma->vm_start) + continue; + if (vaddr >= map->vma->vm_end) + continue; + return map; + } + return NULL; +} + +static int gntdev_del_map(struct grant_map *map) +{ + int i; + + if (map->vma) + return -EBUSY; + for (i = 0; i < map->count; i++) + if (map->unmap_ops[i].handle) + return -EBUSY; + + map->priv->used -= map->count; + list_del(&map->next); + return 0; +} + +static void gntdev_free_map(struct grant_map *map) +{ + int i; + + if (!map) + return; + + if (map->pages) + for (i = 0; i < map->count; i++) { + if (map->pages[i]) + __free_page(map->pages[i]); + } + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map); +} + +/* ------------------------------------------------------------------ */ + +static int find_grant_ptes(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + struct grant_map *map = data; + unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + u64 pte_maddr; + + BUG_ON(pgnr >= map->count); + pte_maddr = arbitrary_virt_to_machine(pte).maddr; + + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, + map->grants[pgnr].ref, + map->grants[pgnr].domid); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, + 0 /* handle */); + return 0; +} + +static int map_grant_pages(struct grant_map *map) +{ + int i, err = 0; + + pr_debug("map %d+%d\n", map->index, map->count); + err = gnttab_map_refs(map->map_ops, map->pages, map->count); + if (err) + return err; + + for (i = 0; i < map->count; i++) { + if (map->map_ops[i].status) + err = -EINVAL; + map->unmap_ops[i].handle = map->map_ops[i].handle; + } + return err; +} + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int i, err = 0; + + pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages); + if (err) + return err; + + for (i = 0; i < pages; i++) { + if (map->unmap_ops[offset+i].status) + err = -EINVAL; + map->unmap_ops[offset+i].handle = 0; + } + return err; +} + +/* ------------------------------------------------------------------ */ + +static void gntdev_vma_close(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("close %p\n", vma); + map->is_mapped = 0; + map->vma = NULL; + vma->vm_private_data = NULL; +} + +static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n", + vmf->virtual_address, vmf->pgoff); + vmf->flags = VM_FAULT_ERROR; + return 0; +} + +static struct vm_operations_struct gntdev_vmops = { + .close = gntdev_vma_close, + .fault = gntdev_vma_fault, +}; + +/* ------------------------------------------------------------------ */ + +static void mn_invl_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + unsigned long mstart, mend; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (!map->is_mapped) + continue; + if (map->vma->vm_start >= end) + continue; + if (map->vma->vm_end <= start) + continue; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +static void mn_invl_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); +} + +static void mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +struct mmu_notifier_ops gntdev_mmu_ops = { + .release = mn_release, + .invalidate_page = mn_invl_page, + .invalidate_range_start = mn_invl_range_start, +}; + +/* ------------------------------------------------------------------ */ + +static int gntdev_open(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv; + int ret = 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + INIT_LIST_HEAD(&priv->maps); + spin_lock_init(&priv->lock); + priv->limit = limit; + + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); + + if (ret) { + kfree(priv); + return ret; + } + + flip->private_data = priv; + pr_debug("priv %p\n", priv); + + return 0; +} + +static int gntdev_release(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv = flip->private_data; + struct grant_map *map; + int err; + + pr_debug("priv %p\n", priv); + + spin_lock(&priv->lock); + while (!list_empty(&priv->maps)) { + map = list_entry(priv->maps.next, struct grant_map, next); + err = gntdev_del_map(map); + if (WARN_ON(err)) + gntdev_free_map(map); + + } + spin_unlock(&priv->lock); + + mmu_notifier_unregister(&priv->mn, priv->mm); + kfree(priv); + return 0; +} + +static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_map_grant_ref __user *u) +{ + struct ioctl_gntdev_map_grant_ref op; + struct grant_map *map; + int err; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, add %d\n", priv, op.count); + if (unlikely(op.count <= 0)) + return -EINVAL; + if (unlikely(op.count > priv->limit)) + return -EINVAL; + + err = -ENOMEM; + map = gntdev_alloc_map(priv, op.count); + if (!map) + return err; + if (copy_from_user(map->grants, &u->refs, + sizeof(map->grants[0]) * op.count) != 0) { + gntdev_free_map(map); + return err; + } + + spin_lock(&priv->lock); + gntdev_add_map(priv, map); + op.index = map->index << PAGE_SHIFT; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) { + spin_lock(&priv->lock); + gntdev_del_map(map); + spin_unlock(&priv->lock); + gntdev_free_map(map); + return err; + } + return 0; +} + +static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_unmap_grant_ref __user *u) +{ + struct ioctl_gntdev_unmap_grant_ref op; + struct grant_map *map; + int err = -ENOENT; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); + if (map) + err = gntdev_del_map(map); + spin_unlock(&priv->lock); + if (!err) + gntdev_free_map(map); + return err; +} + +static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, + struct ioctl_gntdev_get_offset_for_vaddr __user *u) +{ + struct ioctl_gntdev_get_offset_for_vaddr op; + struct grant_map *map; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + + spin_lock(&priv->lock); + map = gntdev_find_map_vaddr(priv, op.vaddr); + if (map == NULL || + map->vma->vm_start != op.vaddr) { + spin_unlock(&priv->lock); + return -EINVAL; + } + op.offset = map->index << PAGE_SHIFT; + op.count = map->count; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) + return -EFAULT; + return 0; +} + +static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, + struct ioctl_gntdev_set_max_grants __user *u) +{ + struct ioctl_gntdev_set_max_grants op; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, limit %d\n", priv, op.count); + if (op.count > limit) + return -E2BIG; + + spin_lock(&priv->lock); + priv->limit = op.count; + spin_unlock(&priv->lock); + return 0; +} + +static long gntdev_ioctl(struct file *flip, + unsigned int cmd, unsigned long arg) +{ + struct gntdev_priv *priv = flip->private_data; + void __user *ptr = (void __user *)arg; + + switch (cmd) { + case IOCTL_GNTDEV_MAP_GRANT_REF: + return gntdev_ioctl_map_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_UNMAP_GRANT_REF: + return gntdev_ioctl_unmap_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: + return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); + + case IOCTL_GNTDEV_SET_MAX_GRANTS: + return gntdev_ioctl_set_max_grants(priv, ptr); + + default: + pr_debug("priv %p, unknown cmd %x\n", priv, cmd); + return -ENOIOCTLCMD; + } + + return 0; +} + +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) +{ + struct gntdev_priv *priv = flip->private_data; + int index = vma->vm_pgoff; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + struct grant_map *map; + int err = -EINVAL; + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + pr_debug("map %d+%d at %lx (pgoff %lx)\n", + index, count, vma->vm_start, vma->vm_pgoff); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, index, count); + if (!map) + goto unlock_out; + if (map->vma) + goto unlock_out; + if (priv->mm != vma->vm_mm) { + printk(KERN_WARNING "Huh? Other mm?\n"); + goto unlock_out; + } + + vma->vm_ops = &gntdev_vmops; + + vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; + + vma->vm_private_data = map; + map->vma = vma; + + map->flags = GNTMAP_host_map | GNTMAP_application_map; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + + spin_unlock(&priv->lock); + + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + printk(KERN_WARNING "find_grant_ptes() failure.\n"); + return err; + } + + err = map_grant_pages(map); + if (err) { + printk(KERN_WARNING "map_grant_pages() failure.\n"); + return err; + } + + map->is_mapped = 1; + + return 0; + +unlock_out: + spin_unlock(&priv->lock); + return err; +} + +static const struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .unlocked_ioctl = gntdev_ioctl +}; + +static struct miscdevice gntdev_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntdev", + .fops = &gntdev_fops, +}; + +/* ------------------------------------------------------------------ */ + +static int __init gntdev_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntdev_miscdev); + if (err != 0) { + printk(KERN_ERR "Could not register gntdev device\n"); + return err; + } + return 0; +} + +static void __exit gntdev_exit(void) +{ + misc_deregister(&gntdev_miscdev); +} + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* ------------------------------------------------------------------ */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7d8f531..9ef54eb 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -34,12 +34,16 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <linux/io.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/page.h> #include <xen/grant_table.h> +#include <xen/interface/memory.h> #include <asm/xen/hypercall.h> #include <asm/pgtable.h> @@ -57,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); +unsigned long xen_hvm_resume_frames; +EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); static struct grant_entry *shared; @@ -431,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void) return query.max_nr_frames; } -static inline unsigned int max_nr_grant_frames(void) +unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); @@ -439,6 +445,53 @@ static inline unsigned int max_nr_grant_frames(void) return boot_max_nr_grant_frames; return xen_max; } +EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); + +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + pte_t *pte; + unsigned long mfn; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + /* m2p override only supported for GNTMAP_contains_pte mappings */ + if (!(map_ops[i].flags & GNTMAP_contains_pte)) + continue; + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + ret = m2p_add_override(mfn, pages[i]); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_map_refs); + +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + ret = m2p_remove_override(pages[i]); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { @@ -447,6 +500,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; + if (xen_hvm_domain()) { + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + rc = 0; + /* + * Loop backwards, so that the first hypercall has the largest + * index, ensuring that the table will grow only once. + */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc != 0) { + printk(KERN_WARNING + "grant table add_to_physmap failed, err=%d\n", rc); + break; + } + } while (i-- > start_idx); + + return rc; + } + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; @@ -463,7 +540,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) BUG_ON(rc || setup.status); - rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(), + rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), &shared); BUG_ON(rc); @@ -474,9 +551,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) int gnttab_resume(void) { - if (max_nr_grant_frames() < nr_grant_frames) + unsigned int max_nr_gframes; + + max_nr_gframes = gnttab_max_grant_frames(); + if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - return gnttab_map(0, nr_grant_frames - 1); + + if (xen_pv_domain()) + return gnttab_map(0, nr_grant_frames - 1); + + if (!shared) { + shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); + if (shared == NULL) { + printk(KERN_WARNING + "Failed to ioremap gnttab share frames!"); + return -ENOMEM; + } + } + + gnttab_map(0, nr_grant_frames - 1); + + return 0; } int gnttab_suspend(void) @@ -493,7 +588,7 @@ static int gnttab_expand(unsigned int req_entries) cur = nr_grant_frames; extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / GREFS_PER_GRANT_FRAME); - if (cur + extra > max_nr_grant_frames()) + if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; rc = gnttab_map(cur, cur + extra - 1); @@ -503,15 +598,12 @@ static int gnttab_expand(unsigned int req_entries) return rc; } -static int __devinit gnttab_init(void) +int gnttab_init(void) { int i; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; - if (!xen_domain()) - return -ENODEV; - nr_grant_frames = 1; boot_max_nr_grant_frames = __max_nr_grant_frames(); @@ -554,5 +646,18 @@ static int __devinit gnttab_init(void) kfree(gnttab_list); return -ENOMEM; } +EXPORT_SYMBOL_GPL(gnttab_init); + +static int __devinit __gnttab_init(void) +{ + /* Delay grant-table initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return gnttab_init(); +} -core_initcall(gnttab_init); +core_initcall(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 10d03d7..db8c4c4 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -3,11 +3,13 @@ */ #include <linux/kernel.h> #include <linux/err.h> +#include <linux/slab.h> #include <linux/reboot.h> #include <linux/sysrq.h> #include <linux/stop_machine.h> #include <linux/freezer.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/grant_table.h> #include <xen/events.h> @@ -16,6 +18,7 @@ #include <asm/xen/hypercall.h> #include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> enum shutdown_state { SHUTDOWN_INVALID = -1, @@ -32,10 +35,31 @@ enum shutdown_state { static enum shutdown_state shutting_down = SHUTDOWN_INVALID; #ifdef CONFIG_PM_SLEEP -static int xen_suspend(void *data) +static int xen_hvm_suspend(void *data) { + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; int *cancelled = data; + + BUG_ON(!irqs_disabled()); + + *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); + + xen_hvm_post_suspend(*cancelled); + gnttab_resume(); + + if (!*cancelled) { + xen_irq_resume(); + xen_console_resume(); + xen_timer_resume(); + } + + return 0; +} + +static int xen_suspend(void *data) +{ int err; + int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -43,7 +67,6 @@ static int xen_suspend(void *data) if (err) { printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", err); - dpm_resume_noirq(PMSG_RESUME); return err; } @@ -69,7 +92,6 @@ static int xen_suspend(void *data) } sysdev_resume(); - dpm_resume_noirq(PMSG_RESUME); return 0; } @@ -88,14 +110,14 @@ static void do_suspend(void) err = freeze_processes(); if (err) { printk(KERN_ERR "xen suspend: freeze failed %d\n", err); - return; + goto out; } #endif err = dpm_suspend_start(PMSG_SUSPEND); if (err) { printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); - goto out; + goto out_thaw; } printk(KERN_DEBUG "suspending xenstore...\n"); @@ -104,31 +126,37 @@ static void do_suspend(void) err = dpm_suspend_noirq(PMSG_SUSPEND); if (err) { printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); - goto resume_devices; + goto out_resume; } - err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + if (xen_hvm_domain()) + err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); + else + err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + + dpm_resume_noirq(PMSG_RESUME); + if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); - goto out; + cancelled = 1; } +out_resume: if (!cancelled) { xen_arch_resume(); xs_resume(); } else xs_suspend_cancel(); - dpm_resume_noirq(PMSG_RESUME); - -resume_devices: dpm_resume_end(PMSG_RESUME); /* Make sure timer events get retriggered on all CPUs */ clock_was_set(); -out: + +out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); +out: #endif shutting_down = SHUTDOWN_INVALID; } @@ -183,6 +211,7 @@ static void shutdown_handler(struct xenbus_watch *watch, kfree(str); } +#ifdef CONFIG_MAGIC_SYSRQ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) { @@ -209,18 +238,19 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, goto again; if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); + handle_sysrq(sysrq_key); } -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - .callback = shutdown_handler -}; - static struct xenbus_watch sysrq_watch = { .node = "control/sysrq", .callback = sysrq_handler }; +#endif + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; static int setup_shutdown_watcher(void) { @@ -232,11 +262,13 @@ static int setup_shutdown_watcher(void) return err; } +#ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { printk(KERN_ERR "Failed to set sysrq watcher\n"); return err; } +#endif return 0; } @@ -249,7 +281,19 @@ static int shutdown_event(struct notifier_block *notifier, return NOTIFY_DONE; } -static int __init setup_shutdown_event(void) +static int __init __setup_shutdown_event(void) +{ + /* Delay initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return xen_setup_shutdown_event(); +} + +int xen_setup_shutdown_event(void) { static struct notifier_block xenstore_notifier = { .notifier_call = shutdown_event @@ -258,5 +302,6 @@ static int __init setup_shutdown_event(void) return 0; } +EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(setup_shutdown_event); +subsys_initcall(__setup_shutdown_event); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c new file mode 100644 index 0000000..cef4bafc0 --- /dev/null +++ b/drivers/xen/pci.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Weidong Han <weidong.han@intel.com> + */ + +#include <linux/pci.h> +#include <xen/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/xen.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include "../pci/pci.h" + +static int xen_add_device(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + +#ifdef CONFIG_PCI_IOV + if (pci_dev->is_virtfn) { + struct physdev_manage_pci_ext manage_pci_ext = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + .is_virtfn = 1, + .physfn.bus = pci_dev->physfn->bus->number, + .physfn.devfn = pci_dev->physfn->devfn, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else +#endif + if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { + struct physdev_manage_pci_ext manage_pci_ext = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + .is_extfn = 1, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else { + struct physdev_manage_pci manage_pci = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, + &manage_pci); + } + + return r; +} + +static int xen_remove_device(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + struct physdev_manage_pci manage_pci; + + manage_pci.bus = pci_dev->bus->number; + manage_pci.devfn = pci_dev->devfn; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, + &manage_pci); + + return r; +} + +static int xen_pci_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + int r = 0; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + r = xen_add_device(dev); + break; + case BUS_NOTIFY_DEL_DEVICE: + r = xen_remove_device(dev); + break; + default: + break; + } + + return r; +} + +struct notifier_block device_nb = { + .notifier_call = xen_pci_notifier, +}; + +static int __init register_xen_pci_notifier(void) +{ + if (!xen_initial_domain()) + return 0; + + return bus_register_notifier(&pci_bus_type, &device_nb); +} + +arch_initcall(register_xen_pci_notifier); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c new file mode 100644 index 0000000..afbe041 --- /dev/null +++ b/drivers/xen/platform-pci.c @@ -0,0 +1,200 @@ +/****************************************************************************** + * platform-pci.c + * + * Xen platform PCI device driver + * Copyright (c) 2005, Intel Corporation. + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + + +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <xen/platform_pci.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> + +#define DRV_NAME "xen-platform-pci" + +MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); +MODULE_DESCRIPTION("Xen platform PCI device"); +MODULE_LICENSE("GPL"); + +static unsigned long platform_mmio; +static unsigned long platform_mmio_alloc; +static unsigned long platform_mmiolen; +static uint64_t callback_via; + +unsigned long alloc_xen_mmio(unsigned long len) +{ + unsigned long addr; + + addr = platform_mmio + platform_mmio_alloc; + platform_mmio_alloc += len; + BUG_ON(platform_mmio_alloc > platform_mmiolen); + + return addr; +} + +static uint64_t get_callback_via(struct pci_dev *pdev) +{ + u8 pin; + int irq; + + irq = pdev->irq; + if (irq < 16) + return irq; /* ISA IRQ */ + + pin = pdev->pin; + + /* We don't know the GSI. Specify the PCI INTx line instead. */ + return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ + ((uint64_t)pci_domain_nr(pdev->bus) << 32) | + ((uint64_t)pdev->bus->number << 16) | + ((uint64_t)(pdev->devfn & 0xff) << 8) | + ((uint64_t)(pin - 1) & 3); +} + +static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) +{ + xen_hvm_evtchn_do_upcall(); + return IRQ_HANDLED; +} + +static int xen_allocate_irq(struct pci_dev *pdev) +{ + return request_irq(pdev->irq, do_hvm_evtchn_intr, + IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + "xen-platform-pci", pdev); +} + +static int platform_pci_resume(struct pci_dev *pdev) +{ + int err; + if (xen_have_vector_callback) + return 0; + err = xen_set_callback_via(callback_via); + if (err) { + dev_err(&pdev->dev, "platform_pci_resume failure!\n"); + return err; + } + return 0; +} + +static int __devinit platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int i, ret; + long ioaddr; + long mmio_addr, mmio_len; + unsigned int max_nr_gframes; + + i = pci_enable_device(pdev); + if (i) + return i; + + ioaddr = pci_resource_start(pdev, 0); + + mmio_addr = pci_resource_start(pdev, 1); + mmio_len = pci_resource_len(pdev, 1); + + if (mmio_addr == 0 || ioaddr == 0) { + dev_err(&pdev->dev, "no resources found\n"); + ret = -ENOENT; + goto pci_out; + } + + ret = pci_request_region(pdev, 1, DRV_NAME); + if (ret < 0) + goto pci_out; + + ret = pci_request_region(pdev, 0, DRV_NAME); + if (ret < 0) + goto mem_out; + + platform_mmio = mmio_addr; + platform_mmiolen = mmio_len; + + if (!xen_have_vector_callback) { + ret = xen_allocate_irq(pdev); + if (ret) { + dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); + goto out; + } + callback_via = get_callback_via(pdev); + ret = xen_set_callback_via(callback_via); + if (ret) { + dev_warn(&pdev->dev, "Unable to set the evtchn callback " + "err=%d\n", ret); + goto out; + } + } + + max_nr_gframes = gnttab_max_grant_frames(); + xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_init(); + if (ret) + goto out; + xenbus_probe(NULL); + ret = xen_setup_shutdown_event(); + if (ret) + goto out; + return 0; + +out: + pci_release_region(pdev, 0); +mem_out: + pci_release_region(pdev, 1); +pci_out: + pci_disable_device(pdev); + return ret; +} + +static struct pci_device_id platform_pci_tbl[] __devinitdata = { + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, platform_pci_tbl); + +static struct pci_driver platform_driver = { + .name = DRV_NAME, + .probe = platform_pci_init, + .id_table = platform_pci_tbl, +#ifdef CONFIG_PM + .resume_early = platform_pci_resume, +#endif +}; + +static int __init platform_pci_module_init(void) +{ + /* no unplug has been done, IGNORE hasn't been specified: just + * return now */ + if (!xen_platform_pci_unplug) + return -ENODEV; + + return pci_register_driver(&platform_driver); +} + +module_init(platform_pci_module_init); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c new file mode 100644 index 0000000..54469c3 --- /dev/null +++ b/drivers/xen/swiotlb-xen.c @@ -0,0 +1,515 @@ +/* + * Copyright 2010 + * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code provides a IOMMU for Xen PV guests with PCI passthrough. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * PV guests under Xen are running in an non-contiguous memory architecture. + * + * When PCI pass-through is utilized, this necessitates an IOMMU for + * translating bus (DMA) to virtual and vice-versa and also providing a + * mechanism to have contiguous pages for device drivers operations (say DMA + * operations). + * + * Specifically, under Xen the Linux idea of pages is an illusion. It + * assumes that pages start at zero and go up to the available memory. To + * help with that, the Linux Xen MMU provides a lookup mechanism to + * translate the page frame numbers (PFN) to machine frame numbers (MFN) + * and vice-versa. The MFN are the "real" frame numbers. Furthermore + * memory is not contiguous. Xen hypervisor stitches memory for guests + * from different pools, which means there is no guarantee that PFN==MFN + * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are + * allocated in descending order (high to low), meaning the guest might + * never get any MFN's under the 4GB mark. + * + */ + +#include <linux/bootmem.h> +#include <linux/dma-mapping.h> +#include <xen/swiotlb-xen.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ + +static char *xen_io_tlb_start, *xen_io_tlb_end; +static unsigned long xen_io_tlb_nslabs; +/* + * Quick lookup value of the bus address of the IOTLB. + */ + +u64 start_dma_addr; + +static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +{ + return phys_to_machine(XPADDR(paddr)).maddr;; +} + +static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +{ + return machine_to_phys(XMADDR(baddr)).paddr; +} + +static dma_addr_t xen_virt_to_bus(void *address) +{ + return xen_phys_to_bus(virt_to_phys(address)); +} + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +static int range_straddles_page_boundary(phys_addr_t p, size_t size) +{ + unsigned long pfn = PFN_DOWN(p); + unsigned int offset = p & ~PAGE_MASK; + + if (offset + size <= PAGE_SIZE) + return 0; + if (check_pages_physically_contiguous(pfn, offset, size)) + return 0; + return 1; +} + +static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) +{ + unsigned long mfn = PFN_DOWN(dma_addr); + unsigned long pfn = mfn_to_local_pfn(mfn); + phys_addr_t paddr; + + /* If the address is outside our domain, it CAN + * have the same virtual address as another address + * in our domain. Therefore _only_ check address within our domain. + */ + if (pfn_valid(pfn)) { + paddr = PFN_PHYS(pfn); + return paddr >= virt_to_phys(xen_io_tlb_start) && + paddr < virt_to_phys(xen_io_tlb_end); + } + return 0; +} + +static int max_dma_bits = 32; + +static int +xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) +{ + int i, rc; + int dma_bits; + + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + + i = 0; + do { + int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); + + do { + rc = xen_create_contiguous_region( + (unsigned long)buf + (i << IO_TLB_SHIFT), + get_order(slabs << IO_TLB_SHIFT), + dma_bits); + } while (rc && dma_bits++ < max_dma_bits); + if (rc) + return rc; + + i += slabs; + } while (i < nslabs); + return 0; +} + +void __init xen_swiotlb_init(int verbose) +{ + unsigned long bytes; + int rc; + + xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); + xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from any location. + */ + xen_io_tlb_start = alloc_bootmem(bytes); + if (!xen_io_tlb_start) + panic("Cannot allocate SWIOTLB buffer"); + + xen_io_tlb_end = xen_io_tlb_start + bytes; + /* + * And replace that memory with pages under 4GB. + */ + rc = xen_swiotlb_fixup(xen_io_tlb_start, + bytes, + xen_io_tlb_nslabs); + if (rc) + goto error; + + start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); + swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + + return; +error: + panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\ + "We either don't have the permission or you do not have enough"\ + "free memory under 4GB!\n", rc); +} + +void * +xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +{ + void *ret; + int order = get_order(size); + u64 dma_mask = DMA_BIT_MASK(32); + unsigned long vstart; + + /* + * Ignore region specifiers - the kernel's ideas of + * pseudo-phys memory layout has nothing to do with the + * machine physical layout. We can't allocate highmem + * because we can't return a pointer to it. + */ + flags &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) + return ret; + + vstart = __get_free_pages(flags, order); + ret = (void *)vstart; + + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = dma_alloc_coherent_mask(hwdev, flags); + + if (ret) { + if (xen_create_contiguous_region(vstart, order, + fls64(dma_mask)) != 0) { + free_pages(vstart, order); + return NULL; + } + memset(ret, 0, size); + *dma_handle = virt_to_machine(ret).maddr; + } + return ret; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); + +void +xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dev_addr) +{ + int order = get_order(size); + + if (dma_release_from_coherent(hwdev, order, vaddr)) + return; + + xen_destroy_contiguous_region((unsigned long)vaddr, order); + free_pages((unsigned long)vaddr, order); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); + + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. + */ +dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = xen_phys_to_bus(phys); + void *map; + + BUG_ON(dir == DMA_NONE); + /* + * If the address happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (dma_capable(dev, dev_addr, size) && + !range_straddles_page_boundary(phys, size) && !swiotlb_force) + return dev_addr; + + /* + * Oh well, have to allocate and map a bounce buffer. + */ + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); + if (!map) + return DMA_ERROR_CODE; + + dev_addr = xen_virt_to_bus(map); + + /* + * Ensure that the address returned is DMA'ble + */ + if (!dma_capable(dev, dev_addr, size)) + panic("map_single: bounce buffer is not DMA'ble"); + + return dev_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous xen_swiotlb_map_page call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = xen_bus_to_phys(dev_addr); + + BUG_ON(dir == DMA_NONE); + + /* NOTE: We use dev_addr here, not paddr! */ + if (is_xen_swiotlb_buffer(dev_addr)) { + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + /* + * phys_to_virt doesn't work with hihgmem page but we could + * call dma_mark_clean() with hihgmem page here. However, we + * are fine since dma_mark_clean() is null on POWERPC. We can + * make dma_mark_clean() take a physical address if necessary. + */ + dma_mark_clean(phys_to_virt(paddr), size); +} + +void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + xen_unmap_single(hwdev, dev_addr, size, dir); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. At the next point you give the dma + * address back to the card, you must first perform a + * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static void +xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + phys_addr_t paddr = xen_bus_to_phys(dev_addr); + + BUG_ON(dir == DMA_NONE); + + /* NOTE: We use dev_addr here, not paddr! */ + if (is_xen_swiotlb_buffer(dev_addr)) { + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(phys_to_virt(paddr), size); +} + +void +xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu); + +void +xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device); + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above xen_swiotlb_map_page + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for xen_swiotlb_map_page are the + * same here. + */ +int +xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = xen_phys_to_bus(paddr); + + if (swiotlb_force || + !dma_capable(hwdev, dev_addr, sg->length) || + range_straddles_page_boundary(paddr, sg->length)) { + void *map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sgl[0].dma_length = 0; + return DMA_ERROR_CODE; + } + sg->dma_address = xen_virt_to_bus(map); + } else + sg->dma_address = dev_addr; + sg->dma_length = sg->length; + } + return nelems; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); + +int +xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ + return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); + +void +xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ + return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static void +xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + xen_swiotlb_sync_single(hwdev, sg->dma_address, + sg->dma_length, dir, target); +} + +void +xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu); + +void +xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device); + +int +xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return !dma_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error); + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 88a60e0..60f1827 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -7,6 +7,7 @@ * published by the Free Software Foundation. */ +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> @@ -14,6 +15,7 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> @@ -425,7 +427,7 @@ static ssize_t hyp_sysfs_store(struct kobject *kobj, return 0; } -static struct sysfs_ops hyp_sysfs_ops = { +static const struct sysfs_ops hyp_sysfs_ops = { .show = hyp_sysfs_show, .store = hyp_sysfs_store, }; diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile index 5571f5b..8dca685 100644 --- a/drivers/xen/xenbus/Makefile +++ b/drivers/xen/xenbus/Makefile @@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o xenbus-objs += xenbus_comms.o xenbus-objs += xenbus_xs.o xenbus-objs += xenbus_probe.o + +xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o +xenbus-objs += $(xenbus-be-objs-y) + +obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 92a1ef8..cdacf92 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,6 +30,7 @@ * IN THE SOFTWARE. */ +#include <linux/slab.h> #include <linux/types.h> #include <linux/vmalloc.h> #include <asm/xen/hypervisor.h> @@ -49,6 +50,8 @@ const char *xenbus_strstate(enum xenbus_state state) [ XenbusStateConnected ] = "Connected", [ XenbusStateClosing ] = "Closing", [ XenbusStateClosed ] = "Closed", + [XenbusStateReconfiguring] = "Reconfiguring", + [XenbusStateReconfigured] = "Reconfigured", }; return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; } @@ -132,17 +135,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, } EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); +static void xenbus_switch_fatal(struct xenbus_device *, int, int, + const char *, ...); -/** - * xenbus_switch_state - * @dev: xenbus device - * @state: new state - * - * Advertise in the store a change of the given driver to the given new_state. - * Return 0 on success, or -errno on error. On error, the device will switch - * to XenbusStateClosing, and the error will be saved in the store. - */ -int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +static int +__xenbus_switch_state(struct xenbus_device *dev, + enum xenbus_state state, int depth) { /* We check whether the state is currently set to the given value, and if not, then the state is set. We don't want to unconditionally @@ -151,35 +149,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) to it, as the device will be tearing down, and we don't want to resurrect that directory. - Note that, because of this cached value of our state, this function - will not work inside a Xenstore transaction (something it was - trying to in the past) because dev->state would not get reset if - the transaction was aborted. - + Note that, because of this cached value of our state, this + function will not take a caller's Xenstore transaction + (something it was trying to in the past) because dev->state + would not get reset if the transaction was aborted. */ + struct xenbus_transaction xbt; int current_state; - int err; + int err, abort; if (state == dev->state) return 0; - err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d", - ¤t_state); - if (err != 1) +again: + abort = 1; + + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_switch_fatal(dev, depth, err, "starting transaction"); return 0; + } + + err = xenbus_scanf(xbt, dev->nodename, "state", "%d", ¤t_state); + if (err != 1) + goto abort; - err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state); + err = xenbus_printf(xbt, dev->nodename, "state", "%d", state); if (err) { - if (state != XenbusStateClosing) /* Avoid looping */ - xenbus_dev_fatal(dev, err, "writing new state"); - return err; + xenbus_switch_fatal(dev, depth, err, "writing new state"); + goto abort; } - dev->state = state; + abort = 0; +abort: + err = xenbus_transaction_end(xbt, abort); + if (err) { + if (err == -EAGAIN && !abort) + goto again; + xenbus_switch_fatal(dev, depth, err, "ending transaction"); + } else + dev->state = state; return 0; } + +/** + * xenbus_switch_state + * @dev: xenbus device + * @state: new state + * + * Advertise in the store a change of the given driver to the given new_state. + * Return 0 on success, or -errno on error. On error, the device will switch + * to XenbusStateClosing, and the error will be saved in the store. + */ +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +{ + return __xenbus_switch_state(dev, state, 0); +} + EXPORT_SYMBOL_GPL(xenbus_switch_state); int xenbus_frontend_closed(struct xenbus_device *dev) @@ -283,6 +311,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) EXPORT_SYMBOL_GPL(xenbus_dev_fatal); /** + * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps + * avoiding recursion within xenbus_switch_state. + */ +static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + xenbus_va_dev_error(dev, err, fmt, ap); + va_end(ap); + + if (!depth) + __xenbus_switch_state(dev, XenbusStateClosing, 1); +} + +/** * xenbus_grant_ring * @dev: xenbus device * @ring_mfn: mfn of ring to grant diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d42e25d..baa65e7 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -45,35 +45,33 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/io.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/xen/hypervisor.h> + +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/page.h> +#include <xen/hvm.h> + #include "xenbus_comms.h" #include "xenbus_probe.h" int xen_store_evtchn; -EXPORT_SYMBOL(xen_store_evtchn); +EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; +EXPORT_SYMBOL_GPL(xen_store_interface); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); -static void wait_for_devices(struct xenbus_driver *xendrv); - -static int xenbus_probe_frontend(const char *type, const char *name); - -static void xenbus_dev_shutdown(struct device *_dev); - -static int xenbus_dev_suspend(struct device *dev, pm_message_t state); -static int xenbus_dev_resume(struct device *dev); - /* If something in array of ids matches this device, return it. */ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -94,34 +92,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv) return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; } - -static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env) -{ - struct xenbus_device *dev = to_xenbus_device(_dev); - - if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) - return -ENOMEM; - - return 0; -} - -/* device/<type>/<id> => <type>-<id> */ -static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) -{ - nodename = strchr(nodename, '/'); - if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { - printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); - return -EINVAL; - } - - strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); - if (!strchr(bus_id, '/')) { - printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); - return -EINVAL; - } - *strchr(bus_id, '/') = '-'; - return 0; -} +EXPORT_SYMBOL_GPL(xenbus_match); static void free_otherend_details(struct xenbus_device *dev) @@ -141,7 +112,30 @@ static void free_otherend_watch(struct xenbus_device *dev) } -int read_otherend_details(struct xenbus_device *xendev, +static int talk_to_otherend(struct xenbus_device *dev) +{ + struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); + + free_otherend_watch(dev); + free_otherend_details(dev); + + return drv->read_otherend_details(dev); +} + + + +static int watch_otherend(struct xenbus_device *dev) +{ + struct xen_bus_type *bus = + container_of(dev->dev.bus, struct xen_bus_type, bus); + + return xenbus_watch_pathfmt(dev, &dev->otherend_watch, + bus->otherend_changed, + "%s/%s", dev->otherend, "state"); +} + + +int xenbus_read_otherend_details(struct xenbus_device *xendev, char *id_node, char *path_node) { int err = xenbus_gather(XBT_NIL, xendev->nodename, @@ -166,39 +160,11 @@ int read_otherend_details(struct xenbus_device *xendev, return 0; } +EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); - -static int read_backend_details(struct xenbus_device *xendev) -{ - return read_otherend_details(xendev, "backend-id", "backend"); -} - -static struct device_attribute xenbus_dev_attrs[] = { - __ATTR_NULL -}; - -/* Bus type for frontend drivers. */ -static struct xen_bus_type xenbus_frontend = { - .root = "device", - .levels = 2, /* device/type/<id> */ - .get_bus_id = frontend_bus_id, - .probe = xenbus_probe_frontend, - .bus = { - .name = "xen", - .match = xenbus_match, - .uevent = xenbus_uevent, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, - - .suspend = xenbus_dev_suspend, - .resume = xenbus_dev_resume, - }, -}; - -static void otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) +void xenbus_otherend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len, + int ignore_on_shutdown) { struct xenbus_device *dev = container_of(watch, struct xenbus_device, otherend_watch); @@ -226,11 +192,7 @@ static void otherend_changed(struct xenbus_watch *watch, * work that can fail e.g., when the rootfs is gone. */ if (system_state > SYSTEM_RUNNING) { - struct xen_bus_type *bus = bus; - bus = container_of(dev->dev.bus, struct xen_bus_type, bus); - /* If we're frontend, drive the state machine to Closed. */ - /* This should cause the backend to release our resources. */ - if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) + if (ignore_on_shutdown && (state == XenbusStateClosing)) xenbus_frontend_closed(dev); return; } @@ -238,25 +200,7 @@ static void otherend_changed(struct xenbus_watch *watch, if (drv->otherend_changed) drv->otherend_changed(dev, state); } - - -static int talk_to_otherend(struct xenbus_device *dev) -{ - struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); - - free_otherend_watch(dev); - free_otherend_details(dev); - - return drv->read_otherend_details(dev); -} - - -static int watch_otherend(struct xenbus_device *dev) -{ - return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed, - "%s/%s", dev->otherend, "state"); -} - +EXPORT_SYMBOL_GPL(xenbus_otherend_changed); int xenbus_dev_probe(struct device *_dev) { @@ -300,8 +244,9 @@ int xenbus_dev_probe(struct device *_dev) fail: xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); xenbus_switch_state(dev, XenbusStateClosed); - return -ENODEV; + return err; } +EXPORT_SYMBOL_GPL(xenbus_dev_probe); int xenbus_dev_remove(struct device *_dev) { @@ -319,8 +264,9 @@ int xenbus_dev_remove(struct device *_dev) xenbus_switch_state(dev, XenbusStateClosed); return 0; } +EXPORT_SYMBOL_GPL(xenbus_dev_remove); -static void xenbus_dev_shutdown(struct device *_dev) +void xenbus_dev_shutdown(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); unsigned long timeout = 5*HZ; @@ -341,6 +287,7 @@ static void xenbus_dev_shutdown(struct device *_dev) out: put_device(&dev->dev); } +EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); int xenbus_register_driver_common(struct xenbus_driver *drv, struct xen_bus_type *bus, @@ -354,25 +301,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv, return driver_register(&drv->driver); } - -int __xenbus_register_frontend(struct xenbus_driver *drv, - struct module *owner, const char *mod_name) -{ - int ret; - - drv->read_otherend_details = read_backend_details; - - ret = xenbus_register_driver_common(drv, &xenbus_frontend, - owner, mod_name); - if (ret) - return ret; - - /* If this driver is loaded as a module wait for devices to attach. */ - wait_for_devices(drv); - - return 0; -} -EXPORT_SYMBOL_GPL(__xenbus_register_frontend); +EXPORT_SYMBOL_GPL(xenbus_register_driver_common); void xenbus_unregister_driver(struct xenbus_driver *drv) { @@ -454,21 +383,21 @@ static ssize_t xendev_show_nodename(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } -DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); +static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); static ssize_t xendev_show_devtype(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } -DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); +static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); static ssize_t xendev_show_modalias(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype); } -DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); +static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -543,24 +472,7 @@ fail: kfree(xendev); return err; } - -/* device/<typename>/<name> */ -static int xenbus_probe_frontend(const char *type, const char *name) -{ - char *nodename; - int err; - - nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", - xenbus_frontend.root, type, name); - if (!nodename) - return -ENOMEM; - - DPRINTK("%s", nodename); - - err = xenbus_probe_node(&xenbus_frontend, type, nodename); - kfree(nodename); - return err; -} +EXPORT_SYMBOL_GPL(xenbus_probe_node); static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) { @@ -574,10 +486,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) return PTR_ERR(dir); for (i = 0; i < dir_n; i++) { - err = bus->probe(type, dir[i]); + err = bus->probe(bus, type, dir[i]); if (err) break; } + kfree(dir); return err; } @@ -597,9 +510,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus) if (err) break; } + kfree(dir); return err; } +EXPORT_SYMBOL_GPL(xenbus_probe_devices); static unsigned int char_count(const char *str, char c) { @@ -662,32 +577,18 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) } EXPORT_SYMBOL_GPL(xenbus_dev_changed); -static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - DPRINTK(""); - - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); -} - -/* We watch for devices appearing and vanishing. */ -static struct xenbus_watch fe_watch = { - .node = "device", - .callback = frontend_changed, -}; - -static int xenbus_dev_suspend(struct device *dev, pm_message_t state) +int xenbus_dev_suspend(struct device *dev, pm_message_t state) { int err = 0; struct xenbus_driver *drv; - struct xenbus_device *xdev; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); - DPRINTK(""); + DPRINTK("%s", xdev->nodename); if (dev->driver == NULL) return 0; drv = to_xenbus_driver(dev->driver); - xdev = container_of(dev, struct xenbus_device, dev); if (drv->suspend) err = drv->suspend(xdev, state); if (err) @@ -695,21 +596,20 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state) "xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } +EXPORT_SYMBOL_GPL(xenbus_dev_suspend); -static int xenbus_dev_resume(struct device *dev) +int xenbus_dev_resume(struct device *dev) { int err; struct xenbus_driver *drv; - struct xenbus_device *xdev; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); - DPRINTK(""); + DPRINTK("%s", xdev->nodename); if (dev->driver == NULL) return 0; - drv = to_xenbus_driver(dev->driver); - xdev = container_of(dev, struct xenbus_device, dev); - err = talk_to_otherend(xdev); if (err) { printk(KERN_WARNING @@ -740,6 +640,7 @@ static int xenbus_dev_resume(struct device *dev) return 0; } +EXPORT_SYMBOL_GPL(xenbus_dev_resume); /* A flag to determine if xenstored is 'ready' (i.e. has started) */ int xenstored_ready = 0; @@ -766,59 +667,95 @@ EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); void xenbus_probe(struct work_struct *unused) { - BUG_ON((xenstored_ready <= 0)); - - /* Enumerate devices in xenstore and watch for changes. */ - xenbus_probe_devices(&xenbus_frontend); - register_xenbus_watch(&fe_watch); - xenbus_backend_probe_and_watch(); + xenstored_ready = 1; /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } +EXPORT_SYMBOL_GPL(xenbus_probe); -static int __init xenbus_probe_init(void) +static int __init xenbus_probe_initcall(void) +{ + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain() || xen_hvm_domain()) + return 0; + + xenbus_probe(NULL); + return 0; +} + +device_initcall(xenbus_probe_initcall); + +static int __init xenbus_init(void) { int err = 0; + unsigned long page = 0; DPRINTK(""); err = -ENODEV; if (!xen_domain()) - goto out_error; - - /* Register ourselves with the kernel bus subsystem */ - err = bus_register(&xenbus_frontend.bus); - if (err) - goto out_error; - - err = xenbus_backend_bus_register(); - if (err) - goto out_unreg_front; + return err; /* * Domain0 doesn't have a store_evtchn or store_mfn yet. */ if (xen_initial_domain()) { - /* dom0 not yet supported */ + struct evtchn_alloc_unbound alloc_unbound; + + /* Allocate Xenstore page */ + page = get_zeroed_page(GFP_KERNEL); + if (!page) + goto out_error; + + xen_store_mfn = xen_start_info->store_mfn = + pfn_to_mfn(virt_to_phys((void *)page) >> + PAGE_SHIFT); + + /* Next allocate a local port which xenstored can bind to */ + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = 0; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (err == -ENOSYS) + goto out_error; + + BUG_ON(err); + xen_store_evtchn = xen_start_info->store_evtchn = + alloc_unbound.port; + + xen_store_interface = mfn_to_virt(xen_store_mfn); } else { - xenstored_ready = 1; - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; + if (xen_hvm_domain()) { + uint64_t v = 0; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err) + goto out_error; + xen_store_mfn = (unsigned long)v; + xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + } else { + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + xenstored_ready = 1; + } } - xen_store_interface = mfn_to_virt(xen_store_mfn); /* Initialize the interface to xenstore. */ err = xs_init(); if (err) { printk(KERN_WARNING "XENBUS: Error initializing xenstore comms: %i\n", err); - goto out_unreg_back; + goto out_error; } - if (!xen_initial_domain()) - xenbus_probe(NULL); - #ifdef CONFIG_XEN_COMPAT_XENFS /* * Create xenfs mountpoint in /proc for compatibility with @@ -829,112 +766,13 @@ static int __init xenbus_probe_init(void) return 0; - out_unreg_back: - xenbus_backend_bus_unregister(); - - out_unreg_front: - bus_unregister(&xenbus_frontend.bus); - out_error: + if (page != 0) + free_page(page); + return err; } -postcore_initcall(xenbus_probe_init); +postcore_initcall(xenbus_init); MODULE_LICENSE("GPL"); - -static int is_disconnected_device(struct device *dev, void *data) -{ - struct xenbus_device *xendev = to_xenbus_device(dev); - struct device_driver *drv = data; - struct xenbus_driver *xendrv; - - /* - * A device with no driver will never connect. We care only about - * devices which should currently be in the process of connecting. - */ - if (!dev->driver) - return 0; - - /* Is this search limited to a particular driver? */ - if (drv && (dev->driver != drv)) - return 0; - - xendrv = to_xenbus_driver(dev->driver); - return (xendev->state != XenbusStateConnected || - (xendrv->is_ready && !xendrv->is_ready(xendev))); -} - -static int exists_disconnected_device(struct device_driver *drv) -{ - return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, - is_disconnected_device); -} - -static int print_device_status(struct device *dev, void *data) -{ - struct xenbus_device *xendev = to_xenbus_device(dev); - struct device_driver *drv = data; - - /* Is this operation limited to a particular driver? */ - if (drv && (dev->driver != drv)) - return 0; - - if (!dev->driver) { - /* Information only: is this too noisy? */ - printk(KERN_INFO "XENBUS: Device with no driver: %s\n", - xendev->nodename); - } else if (xendev->state != XenbusStateConnected) { - printk(KERN_WARNING "XENBUS: Timeout connecting " - "to device: %s (state %d)\n", - xendev->nodename, xendev->state); - } - - return 0; -} - -/* We only wait for device setup after most initcalls have run. */ -static int ready_to_wait_for_devices; - -/* - * On a 10 second timeout, wait for all devices currently configured. We need - * to do this to guarantee that the filesystems and / or network devices - * needed for boot are available, before we can allow the boot to proceed. - * - * This needs to be on a late_initcall, to happen after the frontend device - * drivers have been initialised, but before the root fs is mounted. - * - * A possible improvement here would be to have the tools add a per-device - * flag to the store entry, indicating whether it is needed at boot time. - * This would allow people who knew what they were doing to accelerate their - * boot slightly, but of course needs tools or manual intervention to set up - * those flags correctly. - */ -static void wait_for_devices(struct xenbus_driver *xendrv) -{ - unsigned long timeout = jiffies + 10*HZ; - struct device_driver *drv = xendrv ? &xendrv->driver : NULL; - - if (!ready_to_wait_for_devices || !xen_domain()) - return; - - while (exists_disconnected_device(drv)) { - if (time_after(jiffies, timeout)) - break; - schedule_timeout_interruptible(HZ/10); - } - - bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, - print_device_status); -} - -#ifndef MODULE -static int __init boot_wait_for_devices(void) -{ - ready_to_wait_for_devices = 1; - wait_for_devices(NULL); - return 0; -} - -late_initcall(boot_wait_for_devices); -#endif diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 6c5e318..2466581 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -36,26 +36,15 @@ #define XEN_BUS_ID_SIZE 20 -#ifdef CONFIG_XEN_BACKEND -extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); -extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); -extern void xenbus_backend_probe_and_watch(void); -extern int xenbus_backend_bus_register(void); -extern void xenbus_backend_bus_unregister(void); -#else -static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} -static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} -static inline void xenbus_backend_probe_and_watch(void) {} -static inline int xenbus_backend_bus_register(void) { return 0; } -static inline void xenbus_backend_bus_unregister(void) {} -#endif - struct xen_bus_type { char *root; unsigned int levels; int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); - int (*probe)(const char *type, const char *dir); + int (*probe)(struct xen_bus_type *bus, const char *type, + const char *dir); + void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, + unsigned int len); struct bus_type bus; }; @@ -73,4 +62,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus); extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); +extern void xenbus_dev_shutdown(struct device *_dev); + +extern int xenbus_dev_suspend(struct device *dev, pm_message_t state); +extern int xenbus_dev_resume(struct device *dev); + +extern void xenbus_otherend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len, + int ignore_on_shutdown); + +extern int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + #endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c new file mode 100644 index 0000000..6cf467b --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -0,0 +1,276 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * Copyright (C) 2007 Solarflare Communications, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define DPRINTK(fmt, args...) \ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __func__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/features.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if (strlen(frontend) == 0) + err = -ERANGE; + if (!err && !xenbus_exists(XBT_NIL, frontend, "")) + err = -ENOENT; + kfree(frontend); + + if (err) + return err; + + if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s", + typelen, type, domid, devid) >= XEN_BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static int xenbus_uevent_backend(struct device *dev, + struct kobj_uevent_env *env) +{ + struct xenbus_device *xdev; + struct xenbus_driver *drv; + struct xen_bus_type *bus; + + DPRINTK(""); + + if (dev == NULL) + return -ENODEV; + + xdev = to_xenbus_device(dev); + bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); + if (xdev == NULL) + return -ENODEV; + + /* stuff we want to pass to /sbin/hotplug */ + if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) + return -ENOMEM; + + if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename)) + return -ENOMEM; + + if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root)) + return -ENOMEM; + + if (dev->driver) { + drv = to_xenbus_driver(dev->driver); + if (drv && drv->uevent) + return drv->uevent(xdev, env); + } + + return 0; +} + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(struct xen_bus_type *bus, + const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s\n", nodename); + + err = xenbus_probe_node(bus, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, + const char *domid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + DPRINTK(""); + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static void frontend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + xenbus_otherend_changed(watch, vec, len, 0); +} + +static struct device_attribute xenbus_backend_dev_attrs[] = { + __ATTR_NULL +}; + +static struct xen_bus_type xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .otherend_changed = frontend_changed, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + .uevent = xenbus_uevent_backend, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .dev_attrs = xenbus_backend_dev_attrs, + }, +}; + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +static int read_frontend_details(struct xenbus_device *xendev) +{ + return xenbus_read_otherend_details(xendev, "frontend-id", "frontend"); +} + +int xenbus_dev_is_online(struct xenbus_device *dev) +{ + int rc, val; + + rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); + if (rc != 1) + val = 0; /* no online node present */ + + return val; +} +EXPORT_SYMBOL_GPL(xenbus_dev_is_online); + +int __xenbus_register_backend(struct xenbus_driver *drv, + struct module *owner, const char *mod_name) +{ + drv->read_otherend_details = read_frontend_details; + + return xenbus_register_driver_common(drv, &xenbus_backend, + owner, mod_name); +} +EXPORT_SYMBOL_GPL(__xenbus_register_backend); + +static int backend_probe_and_watch(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + /* Enumerate devices in xenstore and watch for changes. */ + xenbus_probe_devices(&xenbus_backend); + register_xenbus_watch(&be_watch); + + return NOTIFY_DONE; +} + +static int __init xenbus_probe_backend_init(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = backend_probe_and_watch + }; + int err; + + DPRINTK(""); + + /* Register ourselves with the kernel bus subsystem */ + err = bus_register(&xenbus_backend.bus); + if (err) + return err; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(xenbus_probe_backend_init); diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c new file mode 100644 index 0000000..5bcc2d6 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -0,0 +1,294 @@ +#define DPRINTK(fmt, args...) \ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __func__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/io.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/page.h> + +#include <xen/platform_pci.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + + +/* device/<type>/<id> => <type>-<id> */ +static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +{ + nodename = strchr(nodename, '/'); + if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { + printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); + return -EINVAL; + } + + strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); + if (!strchr(bus_id, '/')) { + printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); + return -EINVAL; + } + *strchr(bus_id, '/') = '-'; + return 0; +} + +/* device/<typename>/<name> */ +static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s", nodename); + + err = xenbus_probe_node(bus, type, nodename); + kfree(nodename); + return err; +} + +static int xenbus_uevent_frontend(struct device *_dev, + struct kobj_uevent_env *env) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + + if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) + return -ENOMEM; + + return 0; +} + + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + xenbus_otherend_changed(watch, vec, len, 1); +} + +static struct device_attribute xenbus_frontend_dev_attrs[] = { + __ATTR_NULL +}; + +static struct xen_bus_type xenbus_frontend = { + .root = "device", + .levels = 2, /* device/type/<id> */ + .get_bus_id = frontend_bus_id, + .probe = xenbus_probe_frontend, + .otherend_changed = backend_changed, + .bus = { + .name = "xen", + .match = xenbus_match, + .uevent = xenbus_uevent_frontend, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .dev_attrs = xenbus_frontend_dev_attrs, + + .suspend = xenbus_dev_suspend, + .resume = xenbus_dev_resume, + }, +}; + +static void frontend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); +} + + +/* We watch for devices appearing and vanishing. */ +static struct xenbus_watch fe_watch = { + .node = "device", + .callback = frontend_changed, +}; + +static int read_backend_details(struct xenbus_device *xendev) +{ + return xenbus_read_otherend_details(xendev, "backend-id", "backend"); +} + +static int is_device_connecting(struct device *dev, void *data) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct device_driver *drv = data; + struct xenbus_driver *xendrv; + + /* + * A device with no driver will never connect. We care only about + * devices which should currently be in the process of connecting. + */ + if (!dev->driver) + return 0; + + /* Is this search limited to a particular driver? */ + if (drv && (dev->driver != drv)) + return 0; + + xendrv = to_xenbus_driver(dev->driver); + return (xendev->state < XenbusStateConnected || + (xendev->state == XenbusStateConnected && + xendrv->is_ready && !xendrv->is_ready(xendev))); +} + +static int exists_connecting_device(struct device_driver *drv) +{ + return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + is_device_connecting); +} + +static int print_device_status(struct device *dev, void *data) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct device_driver *drv = data; + + /* Is this operation limited to a particular driver? */ + if (drv && (dev->driver != drv)) + return 0; + + if (!dev->driver) { + /* Information only: is this too noisy? */ + printk(KERN_INFO "XENBUS: Device with no driver: %s\n", + xendev->nodename); + } else if (xendev->state < XenbusStateConnected) { + enum xenbus_state rstate = XenbusStateUnknown; + if (xendev->otherend) + rstate = xenbus_read_driver_state(xendev->otherend); + printk(KERN_WARNING "XENBUS: Timeout connecting " + "to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); + } + + return 0; +} + +/* We only wait for device setup after most initcalls have run. */ +static int ready_to_wait_for_devices; + +/* + * On a 5-minute timeout, wait for all devices currently configured. We need + * to do this to guarantee that the filesystems and / or network devices + * needed for boot are available, before we can allow the boot to proceed. + * + * This needs to be on a late_initcall, to happen after the frontend device + * drivers have been initialised, but before the root fs is mounted. + * + * A possible improvement here would be to have the tools add a per-device + * flag to the store entry, indicating whether it is needed at boot time. + * This would allow people who knew what they were doing to accelerate their + * boot slightly, but of course needs tools or manual intervention to set up + * those flags correctly. + */ +static void wait_for_devices(struct xenbus_driver *xendrv) +{ + unsigned long start = jiffies; + struct device_driver *drv = xendrv ? &xendrv->driver : NULL; + unsigned int seconds_waited = 0; + + if (!ready_to_wait_for_devices || !xen_domain()) + return; + + while (exists_connecting_device(drv)) { + if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { + if (!seconds_waited) + printk(KERN_WARNING "XENBUS: Waiting for " + "devices to initialise: "); + seconds_waited += 5; + printk("%us...", 300 - seconds_waited); + if (seconds_waited == 300) + break; + } + + schedule_timeout_interruptible(HZ/10); + } + + if (seconds_waited) + printk("\n"); + + bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + print_device_status); +} + +int __xenbus_register_frontend(struct xenbus_driver *drv, + struct module *owner, const char *mod_name) +{ + int ret; + + drv->read_otherend_details = read_backend_details; + + ret = xenbus_register_driver_common(drv, &xenbus_frontend, + owner, mod_name); + if (ret) + return ret; + + /* If this driver is loaded as a module wait for devices to attach. */ + wait_for_devices(drv); + + return 0; +} +EXPORT_SYMBOL_GPL(__xenbus_register_frontend); + +static int frontend_probe_and_watch(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + /* Enumerate devices in xenstore and watch for changes. */ + xenbus_probe_devices(&xenbus_frontend); + register_xenbus_watch(&fe_watch); + + return NOTIFY_DONE; +} + + +static int __init xenbus_probe_frontend_init(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = frontend_probe_and_watch + }; + int err; + + DPRINTK(""); + + /* Register ourselves with the kernel bus subsystem */ + err = bus_register(&xenbus_frontend.bus); + if (err) + return err; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(xenbus_probe_frontend_init); + +#ifndef MODULE +static int __init boot_wait_for_devices(void) +{ + if (xen_hvm_domain() && !xen_platform_pci_unplug) + return -ENODEV; + + ready_to_wait_for_devices = 1; + wait_for_devices(NULL); + return 0; +} + +late_initcall(boot_wait_for_devices); +#endif + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 6f91e8c..5534690 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -534,7 +534,7 @@ int xenbus_printf(struct xenbus_transaction t, #define PRINTF_BUFFER_SIZE 4096 char *printf_buffer; - printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH); if (printf_buffer == NULL) return -ENOMEM; diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c index a240b2c..b91f8ff 100644 --- a/drivers/xen/xencomm.c +++ b/drivers/xen/xencomm.c @@ -18,8 +18,8 @@ * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ -#include <linux/gfp.h> #include <linux/mm.h> +#include <linux/slab.h> #include <asm/page.h> #include <xen/xencomm.h> #include <xen/interface/xen.h> diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 25275c3..4fde944 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_XENFS) += xenfs.o -xenfs-objs = super.o xenbus.o
\ No newline at end of file +xenfs-y = super.o xenbus.o privcmd.o +xenfs-$(CONFIG_XEN_DOM0) += xenstored.o diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c new file mode 100644 index 0000000..dbd3b16 --- /dev/null +++ b/drivers/xen/xenfs/privcmd.c @@ -0,0 +1,400 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. + * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/uaccess.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/privcmd.h> +#include <xen/interface/xen.h> +#include <xen/features.h> +#include <xen/page.h> +#include <xen/xen-ops.h> + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); +#endif + +static long privcmd_ioctl_hypercall(void __user *udata) +{ + struct privcmd_hypercall hypercall; + long ret; + + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) + return -EFAULT; + + ret = privcmd_call(hypercall.op, + hypercall.arg[0], hypercall.arg[1], + hypercall.arg[2], hypercall.arg[3], + hypercall.arg[4]); + + return ret; +} + +static void free_page_list(struct list_head *pages) +{ + struct page *p, *n; + + list_for_each_entry_safe(p, n, pages, lru) + __free_page(p); + + INIT_LIST_HEAD(pages); +} + +/* + * Given an array of items in userspace, return a list of pages + * containing the data. If copying fails, either because of memory + * allocation failure or a problem reading user memory, return an + * error code; its up to the caller to dispose of any partial list. + */ +static int gather_array(struct list_head *pagelist, + unsigned nelem, size_t size, + void __user *data) +{ + unsigned pageidx; + void *pagedata; + int ret; + + if (size > PAGE_SIZE) + return 0; + + pageidx = PAGE_SIZE; + pagedata = NULL; /* quiet, gcc */ + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page = alloc_page(GFP_KERNEL); + + ret = -ENOMEM; + if (page == NULL) + goto fail; + + pagedata = page_address(page); + + list_add_tail(&page->lru, pagelist); + pageidx = 0; + } + + ret = -EFAULT; + if (copy_from_user(pagedata + pageidx, data, size)) + goto fail; + + data += size; + pageidx += size; + } + + ret = 0; + +fail: + return ret; +} + +/* + * Call function "fn" on each element of the array fragmented + * over a list of pages. + */ +static int traverse_pages(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, void *state), + void *state) +{ + void *pagedata; + unsigned pageidx; + int ret = 0; + + BUG_ON(size > PAGE_SIZE); + + pageidx = PAGE_SIZE; + pagedata = NULL; /* hush, gcc */ + + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + pageidx = 0; + } + + ret = (*fn)(pagedata + pageidx, state); + if (ret) + break; + pageidx += size; + } + + return ret; +} + +struct mmap_mfn_state { + unsigned long va; + struct vm_area_struct *vma; + domid_t domain; +}; + +static int mmap_mfn_range(void *data, void *state) +{ + struct privcmd_mmap_entry *msg = data; + struct mmap_mfn_state *st = state; + struct vm_area_struct *vma = st->vma; + int rc; + + /* Do not allow range to wrap the address space. */ + if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) + return -EINVAL; + + /* Range chunks must be contiguous in va space. */ + if ((msg->va != st->va) || + ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) + return -EINVAL; + + rc = xen_remap_domain_mfn_range(vma, + msg->va & PAGE_MASK, + msg->mfn, msg->npages, + vma->vm_page_prot, + st->domain); + if (rc < 0) + return rc; + + st->va += msg->npages << PAGE_SHIFT; + + return 0; +} + +static long privcmd_ioctl_mmap(void __user *udata) +{ + struct privcmd_mmap mmapcmd; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + LIST_HEAD(pagelist); + struct mmap_mfn_state state; + + if (!xen_initial_domain()) + return -EPERM; + + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) + return -EFAULT; + + rc = gather_array(&pagelist, + mmapcmd.num, sizeof(struct privcmd_mmap_entry), + mmapcmd.entry); + + if (rc || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + { + struct page *page = list_first_entry(&pagelist, + struct page, lru); + struct privcmd_mmap_entry *msg = page_address(page); + + vma = find_vma(mm, msg->va); + rc = -EINVAL; + + if (!vma || (msg->va != vma->vm_start) || + !privcmd_enforce_singleshot_mapping(vma)) + goto out_up; + } + + state.va = vma->vm_start; + state.vma = vma; + state.domain = mmapcmd.dom; + + rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), + &pagelist, + mmap_mfn_range, &state); + + +out_up: + up_write(&mm->mmap_sem); + +out: + free_page_list(&pagelist); + + return rc; +} + +struct mmap_batch_state { + domid_t domain; + unsigned long va; + struct vm_area_struct *vma; + int err; + + xen_pfn_t __user *user; +}; + +static int mmap_batch_fn(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + + if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain) < 0) { + *mfnp |= 0xf0000000U; + st->err++; + } + st->va += PAGE_SIZE; + + return 0; +} + +static int mmap_return_errors(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + + return put_user(*mfnp, st->user++); +} + +static struct vm_operations_struct privcmd_vm_ops; + +static long privcmd_ioctl_mmap_batch(void __user *udata) +{ + int ret; + struct privcmd_mmapbatch m; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long nr_pages; + LIST_HEAD(pagelist); + struct mmap_batch_state state; + + if (!xen_initial_domain()) + return -EPERM; + + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; + + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; + + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), + m.arr); + + if (ret || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, m.addr); + ret = -EINVAL; + if (!vma || + vma->vm_ops != &privcmd_vm_ops || + (m.addr != vma->vm_start) || + ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || + !privcmd_enforce_singleshot_mapping(vma)) { + up_write(&mm->mmap_sem); + goto out; + } + + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.err = 0; + + ret = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state); + + up_write(&mm->mmap_sem); + + if (state.err > 0) { + state.user = m.arr; + ret = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, + mmap_return_errors, &state); + } + +out: + free_page_list(&pagelist); + + return ret; +} + +static long privcmd_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + void __user *udata = (void __user *) data; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: + ret = privcmd_ioctl_hypercall(udata); + break; + + case IOCTL_PRIVCMD_MMAP: + ret = privcmd_ioctl_mmap(udata); + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + ret = privcmd_ioctl_mmap_batch(udata); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", + vma, vma->vm_start, vma->vm_end, + vmf->pgoff, vmf->virtual_address); + + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .fault = privcmd_fault +}; + +static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Unsupported for auto-translate guests. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + + /* DONTCOPY is essential for Xen because copy_page_range doesn't know + * how to recreate these mappings */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; + + return 0; +} + +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +{ + return (xchg(&vma->vm_private_data, (void *)1) == NULL); +} +#endif + +const struct file_operations privcmd_file_ops = { + .unlocked_ioctl = privcmd_ioctl, + .mmap = privcmd_mmap, +}; diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 6559e0c..1aa3897 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -13,6 +13,8 @@ #include <linux/fs.h> #include <linux/magic.h> +#include <xen/xen.h> + #include "xenfs.h" #include <asm/xen/hypervisor.h> @@ -20,6 +22,46 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static struct inode *xenfs_make_inode(struct super_block *sb, int mode) +{ + struct inode *ret = new_inode(sb); + + if (ret) { + ret->i_mode = mode; + ret->i_uid = ret->i_gid = 0; + ret->i_blocks = 0; + ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; + } + return ret; +} + +static struct dentry *xenfs_create_file(struct super_block *sb, + struct dentry *parent, + const char *name, + const struct file_operations *fops, + void *data, + int mode) +{ + struct dentry *dentry; + struct inode *inode; + + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + inode = xenfs_make_inode(sb, S_IFREG | mode); + if (!inode) { + dput(dentry); + return NULL; + } + + inode->i_fop = fops; + inode->i_private = data; + + d_add(dentry, inode); + return dentry; +} + static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -33,6 +75,7 @@ static ssize_t capabilities_read(struct file *file, char __user *buf, static const struct file_operations capabilities_file_ops = { .read = capabilities_read, + .llseek = default_llseek, }; static int xenfs_fill_super(struct super_block *sb, void *data, int silent) @@ -41,29 +84,42 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) [1] = {}, { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR }, {""}, }; + int rc; + + rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); + if (rc < 0) + return rc; + + if (xen_initial_domain()) { + xenfs_create_file(sb, sb->s_root, "xsd_kva", + &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); + xenfs_create_file(sb, sb->s_root, "xsd_port", + &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); + } - return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); + return rc; } -static int xenfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct vfsmount *mnt) +static struct dentry *xenfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) { - return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt); + return mount_single(fs_type, flags, data, xenfs_fill_super); } static struct file_system_type xenfs_type = { .owner = THIS_MODULE, .name = "xenfs", - .get_sb = xenfs_get_sb, + .mount = xenfs_mount, .kill_sb = kill_litter_super, }; static int __init xenfs_init(void) { - if (xen_pv_domain()) + if (xen_domain()) return register_filesystem(&xenfs_type); printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); @@ -72,7 +128,7 @@ static int __init xenfs_init(void) static void __exit xenfs_exit(void) { - if (xen_pv_domain()) + if (xen_domain()) unregister_filesystem(&xenfs_type); } diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c index 8f6c7d4..bbd000f 100644 --- a/drivers/xen/xenfs/xenbus.c +++ b/drivers/xen/xenfs/xenbus.c @@ -43,6 +43,7 @@ #include <linux/fs.h> #include <linux/poll.h> #include <linux/mutex.h> +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/mount.h> #include <linux/pagemap.h> @@ -50,6 +51,7 @@ #include <linux/init.h> #include <linux/namei.h> #include <linux/string.h> +#include <linux/slab.h> #include "xenfs.h" #include "../xenbus/xenbus_comms.h" @@ -587,4 +589,5 @@ const struct file_operations xenbus_file_ops = { .open = xenbus_file_open, .release = xenbus_file_release, .poll = xenbus_file_poll, + .llseek = no_llseek, }; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 51f08b2..b68aa62 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -2,5 +2,8 @@ #define _XENFS_XENBUS_H extern const struct file_operations xenbus_file_ops; +extern const struct file_operations privcmd_file_ops; +extern const struct file_operations xsd_kva_file_ops; +extern const struct file_operations xsd_port_file_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c new file mode 100644 index 0000000..fef20db --- /dev/null +++ b/drivers/xen/xenfs/xenstored.c @@ -0,0 +1,68 @@ +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> + +#include <xen/page.h> + +#include "xenfs.h" +#include "../xenbus/xenbus_comms.h" + +static ssize_t xsd_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + const char *str = (const char *)file->private_data; + return simple_read_from_buffer(buf, size, off, str, strlen(str)); +} + +static int xsd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static int xsd_kva_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p", + xen_store_interface); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) + return -EINVAL; + + if (remap_pfn_range(vma, vma->vm_start, + virt_to_pfn(xen_store_interface), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +const struct file_operations xsd_kva_file_ops = { + .open = xsd_kva_open, + .mmap = xsd_kva_mmap, + .read = xsd_read, + .release = xsd_release, +}; + +static int xsd_port_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "%d", + xen_store_evtchn); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +const struct file_operations xsd_port_file_ops = { + .open = xsd_port_open, + .read = xsd_read, + .release = xsd_release, +}; |