diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r-- | arch/powerpc/platforms/pseries/cmm.c | 14 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/dlpar.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 4 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/hotplug-memory.c | 66 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/iommu.c | 589 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/msi.c | 18 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/nvram.c | 279 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/offline_states.h | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/pci_dlpar.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/setup.c | 21 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/smp.c | 21 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/xics.c | 140 |
13 files changed, 984 insertions, 176 deletions
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index f480386..3cafc30 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -508,12 +508,7 @@ static int cmm_memory_isolate_cb(struct notifier_block *self, if (action == MEM_ISOLATE_COUNT) ret = cmm_count_pages(arg); - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } static struct notifier_block cmm_mem_isolate_nb = { @@ -635,12 +630,7 @@ static int cmm_memory_cb(struct notifier_block *self, break; } - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } static struct notifier_block cmm_mem_nb = { diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index b74a923..57ceb92 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -74,7 +74,7 @@ static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) return NULL; /* The configure connector reported name does not contain a - * preceeding '/', so we allocate a buffer large enough to + * preceding '/', so we allocate a buffer large enough to * prepend this to the full_name. */ name = (char *)ccwa + ccwa->name_offset; diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 17a11c8..8964917 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -65,7 +65,7 @@ * with EEH. * * Ideally, a PCI device driver, when suspecting that an isolation - * event has occured (e.g. by reading 0xff's), will then ask EEH + * event has occurred (e.g. by reading 0xff's), will then ask EEH * whether this is the case, and then take appropriate steps to * reset the PCI slot, the PCI device, and then resume operations. * However, until that day, the checking is done here, with the @@ -876,7 +876,7 @@ void eeh_restore_bars(struct pci_dn *pdn) * * Save the values of the device bars. Unlike the restore * routine, this routine is *not* recursive. This is because - * PCI devices are added individuallly; but, for the restore, + * PCI devices are added individually; but, for the restore, * an entire slot is reset at a time. */ static void eeh_save_bars(struct pci_dn *pdn) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fd50ccd..ef8c454 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -216,7 +216,7 @@ static void pseries_cpu_die(unsigned int cpu) cpu, pcpu, cpu_status); } - /* Isolation and deallocation are definatly done by + /* Isolation and deallocation are definitely done by * drslot_chrp_cpu. If they were not they would be * done here. Change isolate state to Isolate and * change allocation-state to Unusable. diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index bc88036..33867ec 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -17,6 +17,54 @@ #include <asm/pSeries_reconfig.h> #include <asm/sparsemem.h> +static unsigned long get_memblock_size(void) +{ + struct device_node *np; + unsigned int memblock_size = 0; + + np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (np) { + const unsigned long *size; + + size = of_get_property(np, "ibm,lmb-size", NULL); + memblock_size = size ? *size : 0; + + of_node_put(np); + } else { + unsigned int memzero_size = 0; + const unsigned int *regs; + + np = of_find_node_by_path("/memory@0"); + if (np) { + regs = of_get_property(np, "reg", NULL); + memzero_size = regs ? regs[3] : 0; + of_node_put(np); + } + + if (memzero_size) { + /* We now know the size of memory@0, use this to find + * the first memoryblock and get its size. + */ + char buf[64]; + + sprintf(buf, "/memory@%x", memzero_size); + np = of_find_node_by_path(buf); + if (np) { + regs = of_get_property(np, "reg", NULL); + memblock_size = regs ? regs[3] : 0; + of_node_put(np); + } + } + } + + return memblock_size; +} + +unsigned long memory_block_size_bytes(void) +{ + return get_memblock_size(); +} + static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) { unsigned long start, start_pfn; @@ -127,30 +175,22 @@ static int pseries_add_memory(struct device_node *np) static int pseries_drconf_memory(unsigned long *base, unsigned int action) { - struct device_node *np; - const unsigned long *lmb_size; + unsigned long memblock_size; int rc; - np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (!np) + memblock_size = get_memblock_size(); + if (!memblock_size) return -EINVAL; - lmb_size = of_get_property(np, "ibm,lmb-size", NULL); - if (!lmb_size) { - of_node_put(np); - return -EINVAL; - } - if (action == PSERIES_DRCONF_MEM_ADD) { - rc = memblock_add(*base, *lmb_size); + rc = memblock_add(*base, memblock_size); rc = (rc < 0) ? -EINVAL : 0; } else if (action == PSERIES_DRCONF_MEM_REMOVE) { - rc = pseries_remove_memblock(*base, *lmb_size); + rc = pseries_remove_memblock(*base, memblock_size); } else { rc = -EINVAL; } - of_node_put(np); return rc; } diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index edea60b..6d5412a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -33,6 +33,7 @@ #include <linux/pci.h> #include <linux/dma-mapping.h> #include <linux/crash_dump.h> +#include <linux/memory.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -45,6 +46,7 @@ #include <asm/tce.h> #include <asm/ppc-pci.h> #include <asm/udbg.h> +#include <asm/mmzone.h> #include "plpar_wrappers.h" @@ -270,6 +272,152 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) return tce_ret; } +/* this is compatible with cells for the device tree property */ +struct dynamic_dma_window_prop { + __be32 liobn; /* tce table number */ + __be64 dma_base; /* address hi,lo */ + __be32 tce_shift; /* ilog2(tce_page_size) */ + __be32 window_shift; /* ilog2(tce_window_size) */ +}; + +struct direct_window { + struct device_node *device; + const struct dynamic_dma_window_prop *prop; + struct list_head list; +}; + +/* Dynamic DMA Window support */ +struct ddw_query_response { + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; +}; + +struct ddw_create_response { + u32 liobn; + u32 addr_hi; + u32 addr_lo; +}; + +static LIST_HEAD(direct_window_list); +/* prevents races between memory on/offline and window creation */ +static DEFINE_SPINLOCK(direct_window_list_lock); +/* protects initializing window twice for same device */ +static DEFINE_MUTEX(direct_window_init_mutex); +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" + +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), + dma_offset, + 0, limit); + num_tce -= limit; + } while (num_tce > 0 && !rc); + + return rc; +} + +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable(); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange->liobn); + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + for (l = 0; l < limit; l++) { + tcep[l] = proto_tce | next; + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + dma_offset, + (u64)virt_to_abs(tcep), + limit); + + num_tce -= limit; + } while (num_tce > 0 && !rc); + + /* error cleanup: caller will clear whole range */ + + local_irq_enable(); + return rc; +} + +static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, + unsigned long num_pfn, void *arg) +{ + return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); +} + + #ifdef CONFIG_PCI static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, @@ -495,6 +643,329 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) pci_name(dev)); } +static int __read_mostly disable_ddw; + +static int __init disable_ddw_setup(char *str) +{ + disable_ddw = 1; + printk(KERN_INFO "ppc iommu: disabling ddw.\n"); + + return 0; +} + +early_param("disable_ddw", disable_ddw_setup); + +static void remove_ddw(struct device_node *np) +{ + struct dynamic_dma_window_prop *dwp; + struct property *win64; + const u32 *ddr_avail; + u64 liobn; + int len, ret; + + ddr_avail = of_get_property(np, "ibm,ddw-applicable", &len); + win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win64 || !ddr_avail || len < 3 * sizeof(u32)) + return; + + dwp = win64->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + /* clear the whole window, note the arg is in kernel pages */ + ret = tce_clearrange_multi_pSeriesLP(0, + 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); + if (ret) + pr_warning("%s failed to clear tces in window.\n", + np->full_name); + else + pr_debug("%s successfully cleared tces in window.\n", + np->full_name); + + ret = rtas_call(ddr_avail[2], 1, 1, NULL, liobn); + if (ret) + pr_warning("%s: failed to remove direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddr_avail[2], liobn); + else + pr_debug("%s: successfully removed direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddr_avail[2], liobn); +} + + +static int dupe_ddw_if_already_created(struct pci_dev *dev, struct device_node *pdn) +{ + struct device_node *dn; + struct pci_dn *pcidn; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + spin_lock(&direct_window_list_lock); + /* check if we already created a window and dupe that config if so */ + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == pdn) { + direct64 = window->prop; + dma_addr = direct64->dma_base; + break; + } + } + spin_unlock(&direct_window_list_lock); + + return dma_addr; +} + +static u64 dupe_ddw_if_kexec(struct pci_dev *dev, struct device_node *pdn) +{ + struct device_node *dn; + struct pci_dn *pcidn; + int len; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); + if (direct64) { + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) { + remove_ddw(pdn); + } else { + window->device = pdn; + window->prop = direct64; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + dma_addr = direct64->dma_base; + } + } + + return dma_addr; +} + +static int query_ddw(struct pci_dev *dev, const u32 *ddr_avail, + struct ddw_query_response *query) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + ret = rtas_call(ddr_avail[0], 3, 5, (u32 *)query, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" + " returned %d\n", ddr_avail[0], cfg_addr, BUID_HI(buid), + BUID_LO(buid), ret); + return ret; +} + +static int create_ddw(struct pci_dev *dev, const u32 *ddr_avail, + struct ddw_create_response *create, int page_shift, + int window_shift) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + + do { + /* extra outputs are LIOBN and dma-addr (hi, lo) */ + ret = rtas_call(ddr_avail[1], 5, 4, (u32 *)create, cfg_addr, + BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + } while (rtas_busy_delay(ret)); + dev_info(&dev->dev, + "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " + "(liobn = 0x%x starting addr = %x %x)\n", ddr_avail[1], + cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift, + window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + + return ret; +} + +/* + * If the PE supports dynamic dma windows, and there is space for a table + * that can map all pages in a linear offset, then setup such a table, + * and record the dma-offset in the struct device. + * + * dev: the pci device we are checking + * pdn: the parent pe node with the ibm,dma_window property + * Future: also check if we can remap the base window for our base page size + * + * returns the dma offset for use by dma_set_mask + */ +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +{ + int len, ret; + struct ddw_query_response query; + struct ddw_create_response create; + int page_shift; + u64 dma_addr, max_addr; + struct device_node *dn; + const u32 *uninitialized_var(ddr_avail); + struct direct_window *window; + struct property *uninitialized_var(win64); + struct dynamic_dma_window_prop *ddwprop; + + mutex_lock(&direct_window_init_mutex); + + dma_addr = dupe_ddw_if_already_created(dev, pdn); + if (dma_addr != 0) + goto out_unlock; + + dma_addr = dupe_ddw_if_kexec(dev, pdn); + if (dma_addr != 0) + goto out_unlock; + + /* + * the ibm,ddw-applicable property holds the tokens for: + * ibm,query-pe-dma-window + * ibm,create-pe-dma-window + * ibm,remove-pe-dma-window + * for the given node in that order. + * the property is actually in the parent, not the PE + */ + ddr_avail = of_get_property(pdn, "ibm,ddw-applicable", &len); + if (!ddr_avail || len < 3 * sizeof(u32)) + goto out_unlock; + + /* + * Query if there is a second window of size to map the + * whole partition. Query returns number of windows, largest + * block assigned to PE (partition endpoint), and two bitmasks + * of page sizes: supported and supported for migrate-dma. + */ + dn = pci_device_to_OF_node(dev); + ret = query_ddw(dev, ddr_avail, &query); + if (ret != 0) + goto out_unlock; + + if (query.windows_available == 0) { + /* + * no additional windows are available for this device. + * We might be able to reallocate the existing window, + * trading in for a larger page size. + */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_unlock; + } + if (query.page_size & 4) { + page_shift = 24; /* 16MB */ + } else if (query.page_size & 2) { + page_shift = 16; /* 64kB */ + } else if (query.page_size & 1) { + page_shift = 12; /* 4kB */ + } else { + dev_dbg(&dev->dev, "no supported direct page size in mask %x", + query.page_size); + goto out_unlock; + } + /* verify the window * number of ptes will map the partition */ + /* check largest block * page size > max memory hotplug addr */ + max_addr = memory_hotplug_max(); + if (query.largest_available_block < (max_addr >> page_shift)) { + dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u " + "%llu-sized pages\n", max_addr, query.largest_available_block, + 1ULL << page_shift); + goto out_unlock; + } + len = order_base_2(max_addr); + win64 = kzalloc(sizeof(struct property), GFP_KERNEL); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property for 64bit dma window\n"); + goto out_unlock; + } + win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); + win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); + if (!win64->name || !win64->value) { + dev_info(&dev->dev, + "couldn't allocate property name and value\n"); + goto out_free_prop; + } + + ret = create_ddw(dev, ddr_avail, &create, page_shift, len); + if (ret != 0) + goto out_free_prop; + + ddwprop->liobn = cpu_to_be32(create.liobn); + ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2)); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(len); + + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n", + create.liobn, dn->full_name); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_clear_window; + + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %s: %d\n", + dn->full_name, ret); + goto out_clear_window; + } + + ret = prom_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %s: %d", + pdn->full_name, ret); + goto out_clear_window; + } + + window->device = pdn; + window->prop = ddwprop; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + + dma_addr = of_read_number(&create.addr_hi, 2); + goto out_unlock; + +out_clear_window: + remove_ddw(pdn); + +out_free_prop: + kfree(win64->name); + kfree(win64->value); + kfree(win64); + +out_unlock: + mutex_unlock(&direct_window_init_mutex); + return dma_addr; +} + static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; @@ -505,7 +976,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); /* dev setup for LPAR is a little tricky, since the device tree might - * contain the dma-window properties per-device and not neccesarily + * contain the dma-window properties per-device and not necessarily * for the bus. So we need to search upwards in the tree until we * either hit a dma-window property, OR find a parent with a table * already allocated. @@ -541,23 +1012,137 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) set_iommu_table_base(&dev->dev, pci->iommu_table); } + +static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +{ + bool ddw_enabled = false; + struct device_node *pdn, *dn; + struct pci_dev *pdev; + const void *dma_window = NULL; + u64 dma_offset; + + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + + /* only attempt to use a new window if 64-bit DMA is requested */ + if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + pdev = to_pci_dev(dev); + + dn = pci_device_to_OF_node(pdev); + dev_dbg(dev, "node is %s\n", dn->full_name); + + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); + set_dma_offset(dev, dma_offset); + set_dma_ops(dev, &dma_direct_ops); + ddw_enabled = true; + } + } + } + + /* fall-through to iommu ops */ + if (!ddw_enabled) { + dev_info(dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(dev, &dma_iommu_ops); + } + + *dev->dma_mask = dma_mask; + return 0; +} + #else /* CONFIG_PCI */ #define pci_dma_bus_setup_pSeries NULL #define pci_dma_dev_setup_pSeries NULL #define pci_dma_bus_setup_pSeriesLP NULL #define pci_dma_dev_setup_pSeriesLP NULL +#define dma_set_mask_pSeriesLP NULL #endif /* !CONFIG_PCI */ +static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct direct_window *window; + struct memory_notify *arg = data; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + default: + break; + } + if (ret && action != MEM_CANCEL_ONLINE) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static struct notifier_block iommu_mem_nb = { + .notifier_call = iommu_mem_notifier, +}; + static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) { int err = NOTIFY_OK; struct device_node *np = node; struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; switch (action) { case PSERIES_RECONFIG_REMOVE: if (pci && pci->iommu_table) iommu_free_table(pci->iommu_table, np->full_name); + + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == np) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&direct_window_list_lock); + + /* + * Because the notifier runs after isolation of the + * slot, we are guaranteed any DMA window has already + * been revoked and the TCEs have been marked invalid, + * so we don't need a call to remove_ddw(np). However, + * if an additional notifier action is added before the + * isolate call, we should update this code for + * completeness with such a call. + */ break; default: err = NOTIFY_DONE; @@ -587,6 +1172,7 @@ void iommu_init_early_pSeries(void) ppc_md.tce_get = tce_get_pSeriesLP; ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; + ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; } else { ppc_md.tce_build = tce_build_pSeries; ppc_md.tce_free = tce_free_pSeries; @@ -597,6 +1183,7 @@ void iommu_init_early_pSeries(void) pSeries_reconfig_notifier_register(&iommu_reconfig_nb); + register_memory_notifier(&iommu_mem_nb); set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 1164c34..38d24e7 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -93,8 +93,18 @@ static void rtas_disable_msi(struct pci_dev *pdev) if (!pdn) return; - if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) - pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + /* + * disabling MSI with the explicit interface also disables MSI-X + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) { + /* + * may have failed because explicit interface is not + * present + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) { + pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + } + } } static int rtas_query_irq_number(struct pci_dn *pdn, int offset) @@ -127,7 +137,7 @@ static void rtas_teardown_msi_irqs(struct pci_dev *pdev) if (entry->irq == NO_IRQ) continue; - set_irq_msi(entry->irq, NULL); + irq_set_msi_desc(entry->irq, NULL); irq_dispose_mapping(entry->irq); } @@ -427,7 +437,7 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) } dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); - set_irq_msi(virq, entry); + irq_set_msi_desc(virq, entry); /* Read config space back so we can restore after reset */ read_msi_msg(virq, &msg); diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 7e828ba..00cc3a0 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -16,6 +16,8 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/kmsg_dump.h> #include <asm/uaccess.h> #include <asm/nvram.h> #include <asm/rtas.h> @@ -30,17 +32,54 @@ static int nvram_fetch, nvram_store; static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ static DEFINE_SPINLOCK(nvram_lock); -static long nvram_error_log_index = -1; -static long nvram_error_log_size = 0; - struct err_log_info { int error_type; unsigned int seq_num; }; -#define NVRAM_MAX_REQ 2079 -#define NVRAM_MIN_REQ 1055 -#define NVRAM_LOG_PART_NAME "ibm,rtas-log" +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ +}; + +static struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1 +}; + +static struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1 +}; + +static const char *pseries_nvram_os_partitions[] = { + "ibm,rtas-log", + "lnx,oops-log", + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* See clobbering_unread_rtas_event() */ +#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ +static unsigned long last_unread_rtas_event; /* timestamp */ + +/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */ +static char *oops_buf; static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { @@ -134,7 +173,7 @@ static ssize_t pSeries_nvram_get_size(void) } -/* nvram_write_error_log +/* nvram_write_os_partition, nvram_write_error_log * * We need to buffer the error logs into nvram to ensure that we have * the failure information to decode. If we have a severe error there @@ -156,48 +195,58 @@ static ssize_t pSeries_nvram_get_size(void) * The 'data' section would look like (in bytes): * +--------------+------------+-----------------------------------+ * | event_logged | sequence # | error log | - * |0 3|4 7|8 nvram_error_log_size-1| + * |0 3|4 7|8 error_log_size-1| * +--------------+------------+-----------------------------------+ * * event_logged: 0 if event has not been logged to syslog, 1 if it has * sequence #: The unique sequence # for each event. (until it wraps) * error log: The error log from event_scan */ -int nvram_write_error_log(char * buff, int length, - unsigned int err_type, unsigned int error_log_cnt) +int nvram_write_os_partition(struct nvram_os_partition *part, char * buff, + int length, unsigned int err_type, unsigned int error_log_cnt) { int rc; loff_t tmp_index; struct err_log_info info; - if (nvram_error_log_index == -1) { + if (part->index == -1) { return -ESPIPE; } - if (length > nvram_error_log_size) { - length = nvram_error_log_size; + if (length > part->size) { + length = part->size; } info.error_type = err_type; info.seq_num = error_log_cnt; - tmp_index = nvram_error_log_index; + tmp_index = part->index; rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); return rc; } rc = ppc_md.nvram_write(buff, length, &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); return rc; } return 0; } +int nvram_write_error_log(char * buff, int length, + unsigned int err_type, unsigned int error_log_cnt) +{ + int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); + if (!rc) + last_unread_rtas_event = get_seconds(); + return rc; +} + /* nvram_read_error_log * * Reads nvram for error log for at most 'length' @@ -209,13 +258,13 @@ int nvram_read_error_log(char * buff, int length, loff_t tmp_index; struct err_log_info info; - if (nvram_error_log_index == -1) + if (rtas_log_partition.index == -1) return -1; - if (length > nvram_error_log_size) - length = nvram_error_log_size; + if (length > rtas_log_partition.size) + length = rtas_log_partition.size; - tmp_index = nvram_error_log_index; + tmp_index = rtas_log_partition.index; rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); if (rc <= 0) { @@ -244,37 +293,40 @@ int nvram_clear_error_log(void) int clear_word = ERR_FLAG_ALREADY_LOGGED; int rc; - if (nvram_error_log_index == -1) + if (rtas_log_partition.index == -1) return -1; - tmp_index = nvram_error_log_index; + tmp_index = rtas_log_partition.index; rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); if (rc <= 0) { printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); return rc; } + last_unread_rtas_event = 0; return 0; } -/* pseries_nvram_init_log_partition +/* pseries_nvram_init_os_partition * - * This will setup the partition we need for buffering the - * error logs and cleanup partitions if needed. + * This sets up a partition with an "OS" signature. * * The general strategy is the following: - * 1.) If there is log partition large enough then use it. - * 2.) If there is none large enough, search - * for a free partition that is large enough. - * 3.) If there is not a free partition large enough remove - * _all_ OS partitions and consolidate the space. - * 4.) Will first try getting a chunk that will satisfy the maximum - * error log size (NVRAM_MAX_REQ). - * 5.) If the max chunk cannot be allocated then try finding a chunk - * that will satisfy the minum needed (NVRAM_MIN_REQ). + * 1.) If a partition with the indicated name already exists... + * - If it's large enough, use it. + * - Otherwise, recycle it and keep going. + * 2.) Search for a free partition that is large enough. + * 3.) If there's not a free partition large enough, recycle any obsolete + * OS partitions and try again. + * 4.) Will first try getting a chunk that will satisfy the requested size. + * 5.) If a chunk of the requested size cannot be allocated, then try finding + * a chunk that will satisfy the minum needed. + * + * Returns 0 on success, else -1. */ -static int __init pseries_nvram_init_log_partition(void) +static int __init pseries_nvram_init_os_partition(struct nvram_os_partition + *part) { loff_t p; int size; @@ -282,47 +334,76 @@ static int __init pseries_nvram_init_log_partition(void) /* Scan nvram for partitions */ nvram_scan_partitions(); - /* Lookg for ours */ - p = nvram_find_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS, &size); + /* Look for ours */ + p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size); /* Found one but too small, remove it */ - if (p && size < NVRAM_MIN_REQ) { - pr_info("nvram: Found too small "NVRAM_LOG_PART_NAME" partition" - ",removing it..."); - nvram_remove_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS); + if (p && size < part->min_size) { + pr_info("nvram: Found too small %s partition," + " removing it...\n", part->name); + nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL); p = 0; } /* Create one if we didn't find */ if (!p) { - p = nvram_create_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS, - NVRAM_MAX_REQ, NVRAM_MIN_REQ); - /* No room for it, try to get rid of any OS partition - * and try again - */ + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); if (p == -ENOSPC) { - pr_info("nvram: No room to create "NVRAM_LOG_PART_NAME - " partition, deleting all OS partitions..."); - nvram_remove_partition(NULL, NVRAM_SIG_OS); - p = nvram_create_partition(NVRAM_LOG_PART_NAME, - NVRAM_SIG_OS, NVRAM_MAX_REQ, - NVRAM_MIN_REQ); + pr_info("nvram: No room to create %s partition, " + "deleting any obsolete OS partitions...\n", + part->name); + nvram_remove_partition(NULL, NVRAM_SIG_OS, + pseries_nvram_os_partitions); + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); } } if (p <= 0) { - pr_err("nvram: Failed to find or create "NVRAM_LOG_PART_NAME - " partition, err %d\n", (int)p); - return 0; + pr_err("nvram: Failed to find or create %s" + " partition, err %d\n", part->name, (int)p); + return -1; } - nvram_error_log_index = p; - nvram_error_log_size = nvram_get_partition_size(p) - - sizeof(struct err_log_info); + part->index = p; + part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info); return 0; } -machine_arch_initcall(pseries, pseries_nvram_init_log_partition); + +static void __init nvram_init_oops_partition(int rtas_partition_exists) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&oops_log_partition); + if (rc != 0) { + if (!rtas_partition_exists) + return; + pr_notice("nvram: Using %s partition to log both" + " RTAS errors and oops/panic reports\n", + rtas_log_partition.name); + memcpy(&oops_log_partition, &rtas_log_partition, + sizeof(rtas_log_partition)); + } + oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL); + rc = kmsg_dump_register(&nvram_kmsg_dumper); + if (rc != 0) { + pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); + kfree(oops_buf); + return; + } +} + +static int __init pseries_nvram_init_log_partitions(void) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&rtas_log_partition); + nvram_init_oops_partition(rc == 0); + return 0; +} +machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); int __init pSeries_nvram_init(void) { @@ -353,3 +434,83 @@ int __init pSeries_nvram_init(void) return 0; } + +/* + * Try to capture the last capture_len bytes of the printk buffer. Return + * the amount actually captured. + */ +static size_t capture_last_msgs(const char *old_msgs, size_t old_len, + const char *new_msgs, size_t new_len, + char *captured, size_t capture_len) +{ + if (new_len >= capture_len) { + memcpy(captured, new_msgs + (new_len - capture_len), + capture_len); + return capture_len; + } else { + /* Grab the end of old_msgs. */ + size_t old_tail_len = min(old_len, capture_len - new_len); + memcpy(captured, old_msgs + (old_len - old_tail_len), + old_tail_len); + memcpy(captured + old_tail_len, new_msgs, new_len); + return old_tail_len + new_len; + } +} + +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +/* our kmsg_dump callback */ +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len) +{ + static unsigned int oops_count = 0; + static bool panicking = false; + size_t text_len; + + switch (reason) { + case KMSG_DUMP_RESTART: + case KMSG_DUMP_HALT: + case KMSG_DUMP_POWEROFF: + /* These are almost always orderly shutdowns. */ + return; + case KMSG_DUMP_OOPS: + case KMSG_DUMP_KEXEC: + break; + case KMSG_DUMP_PANIC: + panicking = true; + break; + case KMSG_DUMP_EMERG: + if (panicking) + /* Panic report already captured. */ + return; + break; + default: + pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n", + __FUNCTION__, (int) reason); + return; + } + + if (clobbering_unread_rtas_event()) + return; + + text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len, + oops_buf, oops_log_partition.size); + (void) nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count); +} diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h index 75a6f48..08672d9 100644 --- a/arch/powerpc/platforms/pseries/offline_states.h +++ b/arch/powerpc/platforms/pseries/offline_states.h @@ -34,6 +34,4 @@ static inline void set_default_offline_state(int cpu) #endif extern enum cpu_state_vals get_preferred_offline_state(int cpu); -extern int start_secondary(void); -extern void start_secondary_resume(void); #endif diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 5fcc92a..3bf4488 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -149,7 +149,7 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) if (dn->child) eeh_add_device_tree_early(dn); - pcibios_scan_phb(phb, dn); + pcibios_scan_phb(phb); pcibios_finish_adding_to_bus(phb->bus); return phb; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index d345bfd..6c42cfd 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -114,10 +114,13 @@ static void __init fwnmi_init(void) static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = irq_desc_get_chip(desc); unsigned int cascade_irq = i8259_irq(); + if (cascade_irq != NO_IRQ) generic_handle_irq(cascade_irq); - desc->chip->eoi(irq); + + chip->irq_eoi(&desc->irq_data); } static void __init pseries_setup_i8259_cascade(void) @@ -166,7 +169,7 @@ static void __init pseries_setup_i8259_cascade(void) printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack); i8259_init(found, intack); of_node_put(found); - set_irq_chained_handler(cascade, pseries_8259_cascade); + irq_set_chained_handler(cascade, pseries_8259_cascade); } static void __init pseries_mpic_init_IRQ(void) @@ -284,14 +287,22 @@ static int alloc_dispatch_logs(void) int cpu, ret; struct paca_struct *pp; struct dtl_entry *dtl; + struct kmem_cache *dtl_cache; if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, NULL); + if (!dtl_cache) { + pr_warn("Failed to create dispatch trace log buffer cache\n"); + pr_warn("Stolen time statistics will be unreliable\n"); + return 0; + } + for_each_possible_cpu(cpu) { pp = &paca[cpu]; - dtl = kmalloc_node(DISPATCH_LOG_BYTES, GFP_KERNEL, - cpu_to_node(cpu)); + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); if (!dtl) { pr_warn("Failed to allocate dispatch trace log for cpu %d\n", cpu); @@ -375,7 +386,7 @@ static int __init pSeries_init_panel(void) return 0; } -arch_initcall(pSeries_init_panel); +machine_arch_initcall(pseries, pSeries_init_panel); static int pseries_set_dabr(unsigned long dabr) { diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 0317cce..a509c52 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -64,8 +64,8 @@ int smp_query_cpu_stopped(unsigned int pcpu) int qcss_tok = rtas_token("query-cpu-stopped-state"); if (qcss_tok == RTAS_UNKNOWN_SERVICE) { - printk(KERN_INFO "Firmware doesn't support " - "query-cpu-stopped-state\n"); + printk_once(KERN_INFO + "Firmware doesn't support query-cpu-stopped-state\n"); return QCSS_HARDWARE_ERROR; } @@ -112,10 +112,10 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca[lcpu].__current)->preempt_count = 0; - +#ifdef CONFIG_HOTPLUG_CPU if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) goto out; - +#endif /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. @@ -130,7 +130,9 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) return 0; } +#ifdef CONFIG_HOTPLUG_CPU out: +#endif return 1; } @@ -144,16 +146,15 @@ static void __devinit smp_xics_setup_cpu(int cpu) vpa_init(cpu); cpumask_clear_cpu(cpu, of_spin_mask); +#ifdef CONFIG_HOTPLUG_CPU set_cpu_current_state(cpu, CPU_STATE_ONLINE); set_default_offline_state(cpu); - +#endif } #endif /* CONFIG_XICS */ static void __devinit smp_pSeries_kick_cpu(int nr) { - long rc; - unsigned long hcpuid; BUG_ON(nr < 0 || nr >= NR_CPUS); if (!smp_startup_cpu(nr)) @@ -165,16 +166,20 @@ static void __devinit smp_pSeries_kick_cpu(int nr) * the processor will continue on to secondary_start */ paca[nr].cpu_start = 1; - +#ifdef CONFIG_HOTPLUG_CPU set_preferred_offline_state(nr, CPU_STATE_ONLINE); if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + long rc; + unsigned long hcpuid; + hcpuid = get_hard_smp_processor_id(nr); rc = plpar_hcall_norets(H_PROD, hcpuid); if (rc != H_SUCCESS) printk(KERN_ERR "Error: Prod to wake up processor %d " "Ret= %ld\n", nr, rc); } +#endif } static int smp_pSeries_cpu_bootable(unsigned int nr) diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index 7b96e5a..d690133 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -202,88 +202,88 @@ static int get_irq_server(unsigned int virq, const struct cpumask *cpumask, #define get_irq_server(virq, cpumask, strict_check) (default_server) #endif -static void xics_unmask_irq(unsigned int virq) +static void xics_unmask_irq(struct irq_data *d) { - unsigned int irq; + unsigned int hwirq; int call_status; int server; - pr_devel("xics: unmask virq %d\n", virq); + pr_devel("xics: unmask virq %d\n", d->irq); - irq = (unsigned int)irq_map[virq].hwirq; - pr_devel(" -> map to hwirq 0x%x\n", irq); - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) + hwirq = (unsigned int)irq_map[d->irq].hwirq; + pr_devel(" -> map to hwirq 0x%x\n", hwirq); + if (hwirq == XICS_IPI || hwirq == XICS_IRQ_SPURIOUS) return; - server = get_irq_server(virq, irq_to_desc(virq)->affinity, 0); + server = get_irq_server(d->irq, d->affinity, 0); - call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, + call_status = rtas_call(ibm_set_xive, 3, 1, NULL, hwirq, server, DEFAULT_PRIORITY); if (call_status != 0) { printk(KERN_ERR "%s: ibm_set_xive irq %u server %x returned %d\n", - __func__, irq, server, call_status); + __func__, hwirq, server, call_status); return; } /* Now unmask the interrupt (often a no-op) */ - call_status = rtas_call(ibm_int_on, 1, 1, NULL, irq); + call_status = rtas_call(ibm_int_on, 1, 1, NULL, hwirq); if (call_status != 0) { printk(KERN_ERR "%s: ibm_int_on irq=%u returned %d\n", - __func__, irq, call_status); + __func__, hwirq, call_status); return; } } -static unsigned int xics_startup(unsigned int virq) +static unsigned int xics_startup(struct irq_data *d) { /* * The generic MSI code returns with the interrupt disabled on the * card, using the MSI mask bits. Firmware doesn't appear to unmask * at that level, so we do it here by hand. */ - if (irq_to_desc(virq)->msi_desc) - unmask_msi_irq(irq_get_irq_data(virq)); + if (d->msi_desc) + unmask_msi_irq(d); /* unmask it */ - xics_unmask_irq(virq); + xics_unmask_irq(d); return 0; } -static void xics_mask_real_irq(unsigned int irq) +static void xics_mask_real_irq(unsigned int hwirq) { int call_status; - if (irq == XICS_IPI) + if (hwirq == XICS_IPI) return; - call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq); + call_status = rtas_call(ibm_int_off, 1, 1, NULL, hwirq); if (call_status != 0) { printk(KERN_ERR "%s: ibm_int_off irq=%u returned %d\n", - __func__, irq, call_status); + __func__, hwirq, call_status); return; } /* Have to set XIVE to 0xff to be able to remove a slot */ - call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, + call_status = rtas_call(ibm_set_xive, 3, 1, NULL, hwirq, default_server, 0xff); if (call_status != 0) { printk(KERN_ERR "%s: ibm_set_xive(0xff) irq=%u returned %d\n", - __func__, irq, call_status); + __func__, hwirq, call_status); return; } } -static void xics_mask_irq(unsigned int virq) +static void xics_mask_irq(struct irq_data *d) { - unsigned int irq; + unsigned int hwirq; - pr_devel("xics: mask virq %d\n", virq); + pr_devel("xics: mask virq %d\n", d->irq); - irq = (unsigned int)irq_map[virq].hwirq; - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) + hwirq = (unsigned int)irq_map[d->irq].hwirq; + if (hwirq == XICS_IPI || hwirq == XICS_IRQ_SPURIOUS) return; - xics_mask_real_irq(irq); + xics_mask_real_irq(hwirq); } static void xics_mask_unknown_vec(unsigned int vec) @@ -371,57 +371,58 @@ static unsigned char pop_cppr(void) return os_cppr->stack[--os_cppr->index]; } -static void xics_eoi_direct(unsigned int virq) +static void xics_eoi_direct(struct irq_data *d) { - unsigned int irq = (unsigned int)irq_map[virq].hwirq; + unsigned int hwirq = (unsigned int)irq_map[d->irq].hwirq; iosync(); - direct_xirr_info_set((pop_cppr() << 24) | irq); + direct_xirr_info_set((pop_cppr() << 24) | hwirq); } -static void xics_eoi_lpar(unsigned int virq) +static void xics_eoi_lpar(struct irq_data *d) { - unsigned int irq = (unsigned int)irq_map[virq].hwirq; + unsigned int hwirq = (unsigned int)irq_map[d->irq].hwirq; iosync(); - lpar_xirr_info_set((pop_cppr() << 24) | irq); + lpar_xirr_info_set((pop_cppr() << 24) | hwirq); } -static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) +static int +xics_set_affinity(struct irq_data *d, const struct cpumask *cpumask, bool force) { - unsigned int irq; + unsigned int hwirq; int status; int xics_status[2]; int irq_server; - irq = (unsigned int)irq_map[virq].hwirq; - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) + hwirq = (unsigned int)irq_map[d->irq].hwirq; + if (hwirq == XICS_IPI || hwirq == XICS_IRQ_SPURIOUS) return -1; - status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); + status = rtas_call(ibm_get_xive, 1, 3, xics_status, hwirq); if (status) { printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", - __func__, irq, status); + __func__, hwirq, status); return -1; } - irq_server = get_irq_server(virq, cpumask, 1); + irq_server = get_irq_server(d->irq, cpumask, 1); if (irq_server == -1) { char cpulist[128]; cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask); printk(KERN_WARNING "%s: No online cpus in the mask %s for irq %d\n", - __func__, cpulist, virq); + __func__, cpulist, d->irq); return -1; } status = rtas_call(ibm_set_xive, 3, 1, NULL, - irq, irq_server, xics_status[1]); + hwirq, irq_server, xics_status[1]); if (status) { printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", - __func__, irq, status); + __func__, hwirq, status); return -1; } @@ -430,20 +431,20 @@ static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) static struct irq_chip xics_pic_direct = { .name = "XICS", - .startup = xics_startup, - .mask = xics_mask_irq, - .unmask = xics_unmask_irq, - .eoi = xics_eoi_direct, - .set_affinity = xics_set_affinity + .irq_startup = xics_startup, + .irq_mask = xics_mask_irq, + .irq_unmask = xics_unmask_irq, + .irq_eoi = xics_eoi_direct, + .irq_set_affinity = xics_set_affinity }; static struct irq_chip xics_pic_lpar = { .name = "XICS", - .startup = xics_startup, - .mask = xics_mask_irq, - .unmask = xics_unmask_irq, - .eoi = xics_eoi_lpar, - .set_affinity = xics_set_affinity + .irq_startup = xics_startup, + .irq_mask = xics_mask_irq, + .irq_unmask = xics_unmask_irq, + .irq_eoi = xics_eoi_lpar, + .irq_set_affinity = xics_set_affinity }; @@ -469,8 +470,8 @@ static int xics_host_map(struct irq_host *h, unsigned int virq, /* Insert the interrupt mapping into the radix tree for fast lookup */ irq_radix_revmap_insert(xics_host, virq, hw); - irq_to_desc(virq)->status |= IRQ_LEVEL; - set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq); + irq_set_status_flags(virq, IRQ_LEVEL); + irq_set_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq); return 0; } @@ -599,7 +600,7 @@ static void xics_request_ipi(void) * IPIs are marked IRQF_DISABLED as they must run with irqs * disabled */ - set_irq_handler(ipi, handle_percpu_irq); + irq_set_handler(ipi, handle_percpu_irq); if (firmware_has_feature(FW_FEATURE_LPAR)) rc = request_irq(ipi, xics_ipi_action_lpar, IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL); @@ -873,7 +874,7 @@ void xics_kexec_teardown_cpu(int secondary) void xics_migrate_irqs_away(void) { int cpu = smp_processor_id(), hw_cpu = hard_smp_processor_id(); - unsigned int irq, virq; + int virq; /* If we used to be the default server, move to the new "boot_cpuid" */ if (hw_cpu == default_server) @@ -890,33 +891,38 @@ void xics_migrate_irqs_away(void) for_each_irq(virq) { struct irq_desc *desc; + struct irq_chip *chip; + unsigned int hwirq; int xics_status[2]; int status; unsigned long flags; - /* We cant set affinity on ISA interrupts */ + /* We can't set affinity on ISA interrupts */ if (virq < NUM_ISA_INTERRUPTS) continue; if (irq_map[virq].host != xics_host) continue; - irq = (unsigned int)irq_map[virq].hwirq; + hwirq = (unsigned int)irq_map[virq].hwirq; /* We need to get IPIs still. */ - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) + if (hwirq == XICS_IPI || hwirq == XICS_IRQ_SPURIOUS) continue; + desc = irq_to_desc(virq); /* We only need to migrate enabled IRQS */ - if (desc == NULL || desc->chip == NULL - || desc->action == NULL - || desc->chip->set_affinity == NULL) + if (desc == NULL || desc->action == NULL) + continue; + + chip = irq_desc_get_chip(desc); + if (chip == NULL || chip->irq_set_affinity == NULL) continue; raw_spin_lock_irqsave(&desc->lock, flags); - status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); + status = rtas_call(ibm_get_xive, 1, 3, xics_status, hwirq); if (status) { printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", - __func__, irq, status); + __func__, hwirq, status); goto unlock; } @@ -934,8 +940,8 @@ void xics_migrate_irqs_away(void) virq, cpu); /* Reset affinity to all cpus */ - cpumask_setall(irq_to_desc(virq)->affinity); - desc->chip->set_affinity(virq, cpu_all_mask); + cpumask_setall(desc->irq_data.affinity); + chip->irq_set_affinity(&desc->irq_data, cpu_all_mask, true); unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); } |