diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-20 13:45:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-20 13:45:53 -0700 |
commit | ede13d81b4dda409a6d271b34b8e2ec9383e255d (patch) | |
tree | 2e32142d5a8e076c64f1871f5ad162fffff5f357 /arch | |
parent | 2008220879af095d00ca27eb168a55c8595fbc0b (diff) | |
parent | 486acd4850dde6d2f8c7f431432f3914c4bfb5f5 (diff) | |
download | op-kernel-dev-ede13d81b4dda409a6d271b34b8e2ec9383e255d.zip op-kernel-dev-ede13d81b4dda409a6d271b34b8e2ec9383e255d.tar.gz |
Merge branch 'for-2.6.23' of master.kernel.org:/pub/scm/linux/kernel/git/arnd/cell-2.6
* 'for-2.6.23' of master.kernel.org:/pub/scm/linux/kernel/git/arnd/cell-2.6: (37 commits)
[CELL] spufs: rework list management and associated locking
[CELL] oprofile: add support to OProfile for profiling CELL BE SPUs
[CELL] oprofile: enable SPU switch notification to detect currently active SPU tasks
[CELL] spu_base: locking cleanup
[CELL] cell: indexing of SPUs based on firmware vicinity properties
[CELL] spufs: integration of SPE affinity with the scheduller
[CELL] cell: add placement computation for scheduling of affinity contexts
[CELL] spufs: extension of spu_create to support affinity definition
[CELL] cell: add hardcoded spu vicinity information for QS20
[CELL] cell: add vicinity information on spus
[CELL] cell: add per BE structure with info about its SPUs
[CELL] spufs: use find_first_bit() instead of sched_find_first_bit()
[CELL] spufs: remove unused file argument from spufs_run_spu()
[CELL] spufs: change decrementer restore timing
[CELL] spufs: dont halt decrementer at restore step 47
[CELL] spufs: limit saving MFC_CNTL bits
[CELL] spufs: fix read and write for decr_status file
[CELL] spufs: fix decr_status meanings
[CELL] spufs: remove needless context save/restore code
[CELL] spufs: fix array size of channel index
...
Diffstat (limited to 'arch')
44 files changed, 4166 insertions, 871 deletions
diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig index 74f83f4..d9ac24e 100644 --- a/arch/powerpc/configs/cell_defconfig +++ b/arch/powerpc/configs/cell_defconfig @@ -1455,7 +1455,8 @@ CONFIG_HAS_DMA=y # Instrumentation Support # CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m +CONFIG_OPROFILE_CELL=y # CONFIG_KPROBES is not set # diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index d3f2080..37658ea 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -219,6 +219,72 @@ void crash_kexec_secondary(struct pt_regs *regs) cpus_in_sr = CPU_MASK_NONE; } #endif +#ifdef CONFIG_SPU_BASE + +#include <asm/spu.h> +#include <asm/spu_priv1.h> + +struct crash_spu_info { + struct spu *spu; + u32 saved_spu_runcntl_RW; + u32 saved_spu_status_R; + u32 saved_spu_npc_RW; + u64 saved_mfc_sr1_RW; + u64 saved_mfc_dar; + u64 saved_mfc_dsisr; +}; + +#define CRASH_NUM_SPUS 16 /* Enough for current hardware */ +static struct crash_spu_info crash_spu_info[CRASH_NUM_SPUS]; + +static void crash_kexec_stop_spus(void) +{ + struct spu *spu; + int i; + u64 tmp; + + for (i = 0; i < CRASH_NUM_SPUS; i++) { + if (!crash_spu_info[i].spu) + continue; + + spu = crash_spu_info[i].spu; + + crash_spu_info[i].saved_spu_runcntl_RW = + in_be32(&spu->problem->spu_runcntl_RW); + crash_spu_info[i].saved_spu_status_R = + in_be32(&spu->problem->spu_status_R); + crash_spu_info[i].saved_spu_npc_RW = + in_be32(&spu->problem->spu_npc_RW); + + crash_spu_info[i].saved_mfc_dar = spu_mfc_dar_get(spu); + crash_spu_info[i].saved_mfc_dsisr = spu_mfc_dsisr_get(spu); + tmp = spu_mfc_sr1_get(spu); + crash_spu_info[i].saved_mfc_sr1_RW = tmp; + + tmp &= ~MFC_STATE1_MASTER_RUN_CONTROL_MASK; + spu_mfc_sr1_set(spu, tmp); + + __delay(200); + } +} + +void crash_register_spus(struct list_head *list) +{ + struct spu *spu; + + list_for_each_entry(spu, list, full_list) { + if (WARN_ON(spu->number >= CRASH_NUM_SPUS)) + continue; + + crash_spu_info[spu->number].spu = spu; + } +} + +#else +static inline void crash_kexec_stop_spus(void) +{ +} +#endif /* CONFIG_SPU_BASE */ void default_machine_crash_shutdown(struct pt_regs *regs) { @@ -254,6 +320,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, crashing_cpu); crash_kexec_prepare_cpus(crashing_cpu); cpu_set(crashing_cpu, cpus_in_crash); + crash_kexec_stop_spus(); if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(1, 0); } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e5df167..727a669 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -122,6 +122,7 @@ extern struct timezone sys_tz; static long timezone_offset; unsigned long ppc_proc_freq; +EXPORT_SYMBOL(ppc_proc_freq); unsigned long ppc_tb_freq; static u64 tb_last_jiffy __cacheline_aligned_in_smp; diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig index eb2dece..7089e79 100644 --- a/arch/powerpc/oprofile/Kconfig +++ b/arch/powerpc/oprofile/Kconfig @@ -15,3 +15,10 @@ config OPROFILE If unsure, say N. +config OPROFILE_CELL + bool "OProfile for Cell Broadband Engine" + depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m) + default y + help + Profiling of Cell BE SPUs requires special support enabled + by this option. diff --git a/arch/powerpc/oprofile/Makefile b/arch/powerpc/oprofile/Makefile index 4b5f952..c5f64c3 100644 --- a/arch/powerpc/oprofile/Makefile +++ b/arch/powerpc/oprofile/Makefile @@ -11,7 +11,9 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) common.o backtrace.o -oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o +oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \ + cell/spu_profiler.o cell/vma_map.o \ + cell/spu_task_sync.o oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o op_model_pa6t.o oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o oprofile-$(CONFIG_6xx) += op_model_7450.o diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h new file mode 100644 index 0000000..e5704f0 --- /dev/null +++ b/arch/powerpc/oprofile/cell/pr_util.h @@ -0,0 +1,97 @@ + /* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Author: Maynard Johnson <maynardj@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef PR_UTIL_H +#define PR_UTIL_H + +#include <linux/cpumask.h> +#include <linux/oprofile.h> +#include <asm/cell-pmu.h> +#include <asm/spu.h> + +#include "../../platforms/cell/cbe_regs.h" + +/* Defines used for sync_start */ +#define SKIP_GENERIC_SYNC 0 +#define SYNC_START_ERROR -1 +#define DO_GENERIC_SYNC 1 + +struct spu_overlay_info { /* map of sections within an SPU overlay */ + unsigned int vma; /* SPU virtual memory address from elf */ + unsigned int size; /* size of section from elf */ + unsigned int offset; /* offset of section into elf file */ + unsigned int buf; +}; + +struct vma_to_fileoffset_map { /* map of sections within an SPU program */ + struct vma_to_fileoffset_map *next; /* list pointer */ + unsigned int vma; /* SPU virtual memory address from elf */ + unsigned int size; /* size of section from elf */ + unsigned int offset; /* offset of section into elf file */ + unsigned int guard_ptr; + unsigned int guard_val; + /* + * The guard pointer is an entry in the _ovly_buf_table, + * computed using ovly.buf as the index into the table. Since + * ovly.buf values begin at '1' to reference the first (or 0th) + * entry in the _ovly_buf_table, the computation subtracts 1 + * from ovly.buf. + * The guard value is stored in the _ovly_buf_table entry and + * is an index (starting at 1) back to the _ovly_table entry + * that is pointing at this _ovly_buf_table entry. So, for + * example, for an overlay scenario with one overlay segment + * and two overlay sections: + * - Section 1 points to the first entry of the + * _ovly_buf_table, which contains a guard value + * of '1', referencing the first (index=0) entry of + * _ovly_table. + * - Section 2 points to the second entry of the + * _ovly_buf_table, which contains a guard value + * of '2', referencing the second (index=1) entry of + * _ovly_table. + */ + +}; + +/* The three functions below are for maintaining and accessing + * the vma-to-fileoffset map. + */ +struct vma_to_fileoffset_map *create_vma_map(const struct spu *spu, + u64 objectid); +unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map, + unsigned int vma, const struct spu *aSpu, + int *grd_val); +void vma_map_free(struct vma_to_fileoffset_map *map); + +/* + * Entry point for SPU profiling. + * cycles_reset is the SPU_CYCLES count value specified by the user. + */ +int start_spu_profiling(unsigned int cycles_reset); + +void stop_spu_profiling(void); + + +/* add the necessary profiling hooks */ +int spu_sync_start(void); + +/* remove the hooks */ +int spu_sync_stop(void); + +/* Record SPU program counter samples to the oprofile event buffer. */ +void spu_sync_buffer(int spu_num, unsigned int *samples, + int num_samples); + +void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset); + +#endif /* PR_UTIL_H */ diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c new file mode 100644 index 0000000..380d7e2 --- /dev/null +++ b/arch/powerpc/oprofile/cell/spu_profiler.c @@ -0,0 +1,221 @@ +/* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Authors: Maynard Johnson <maynardj@us.ibm.com> + * Carl Love <carll@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/hrtimer.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <asm/cell-pmu.h> +#include "pr_util.h" + +#define TRACE_ARRAY_SIZE 1024 +#define SCALE_SHIFT 14 + +static u32 *samples; + +static int spu_prof_running; +static unsigned int profiling_interval; + +#define NUM_SPU_BITS_TRBUF 16 +#define SPUS_PER_TB_ENTRY 4 +#define SPUS_PER_NODE 8 + +#define SPU_PC_MASK 0xFFFF + +static DEFINE_SPINLOCK(sample_array_lock); +unsigned long sample_array_lock_flags; + +void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset) +{ + unsigned long ns_per_cyc; + + if (!freq_khz) + freq_khz = ppc_proc_freq/1000; + + /* To calculate a timeout in nanoseconds, the basic + * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency). + * To avoid floating point math, we use the scale math + * technique as described in linux/jiffies.h. We use + * a scale factor of SCALE_SHIFT, which provides 4 decimal places + * of precision. This is close enough for the purpose at hand. + * + * The value of the timeout should be small enough that the hw + * trace buffer will not get more then about 1/3 full for the + * maximum user specified (the LFSR value) hw sampling frequency. + * This is to ensure the trace buffer will never fill even if the + * kernel thread scheduling varies under a heavy system load. + */ + + ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz; + profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT; + +} + +/* + * Extract SPU PC from trace buffer entry + */ +static void spu_pc_extract(int cpu, int entry) +{ + /* the trace buffer is 128 bits */ + u64 trace_buffer[2]; + u64 spu_mask; + int spu; + + spu_mask = SPU_PC_MASK; + + /* Each SPU PC is 16 bits; hence, four spus in each of + * the two 64-bit buffer entries that make up the + * 128-bit trace_buffer entry. Process two 64-bit values + * simultaneously. + * trace[0] SPU PC contents are: 0 1 2 3 + * trace[1] SPU PC contents are: 4 5 6 7 + */ + + cbe_read_trace_buffer(cpu, trace_buffer); + + for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) { + /* spu PC trace entry is upper 16 bits of the + * 18 bit SPU program counter + */ + samples[spu * TRACE_ARRAY_SIZE + entry] + = (spu_mask & trace_buffer[0]) << 2; + samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry] + = (spu_mask & trace_buffer[1]) << 2; + + trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF; + trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF; + } +} + +static int cell_spu_pc_collection(int cpu) +{ + u32 trace_addr; + int entry; + + /* process the collected SPU PC for the node */ + + entry = 0; + + trace_addr = cbe_read_pm(cpu, trace_address); + while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) { + /* there is data in the trace buffer to process */ + spu_pc_extract(cpu, entry); + + entry++; + + if (entry >= TRACE_ARRAY_SIZE) + /* spu_samples is full */ + break; + + trace_addr = cbe_read_pm(cpu, trace_address); + } + + return entry; +} + + +static enum hrtimer_restart profile_spus(struct hrtimer *timer) +{ + ktime_t kt; + int cpu, node, k, num_samples, spu_num; + + if (!spu_prof_running) + goto stop; + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + node = cbe_cpu_to_node(cpu); + + /* There should only be one kernel thread at a time processing + * the samples. In the very unlikely case that the processing + * is taking a very long time and multiple kernel threads are + * started to process the samples. Make sure only one kernel + * thread is working on the samples array at a time. The + * sample array must be loaded and then processed for a given + * cpu. The sample array is not per cpu. + */ + spin_lock_irqsave(&sample_array_lock, + sample_array_lock_flags); + num_samples = cell_spu_pc_collection(cpu); + + if (num_samples == 0) { + spin_unlock_irqrestore(&sample_array_lock, + sample_array_lock_flags); + continue; + } + + for (k = 0; k < SPUS_PER_NODE; k++) { + spu_num = k + (node * SPUS_PER_NODE); + spu_sync_buffer(spu_num, + samples + (k * TRACE_ARRAY_SIZE), + num_samples); + } + + spin_unlock_irqrestore(&sample_array_lock, + sample_array_lock_flags); + + } + smp_wmb(); /* insure spu event buffer updates are written */ + /* don't want events intermingled... */ + + kt = ktime_set(0, profiling_interval); + if (!spu_prof_running) + goto stop; + hrtimer_forward(timer, timer->base->get_time(), kt); + return HRTIMER_RESTART; + + stop: + printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n"); + return HRTIMER_NORESTART; +} + +static struct hrtimer timer; +/* + * Entry point for SPU profiling. + * NOTE: SPU profiling is done system-wide, not per-CPU. + * + * cycles_reset is the count value specified by the user when + * setting up OProfile to count SPU_CYCLES. + */ +int start_spu_profiling(unsigned int cycles_reset) +{ + ktime_t kt; + + pr_debug("timer resolution: %lu\n", TICK_NSEC); + kt = ktime_set(0, profiling_interval); + hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer.expires = kt; + timer.function = profile_spus; + + /* Allocate arrays for collecting SPU PC samples */ + samples = kzalloc(SPUS_PER_NODE * + TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL); + + if (!samples) + return -ENOMEM; + + spu_prof_running = 1; + hrtimer_start(&timer, kt, HRTIMER_MODE_REL); + + return 0; +} + +void stop_spu_profiling(void) +{ + spu_prof_running = 0; + hrtimer_cancel(&timer); + kfree(samples); + pr_debug("SPU_PROF: stop_spu_profiling issued\n"); +} diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c new file mode 100644 index 0000000..1336657 --- /dev/null +++ b/arch/powerpc/oprofile/cell/spu_task_sync.c @@ -0,0 +1,484 @@ +/* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Author: Maynard Johnson <maynardj@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* The purpose of this file is to handle SPU event task switching + * and to record SPU context information into the OProfile + * event buffer. + * + * Additionally, the spu_sync_buffer function is provided as a helper + * for recoding actual SPU program counter samples to the event buffer. + */ +#include <linux/dcookies.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/numa.h> +#include <linux/oprofile.h> +#include <linux/spinlock.h> +#include "pr_util.h" + +#define RELEASE_ALL 9999 + +static DEFINE_SPINLOCK(buffer_lock); +static DEFINE_SPINLOCK(cache_lock); +static int num_spu_nodes; +int spu_prof_num_nodes; +int last_guard_val[MAX_NUMNODES * 8]; + +/* Container for caching information about an active SPU task. */ +struct cached_info { + struct vma_to_fileoffset_map *map; + struct spu *the_spu; /* needed to access pointer to local_store */ + struct kref cache_ref; +}; + +static struct cached_info *spu_info[MAX_NUMNODES * 8]; + +static void destroy_cached_info(struct kref *kref) +{ + struct cached_info *info; + + info = container_of(kref, struct cached_info, cache_ref); + vma_map_free(info->map); + kfree(info); + module_put(THIS_MODULE); +} + +/* Return the cached_info for the passed SPU number. + * ATTENTION: Callers are responsible for obtaining the + * cache_lock if needed prior to invoking this function. + */ +static struct cached_info *get_cached_info(struct spu *the_spu, int spu_num) +{ + struct kref *ref; + struct cached_info *ret_info; + + if (spu_num >= num_spu_nodes) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Invalid index %d into spu info cache\n", + __FUNCTION__, __LINE__, spu_num); + ret_info = NULL; + goto out; + } + if (!spu_info[spu_num] && the_spu) { + ref = spu_get_profile_private_kref(the_spu->ctx); + if (ref) { + spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref); + kref_get(&spu_info[spu_num]->cache_ref); + } + } + + ret_info = spu_info[spu_num]; + out: + return ret_info; +} + + +/* Looks for cached info for the passed spu. If not found, the + * cached info is created for the passed spu. + * Returns 0 for success; otherwise, -1 for error. + */ +static int +prepare_cached_spu_info(struct spu *spu, unsigned long objectId) +{ + unsigned long flags; + struct vma_to_fileoffset_map *new_map; + int retval = 0; + struct cached_info *info; + + /* We won't bother getting cache_lock here since + * don't do anything with the cached_info that's returned. + */ + info = get_cached_info(spu, spu->number); + + if (info) { + pr_debug("Found cached SPU info.\n"); + goto out; + } + + /* Create cached_info and set spu_info[spu->number] to point to it. + * spu->number is a system-wide value, not a per-node value. + */ + info = kzalloc(sizeof(struct cached_info), GFP_KERNEL); + if (!info) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: create vma_map failed\n", + __FUNCTION__, __LINE__); + retval = -ENOMEM; + goto err_alloc; + } + new_map = create_vma_map(spu, objectId); + if (!new_map) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: create vma_map failed\n", + __FUNCTION__, __LINE__); + retval = -ENOMEM; + goto err_alloc; + } + + pr_debug("Created vma_map\n"); + info->map = new_map; + info->the_spu = spu; + kref_init(&info->cache_ref); + spin_lock_irqsave(&cache_lock, flags); + spu_info[spu->number] = info; + /* Increment count before passing off ref to SPUFS. */ + kref_get(&info->cache_ref); + + /* We increment the module refcount here since SPUFS is + * responsible for the final destruction of the cached_info, + * and it must be able to access the destroy_cached_info() + * function defined in the OProfile module. We decrement + * the module refcount in destroy_cached_info. + */ + try_module_get(THIS_MODULE); + spu_set_profile_private_kref(spu->ctx, &info->cache_ref, + destroy_cached_info); + spin_unlock_irqrestore(&cache_lock, flags); + goto out; + +err_alloc: + kfree(info); +out: + return retval; +} + +/* + * NOTE: The caller is responsible for locking the + * cache_lock prior to calling this function. + */ +static int release_cached_info(int spu_index) +{ + int index, end; + + if (spu_index == RELEASE_ALL) { + end = num_spu_nodes; + index = 0; + } else { + if (spu_index >= num_spu_nodes) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: " + "Invalid index %d into spu info cache\n", + __FUNCTION__, __LINE__, spu_index); + goto out; + } + end = spu_index + 1; + index = spu_index; + } + for (; index < end; index++) { + if (spu_info[index]) { + kref_put(&spu_info[index]->cache_ref, + destroy_cached_info); + spu_info[index] = NULL; + } + } + +out: + return 0; +} + +/* The source code for fast_get_dcookie was "borrowed" + * from drivers/oprofile/buffer_sync.c. + */ + +/* Optimisation. We can manage without taking the dcookie sem + * because we cannot reach this code without at least one + * dcookie user still being registered (namely, the reader + * of the event buffer). + */ +static inline unsigned long fast_get_dcookie(struct dentry *dentry, + struct vfsmount *vfsmnt) +{ + unsigned long cookie; + + if (dentry->d_cookie) + return (unsigned long)dentry; + get_dcookie(dentry, vfsmnt, &cookie); + return cookie; +} + +/* Look up the dcookie for the task's first VM_EXECUTABLE mapping, + * which corresponds loosely to "application name". Also, determine + * the offset for the SPU ELF object. If computed offset is + * non-zero, it implies an embedded SPU object; otherwise, it's a + * separate SPU binary, in which case we retrieve it's dcookie. + * For the embedded case, we must determine if SPU ELF is embedded + * in the executable application or another file (i.e., shared lib). + * If embedded in a shared lib, we must get the dcookie and return + * that to the caller. + */ +static unsigned long +get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp, + unsigned long *spu_bin_dcookie, + unsigned long spu_ref) +{ + unsigned long app_cookie = 0; + unsigned int my_offset = 0; + struct file *app = NULL; + struct vm_area_struct *vma; + struct mm_struct *mm = spu->mm; + + if (!mm) + goto out; + + down_read(&mm->mmap_sem); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (!(vma->vm_flags & VM_EXECUTABLE)) + continue; + app_cookie = fast_get_dcookie(vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + pr_debug("got dcookie for %s\n", + vma->vm_file->f_dentry->d_name.name); + app = vma->vm_file; + break; + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref) + continue; + my_offset = spu_ref - vma->vm_start; + if (!vma->vm_file) + goto fail_no_image_cookie; + + pr_debug("Found spu ELF at %X(object-id:%lx) for file %s\n", + my_offset, spu_ref, + vma->vm_file->f_dentry->d_name.name); + *offsetp = my_offset; + break; + } + + *spu_bin_dcookie = fast_get_dcookie(vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name); + + up_read(&mm->mmap_sem); + +out: + return app_cookie; + +fail_no_image_cookie: + up_read(&mm->mmap_sem); + + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Cannot find dcookie for SPU binary\n", + __FUNCTION__, __LINE__); + goto out; +} + + + +/* This function finds or creates cached context information for the + * passed SPU and records SPU context information into the OProfile + * event buffer. + */ +static int process_context_switch(struct spu *spu, unsigned long objectId) +{ + unsigned long flags; + int retval; + unsigned int offset = 0; + unsigned long spu_cookie = 0, app_dcookie; + + retval = prepare_cached_spu_info(spu, objectId); + if (retval) + goto out; + + /* Get dcookie first because a mutex_lock is taken in that + * code path, so interrupts must not be disabled. + */ + app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId); + if (!app_dcookie || !spu_cookie) { + retval = -ENOENT; + goto out; + } + + /* Record context info in event buffer */ + spin_lock_irqsave(&buffer_lock, flags); + add_event_entry(ESCAPE_CODE); + add_event_entry(SPU_CTX_SWITCH_CODE); + add_event_entry(spu->number); + add_event_entry(spu->pid); + add_event_entry(spu->tgid); + add_event_entry(app_dcookie); + add_event_entry(spu_cookie); + add_event_entry(offset); + spin_unlock_irqrestore(&buffer_lock, flags); + smp_wmb(); /* insure spu event buffer updates are written */ + /* don't want entries intermingled... */ +out: + return retval; +} + +/* + * This function is invoked on either a bind_context or unbind_context. + * If called for an unbind_context, the val arg is 0; otherwise, + * it is the object-id value for the spu context. + * The data arg is of type 'struct spu *'. + */ +static int spu_active_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + int retval; + unsigned long flags; + struct spu *the_spu = data; + + pr_debug("SPU event notification arrived\n"); + if (!val) { + spin_lock_irqsave(&cache_lock, flags); + retval = release_cached_info(the_spu->number); + spin_unlock_irqrestore(&cache_lock, flags); + } else { + retval = process_context_switch(the_spu, val); + } + return retval; +} + +static struct notifier_block spu_active = { + .notifier_call = spu_active_notify, +}; + +static int number_of_online_nodes(void) +{ + u32 cpu; u32 tmp; + int nodes = 0; + for_each_online_cpu(cpu) { + tmp = cbe_cpu_to_node(cpu) + 1; + if (tmp > nodes) + nodes++; + } + return nodes; +} + +/* The main purpose of this function is to synchronize + * OProfile with SPUFS by registering to be notified of + * SPU task switches. + * + * NOTE: When profiling SPUs, we must ensure that only + * spu_sync_start is invoked and not the generic sync_start + * in drivers/oprofile/oprof.c. A return value of + * SKIP_GENERIC_SYNC or SYNC_START_ERROR will + * accomplish this. + */ +int spu_sync_start(void) +{ + int k; + int ret = SKIP_GENERIC_SYNC; + int register_ret; + unsigned long flags = 0; + + spu_prof_num_nodes = number_of_online_nodes(); + num_spu_nodes = spu_prof_num_nodes * 8; + + spin_lock_irqsave(&buffer_lock, flags); + add_event_entry(ESCAPE_CODE); + add_event_entry(SPU_PROFILING_CODE); + add_event_entry(num_spu_nodes); + spin_unlock_irqrestore(&buffer_lock, flags); + + /* Register for SPU events */ + register_ret = spu_switch_event_register(&spu_active); + if (register_ret) { + ret = SYNC_START_ERROR; + goto out; + } + + for (k = 0; k < (MAX_NUMNODES * 8); k++) + last_guard_val[k] = 0; + pr_debug("spu_sync_start -- running.\n"); +out: + return ret; +} + +/* Record SPU program counter samples to the oprofile event buffer. */ +void spu_sync_buffer(int spu_num, unsigned int *samples, + int num_samples) +{ + unsigned long long file_offset; + unsigned long flags; + int i; + struct vma_to_fileoffset_map *map; + struct spu *the_spu; + unsigned long long spu_num_ll = spu_num; + unsigned long long spu_num_shifted = spu_num_ll << 32; + struct cached_info *c_info; + + /* We need to obtain the cache_lock here because it's + * possible that after getting the cached_info, the SPU job + * corresponding to this cached_info may end, thus resulting + * in the destruction of the cached_info. + */ + spin_lock_irqsave(&cache_lock, flags); + c_info = get_cached_info(NULL, spu_num); + if (!c_info) { + /* This legitimately happens when the SPU task ends before all + * samples are recorded. + * No big deal -- so we just drop a few samples. + */ + pr_debug("SPU_PROF: No cached SPU contex " + "for SPU #%d. Dropping samples.\n", spu_num); + goto out; + } + + map = c_info->map; + the_spu = c_info->the_spu; + spin_lock(&buffer_lock); + for (i = 0; i < num_samples; i++) { + unsigned int sample = *(samples+i); + int grd_val = 0; + file_offset = 0; + if (sample == 0) + continue; + file_offset = vma_map_lookup( map, sample, the_spu, &grd_val); + + /* If overlays are used by this SPU application, the guard + * value is non-zero, indicating which overlay section is in + * use. We need to discard samples taken during the time + * period which an overlay occurs (i.e., guard value changes). + */ + if (grd_val && grd_val != last_guard_val[spu_num]) { + last_guard_val[spu_num] = grd_val; + /* Drop the rest of the samples. */ + break; + } + + add_event_entry(file_offset | spu_num_shifted); + } + spin_unlock(&buffer_lock); +out: + spin_unlock_irqrestore(&cache_lock, flags); +} + + +int spu_sync_stop(void) +{ + unsigned long flags = 0; + int ret = spu_switch_event_unregister(&spu_active); + if (ret) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: spu_switch_event_unregister returned %d\n", + __FUNCTION__, __LINE__, ret); + goto out; + } + + spin_lock_irqsave(&cache_lock, flags); + ret = release_cached_info(RELEASE_ALL); + spin_unlock_irqrestore(&cache_lock, flags); +out: + pr_debug("spu_sync_stop -- done.\n"); + return ret; +} + + diff --git a/arch/powerpc/oprofile/cell/vma_map.c b/arch/powerpc/oprofile/cell/vma_map.c new file mode 100644 index 0000000..76ec1d1 --- /dev/null +++ b/arch/powerpc/oprofile/cell/vma_map.c @@ -0,0 +1,287 @@ +/* + * Cell Broadband Engine OProfile Support + * + * (C) Copyright IBM Corporation 2006 + * + * Author: Maynard Johnson <maynardj@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* The code in this source file is responsible for generating + * vma-to-fileOffset maps for both overlay and non-overlay SPU + * applications. + */ + +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/elf.h> +#include "pr_util.h" + + +void vma_map_free(struct vma_to_fileoffset_map *map) +{ + while (map) { + struct vma_to_fileoffset_map *next = map->next; + kfree(map); + map = next; + } +} + +unsigned int +vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma, + const struct spu *aSpu, int *grd_val) +{ + /* + * Default the offset to the physical address + a flag value. + * Addresses of dynamically generated code can't be found in the vma + * map. For those addresses the flagged value will be sent on to + * the user space tools so they can be reported rather than just + * thrown away. + */ + u32 offset = 0x10000000 + vma; + u32 ovly_grd; + + for (; map; map = map->next) { + if (vma < map->vma || vma >= map->vma + map->size) + continue; + + if (map->guard_ptr) { + ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr); + if (ovly_grd != map->guard_val) + continue; + *grd_val = ovly_grd; + } + offset = vma - map->vma + map->offset; + break; + } + + return offset; +} + +static struct vma_to_fileoffset_map * +vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma, + unsigned int size, unsigned int offset, unsigned int guard_ptr, + unsigned int guard_val) +{ + struct vma_to_fileoffset_map *new = + kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL); + if (!new) { + printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n", + __FUNCTION__, __LINE__); + vma_map_free(map); + return NULL; + } + + new->next = map; + new->vma = vma; + new->size = size; + new->offset = offset; + new->guard_ptr = guard_ptr; + new->guard_val = guard_val; + + return new; +} + + +/* Parse SPE ELF header and generate a list of vma_maps. + * A pointer to the first vma_map in the generated list + * of vma_maps is returned. */ +struct vma_to_fileoffset_map *create_vma_map(const struct spu *aSpu, + unsigned long spu_elf_start) +{ + static const unsigned char expected[EI_PAD] = { + [EI_MAG0] = ELFMAG0, + [EI_MAG1] = ELFMAG1, + [EI_MAG2] = ELFMAG2, + [EI_MAG3] = ELFMAG3, + [EI_CLASS] = ELFCLASS32, + [EI_DATA] = ELFDATA2MSB, + [EI_VERSION] = EV_CURRENT, + [EI_OSABI] = ELFOSABI_NONE + }; + + int grd_val; + struct vma_to_fileoffset_map *map = NULL; + struct spu_overlay_info ovly; + unsigned int overlay_tbl_offset = -1; + unsigned long phdr_start, shdr_start; + Elf32_Ehdr ehdr; + Elf32_Phdr phdr; + Elf32_Shdr shdr, shdr_str; + Elf32_Sym sym; + int i, j; + char name[32]; + + unsigned int ovly_table_sym = 0; + unsigned int ovly_buf_table_sym = 0; + unsigned int ovly_table_end_sym = 0; + unsigned int ovly_buf_table_end_sym = 0; + unsigned long ovly_table; + unsigned int n_ovlys; + + /* Get and validate ELF header. */ + + if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr))) + goto fail; + + if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Unexpected e_ident parsing SPU ELF\n", + __FUNCTION__, __LINE__); + goto fail; + } + if (ehdr.e_machine != EM_SPU) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Unexpected e_machine parsing SPU ELF\n", + __FUNCTION__, __LINE__); + goto fail; + } + if (ehdr.e_type != ET_EXEC) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Unexpected e_type parsing SPU ELF\n", + __FUNCTION__, __LINE__); + goto fail; + } + phdr_start = spu_elf_start + ehdr.e_phoff; + shdr_start = spu_elf_start + ehdr.e_shoff; + + /* Traverse program headers. */ + for (i = 0; i < ehdr.e_phnum; i++) { + if (copy_from_user(&phdr, + (void *) (phdr_start + i * sizeof(phdr)), + sizeof(phdr))) + goto fail; + + if (phdr.p_type != PT_LOAD) + continue; + if (phdr.p_flags & (1 << 27)) + continue; + + map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz, + phdr.p_offset, 0, 0); + if (!map) + goto fail; + } + + pr_debug("SPU_PROF: Created non-overlay maps\n"); + /* Traverse section table and search for overlay-related symbols. */ + for (i = 0; i < ehdr.e_shnum; i++) { + if (copy_from_user(&shdr, + (void *) (shdr_start + i * sizeof(shdr)), + sizeof(shdr))) + goto fail; + + if (shdr.sh_type != SHT_SYMTAB) + continue; + if (shdr.sh_entsize != sizeof (sym)) + continue; + + if (copy_from_user(&shdr_str, + (void *) (shdr_start + shdr.sh_link * + sizeof(shdr)), + sizeof(shdr))) + goto fail; + + if (shdr_str.sh_type != SHT_STRTAB) + goto fail;; + + for (j = 0; j < shdr.sh_size / sizeof (sym); j++) { + if (copy_from_user(&sym, (void *) (spu_elf_start + + shdr.sh_offset + j * + sizeof (sym)), + sizeof (sym))) + goto fail; + + if (copy_from_user(name, (void *) + (spu_elf_start + shdr_str.sh_offset + + sym.st_name), + 20)) + goto fail; + + if (memcmp(name, "_ovly_table", 12) == 0) + ovly_table_sym = sym.st_value; + if (memcmp(name, "_ovly_buf_table", 16) == 0) + ovly_buf_table_sym = sym.st_value; + if (memcmp(name, "_ovly_table_end", 16) == 0) + ovly_table_end_sym = sym.st_value; + if (memcmp(name, "_ovly_buf_table_end", 20) == 0) + ovly_buf_table_end_sym = sym.st_value; + } + } + + /* If we don't have overlays, we're done. */ + if (ovly_table_sym == 0 || ovly_buf_table_sym == 0 + || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) { + pr_debug("SPU_PROF: No overlay table found\n"); + goto out; + } else { + pr_debug("SPU_PROF: Overlay table found\n"); + } + + /* The _ovly_table symbol represents a table with one entry + * per overlay section. The _ovly_buf_table symbol represents + * a table with one entry per overlay region. + * The struct spu_overlay_info gives the structure of the _ovly_table + * entries. The structure of _ovly_table_buf is simply one + * u32 word per entry. + */ + overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym, + aSpu, &grd_val); + if (overlay_tbl_offset < 0) { + printk(KERN_ERR "SPU_PROF: " + "%s, line %d: Error finding SPU overlay table\n", + __FUNCTION__, __LINE__); + goto fail; + } + ovly_table = spu_elf_start + overlay_tbl_offset; + + n_ovlys = (ovly_table_end_sym - + ovly_table_sym) / sizeof (ovly); + + /* Traverse overlay table. */ + for (i = 0; i < n_ovlys; i++) { + if (copy_from_user(&ovly, (void *) + (ovly_table + i * sizeof (ovly)), + sizeof (ovly))) + goto fail; + + /* The ovly.vma/size/offset arguments are analogous to the same + * arguments used above for non-overlay maps. The final two + * args are referred to as the guard pointer and the guard + * value. + * The guard pointer is an entry in the _ovly_buf_table, + * computed using ovly.buf as the index into the table. Since + * ovly.buf values begin at '1' to reference the first (or 0th) + * entry in the _ovly_buf_table, the computation subtracts 1 + * from ovly.buf. + * The guard value is stored in the _ovly_buf_table entry and + * is an index (starting at 1) back to the _ovly_table entry + * that is pointing at this _ovly_buf_table entry. So, for + * example, for an overlay scenario with one overlay segment + * and two overlay sections: + * - Section 1 points to the first entry of the + * _ovly_buf_table, which contains a guard value + * of '1', referencing the first (index=0) entry of + * _ovly_table. + * - Section 2 points to the second entry of the + * _ovly_buf_table, which contains a guard value + * of '2', referencing the second (index=1) entry of + * _ovly_table. + */ + map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset, + ovly_buf_table_sym + (ovly.buf-1) * 4, i+1); + if (!map) + goto fail; + } + goto out; + + fail: + map = NULL; + out: + return map; +} diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 1a7ef7e..a28cce1 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -29,6 +29,8 @@ static struct op_powerpc_model *model; static struct op_counter_config ctr[OP_MAX_COUNTER]; static struct op_system_config sys; +static int op_per_cpu_rc; + static void op_handle_interrupt(struct pt_regs *regs) { model->handle_interrupt(regs, ctr); @@ -36,25 +38,41 @@ static void op_handle_interrupt(struct pt_regs *regs) static void op_powerpc_cpu_setup(void *dummy) { - model->cpu_setup(ctr); + int ret; + + ret = model->cpu_setup(ctr); + + if (ret != 0) + op_per_cpu_rc = ret; } static int op_powerpc_setup(void) { int err; + op_per_cpu_rc = 0; + /* Grab the hardware */ err = reserve_pmc_hardware(op_handle_interrupt); if (err) return err; /* Pre-compute the values to stuff in the hardware registers. */ - model->reg_setup(ctr, &sys, model->num_counters); + op_per_cpu_rc = model->reg_setup(ctr, &sys, model->num_counters); - /* Configure the registers on all cpus. */ + if (op_per_cpu_rc) + goto out; + + /* Configure the registers on all cpus. If an error occurs on one + * of the cpus, op_per_cpu_rc will be set to the error */ on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1); - return 0; +out: if (op_per_cpu_rc) { + /* error on setup release the performance counter hardware */ + release_pmc_hardware(); + } + + return op_per_cpu_rc; } static void op_powerpc_shutdown(void) @@ -64,16 +82,29 @@ static void op_powerpc_shutdown(void) static void op_powerpc_cpu_start(void *dummy) { - model->start(ctr); + /* If any of the cpus have return an error, set the + * global flag to the error so it can be returned + * to the generic OProfile caller. + */ + int ret; + + ret = model->start(ctr); + if (ret != 0) + op_per_cpu_rc = ret; } static int op_powerpc_start(void) { + op_per_cpu_rc = 0; + if (model->global_start) - model->global_start(ctr); - if (model->start) + return model->global_start(ctr); + if (model->start) { on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1); - return 0; + return op_per_cpu_rc; + } + return -EIO; /* No start function is defined for this + power architecture */ } static inline void op_powerpc_cpu_stop(void *dummy) @@ -147,11 +178,13 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) switch (cur_cpu_spec->oprofile_type) { #ifdef CONFIG_PPC64 -#ifdef CONFIG_PPC_CELL_NATIVE +#ifdef CONFIG_OPROFILE_CELL case PPC_OPROFILE_CELL: if (firmware_has_feature(FW_FEATURE_LPAR)) return -ENODEV; model = &op_model_cell; + ops->sync_start = model->sync_start; + ops->sync_stop = model->sync_stop; break; #endif case PPC_OPROFILE_RS64: diff --git a/arch/powerpc/oprofile/op_model_7450.c b/arch/powerpc/oprofile/op_model_7450.c index 5d1bbaf..cc599eb 100644 --- a/arch/powerpc/oprofile/op_model_7450.c +++ b/arch/powerpc/oprofile/op_model_7450.c @@ -81,7 +81,7 @@ static void pmc_stop_ctrs(void) /* Configures the counters on this CPU based on the global * settings */ -static void fsl7450_cpu_setup(struct op_counter_config *ctr) +static int fsl7450_cpu_setup(struct op_counter_config *ctr) { /* freeze all counters */ pmc_stop_ctrs(); @@ -89,12 +89,14 @@ static void fsl7450_cpu_setup(struct op_counter_config *ctr) mtspr(SPRN_MMCR0, mmcr0_val); mtspr(SPRN_MMCR1, mmcr1_val); mtspr(SPRN_MMCR2, mmcr2_val); + + return 0; } #define NUM_CTRS 6 /* Configures the global settings for the countes on all CPUs. */ -static void fsl7450_reg_setup(struct op_counter_config *ctr, +static int fsl7450_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { @@ -126,10 +128,12 @@ static void fsl7450_reg_setup(struct op_counter_config *ctr, | mmcr1_event6(ctr[5].event); mmcr2_val = 0; + + return 0; } /* Sets the counters on this CPU to the chosen values, and starts them */ -static void fsl7450_start(struct op_counter_config *ctr) +static int fsl7450_start(struct op_counter_config *ctr) { int i; @@ -148,6 +152,8 @@ static void fsl7450_start(struct op_counter_config *ctr) pmc_start_ctrs(); oprofile_running = 1; + + return 0; } /* Stop the counters on this CPU */ @@ -193,7 +199,7 @@ static void fsl7450_handle_interrupt(struct pt_regs *regs, /* The freeze bit was set by the interrupt. */ /* Clear the freeze bit, and reenable the interrupt. * The counters won't actually start until the rfi clears - * the PMM bit */ + * the PM/M bit */ pmc_start_ctrs(); } diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index c29293b..d928b54 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -5,8 +5,8 @@ * * Author: David Erb (djerb@us.ibm.com) * Modifications: - * Carl Love <carll@us.ibm.com> - * Maynard Johnson <maynardj@us.ibm.com> + * Carl Love <carll@us.ibm.com> + * Maynard Johnson <maynardj@us.ibm.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -38,12 +38,25 @@ #include "../platforms/cell/interrupt.h" #include "../platforms/cell/cbe_regs.h" +#include "cell/pr_util.h" + +static void cell_global_stop_spu(void); + +/* + * spu_cycle_reset is the number of cycles between samples. + * This variable is used for SPU profiling and should ONLY be set + * at the beginning of cell_reg_setup; otherwise, it's read-only. + */ +static unsigned int spu_cycle_reset; + +#define NUM_SPUS_PER_NODE 8 +#define SPU_CYCLES_EVENT_NUM 2 /* event number for SPU_CYCLES */ #define PPU_CYCLES_EVENT_NUM 1 /* event number for CYCLES */ -#define PPU_CYCLES_GRP_NUM 1 /* special group number for identifying - * PPU_CYCLES event - */ -#define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */ +#define PPU_CYCLES_GRP_NUM 1 /* special group number for identifying + * PPU_CYCLES event + */ +#define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */ #define NUM_THREADS 2 /* number of physical threads in * physical processor @@ -51,6 +64,7 @@ #define NUM_TRACE_BUS_WORDS 4 #define NUM_INPUT_BUS_WORDS 2 +#define MAX_SPU_COUNT 0xFFFFFF /* maximum 24 bit LFSR value */ struct pmc_cntrl_data { unsigned long vcntr; @@ -62,11 +76,10 @@ struct pmc_cntrl_data { /* * ibm,cbe-perftools rtas parameters */ - struct pm_signal { u16 cpu; /* Processor to modify */ - u16 sub_unit; /* hw subunit this applies to (if applicable) */ - short int signal_group; /* Signal Group to Enable/Disable */ + u16 sub_unit; /* hw subunit this applies to (if applicable)*/ + short int signal_group; /* Signal Group to Enable/Disable */ u8 bus_word; /* Enable/Disable on this Trace/Trigger/Event * Bus Word(s) (bitmask) */ @@ -112,21 +125,42 @@ static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values); static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS]; -/* Interpetation of hdw_thread: +/* + * The CELL profiling code makes rtas calls to setup the debug bus to + * route the performance signals. Additionally, SPU profiling requires + * a second rtas call to setup the hardware to capture the SPU PCs. + * The EIO error value is returned if the token lookups or the rtas + * call fail. The EIO error number is the best choice of the existing + * error numbers. The probability of rtas related error is very low. But + * by returning EIO and printing additional information to dmsg the user + * will know that OProfile did not start and dmesg will tell them why. + * OProfile does not support returning errors on Stop. Not a huge issue + * since failure to reset the debug bus or stop the SPU PC collection is + * not a fatel issue. Chances are if the Stop failed, Start doesn't work + * either. + */ + +/* + * Interpetation of hdw_thread: * 0 - even virtual cpus 0, 2, 4,... * 1 - odd virtual cpus 1, 3, 5, ... + * + * FIXME: this is strictly wrong, we need to clean this up in a number + * of places. It works for now. -arnd */ static u32 hdw_thread; static u32 virt_cntr_inter_mask; static struct timer_list timer_virt_cntr; -/* pm_signal needs to be global since it is initialized in +/* + * pm_signal needs to be global since it is initialized in * cell_reg_setup at the time when the necessary information * is available. */ static struct pm_signal pm_signal[NR_PHYS_CTRS]; -static int pm_rtas_token; +static int pm_rtas_token; /* token for debug bus setup call */ +static int spu_rtas_token; /* token for SPU cycle profiling */ static u32 reset_value[NR_PHYS_CTRS]; static int num_counters; @@ -147,8 +181,8 @@ rtas_ibm_cbe_perftools(int subfunc, int passthru, { u64 paddr = __pa(address); - return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru, - paddr >> 32, paddr & 0xffffffff, length); + return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, + passthru, paddr >> 32, paddr & 0xffffffff, length); } static void pm_rtas_reset_signals(u32 node) @@ -156,12 +190,13 @@ static void pm_rtas_reset_signals(u32 node) int ret; struct pm_signal pm_signal_local; - /* The debug bus is being set to the passthru disable state. - * However, the FW still expects atleast one legal signal routing - * entry or it will return an error on the arguments. If we don't - * supply a valid entry, we must ignore all return values. Ignoring - * all return values means we might miss an error we should be - * concerned about. + /* + * The debug bus is being set to the passthru disable state. + * However, the FW still expects atleast one legal signal routing + * entry or it will return an error on the arguments. If we don't + * supply a valid entry, we must ignore all return values. Ignoring + * all return values means we might miss an error we should be + * concerned about. */ /* fw expects physical cpu #. */ @@ -175,18 +210,24 @@ static void pm_rtas_reset_signals(u32 node) &pm_signal_local, sizeof(struct pm_signal)); - if (ret) + if (unlikely(ret)) + /* + * Not a fatal error. For Oprofile stop, the oprofile + * functions do not support returning an error for + * failure to stop OProfile. + */ printk(KERN_WARNING "%s: rtas returned: %d\n", __FUNCTION__, ret); } -static void pm_rtas_activate_signals(u32 node, u32 count) +static int pm_rtas_activate_signals(u32 node, u32 count) { int ret; int i, j; struct pm_signal pm_signal_local[NR_PHYS_CTRS]; - /* There is no debug setup required for the cycles event. + /* + * There is no debug setup required for the cycles event. * Note that only events in the same group can be used. * Otherwise, there will be conflicts in correctly routing * the signals on the debug bus. It is the responsiblity @@ -213,10 +254,14 @@ static void pm_rtas_activate_signals(u32 node, u32 count) pm_signal_local, i * sizeof(struct pm_signal)); - if (ret) + if (unlikely(ret)) { printk(KERN_WARNING "%s: rtas returned: %d\n", __FUNCTION__, ret); + return -EIO; + } } + + return 0; } /* @@ -260,11 +305,12 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask) pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity); pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control); - /* Some of the islands signal selection is based on 64 bit words. + /* + * Some of the islands signal selection is based on 64 bit words. * The debug bus words are 32 bits, the input words to the performance * counters are defined as 32 bits. Need to convert the 64 bit island * specification to the appropriate 32 input bit and bus word for the - * performance counter event selection. See the CELL Performance + * performance counter event selection. See the CELL Performance * monitoring signals manual and the Perf cntr hardware descriptions * for the details. */ @@ -298,6 +344,7 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask) input_bus[j] = i; pm_regs.group_control |= (i << (31 - i)); + break; } } @@ -309,7 +356,8 @@ out: static void write_pm_cntrl(int cpu) { - /* Oprofile will use 32 bit counters, set bits 7:10 to 0 + /* + * Oprofile will use 32 bit counters, set bits 7:10 to 0 * pmregs.pm_cntrl is a global */ @@ -326,7 +374,8 @@ static void write_pm_cntrl(int cpu) if (pm_regs.pm_cntrl.freeze == 1) val |= CBE_PM_FREEZE_ALL_CTRS; - /* Routine set_count_mode must be called previously to set + /* + * Routine set_count_mode must be called previously to set * the count mode based on the user selection of user and kernel. */ val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode); @@ -336,7 +385,8 @@ static void write_pm_cntrl(int cpu) static inline void set_count_mode(u32 kernel, u32 user) { - /* The user must specify user and kernel if they want them. If + /* + * The user must specify user and kernel if they want them. If * neither is specified, OProfile will count in hypervisor mode. * pm_regs.pm_cntrl is a global */ @@ -364,7 +414,7 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl) /* * Oprofile is expected to collect data on all CPUs simultaneously. - * However, there is one set of performance counters per node. There are + * However, there is one set of performance counters per node. There are * two hardware threads or virtual CPUs on each node. Hence, OProfile must * multiplex in time the performance counter collection on the two virtual * CPUs. The multiplexing of the performance counters is done by this @@ -377,19 +427,19 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl) * pair of per-cpu arrays is used for storing the previous and next * pmc values for a given node. * NOTE: We use the per-cpu variable to improve cache performance. + * + * This routine will alternate loading the virtual counters for + * virtual CPUs */ static void cell_virtual_cntr(unsigned long data) { - /* This routine will alternate loading the virtual counters for - * virtual CPUs - */ int i, prev_hdw_thread, next_hdw_thread; u32 cpu; unsigned long flags; - /* Make sure that the interrupt_hander and - * the virt counter are not both playing with - * the counters on the same node. + /* + * Make sure that the interrupt_hander and the virt counter are + * not both playing with the counters on the same node. */ spin_lock_irqsave(&virt_cntr_lock, flags); @@ -400,22 +450,25 @@ static void cell_virtual_cntr(unsigned long data) hdw_thread = 1 ^ hdw_thread; next_hdw_thread = hdw_thread; - for (i = 0; i < num_counters; i++) - /* There are some per thread events. Must do the + /* + * There are some per thread events. Must do the * set event, for the thread that is being started */ + for (i = 0; i < num_counters; i++) set_pm_event(i, pmc_cntrl[next_hdw_thread][i].evnts, pmc_cntrl[next_hdw_thread][i].masks); - /* The following is done only once per each node, but + /* + * The following is done only once per each node, but * we need cpu #, not node #, to pass to the cbe_xxx functions. */ for_each_online_cpu(cpu) { if (cbe_get_hw_thread_id(cpu)) continue; - /* stop counters, save counter values, restore counts + /* + * stop counters, save counter values, restore counts * for previous thread */ cbe_disable_pm(cpu); @@ -428,7 +481,7 @@ static void cell_virtual_cntr(unsigned long data) == 0xFFFFFFFF) /* If the cntr value is 0xffffffff, we must * reset that to 0xfffffff0 when the current - * thread is restarted. This will generate a + * thread is restarted. This will generate a * new interrupt and make sure that we never * restore the counters to the max value. If * the counters were restored to the max value, @@ -444,13 +497,15 @@ static void cell_virtual_cntr(unsigned long data) next_hdw_thread)[i]); } - /* Switch to the other thread. Change the interrupt + /* + * Switch to the other thread. Change the interrupt * and control regs to be scheduled on the CPU * corresponding to the thread to execute. */ for (i = 0; i < num_counters; i++) { if (pmc_cntrl[next_hdw_thread][i].enabled) { - /* There are some per thread events. + /* + * There are some per thread events. * Must do the set event, enable_cntr * for each cpu. */ @@ -482,17 +537,42 @@ static void start_virt_cntrs(void) } /* This function is called once for all cpus combined */ -static void -cell_reg_setup(struct op_counter_config *ctr, - struct op_system_config *sys, int num_ctrs) +static int cell_reg_setup(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) { int i, j, cpu; + spu_cycle_reset = 0; + + if (ctr[0].event == SPU_CYCLES_EVENT_NUM) { + spu_cycle_reset = ctr[0].count; + + /* + * Each node will need to make the rtas call to start + * and stop SPU profiling. Get the token once and store it. + */ + spu_rtas_token = rtas_token("ibm,cbe-spu-perftools"); + + if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-spu-perftools unknown\n", + __FUNCTION__); + return -EIO; + } + } pm_rtas_token = rtas_token("ibm,cbe-perftools"); - if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) { - printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n", + + /* + * For all events excetp PPU CYCLEs, each node will need to make + * the rtas cbe-perftools call to setup and reset the debug bus. + * Make the token lookup call once and store it in the global + * variable pm_rtas_token. + */ + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-perftools unknown\n", __FUNCTION__); - goto out; + return -EIO; } num_counters = num_ctrs; @@ -520,7 +600,8 @@ cell_reg_setup(struct op_counter_config *ctr, per_cpu(pmc_values, j)[i] = 0; } - /* Setup the thread 1 events, map the thread 0 event to the + /* + * Setup the thread 1 events, map the thread 0 event to the * equivalent thread 1 event. */ for (i = 0; i < num_ctrs; ++i) { @@ -544,9 +625,10 @@ cell_reg_setup(struct op_counter_config *ctr, for (i = 0; i < NUM_INPUT_BUS_WORDS; i++) input_bus[i] = 0xff; - /* Our counters count up, and "count" refers to + /* + * Our counters count up, and "count" refers to * how much before the next interrupt, and we interrupt - * on overflow. So we calculate the starting value + * on overflow. So we calculate the starting value * which will give us "count" until overflow. * Then we set the events on the enabled counters. */ @@ -569,28 +651,27 @@ cell_reg_setup(struct op_counter_config *ctr, for (i = 0; i < num_counters; ++i) { per_cpu(pmc_values, cpu)[i] = reset_value[i]; } -out: - ; + + return 0; } + + /* This function is called once for each cpu */ -static void cell_cpu_setup(struct op_counter_config *cntr) +static int cell_cpu_setup(struct op_counter_config *cntr) { u32 cpu = smp_processor_id(); u32 num_enabled = 0; int i; + if (spu_cycle_reset) + return 0; + /* There is one performance monitor per processor chip (i.e. node), * so we only need to perform this function once per node. */ if (cbe_get_hw_thread_id(cpu)) - goto out; - - if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) { - printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n", - __FUNCTION__); - goto out; - } + return 0; /* Stop all counters */ cbe_disable_pm(cpu); @@ -609,16 +690,286 @@ static void cell_cpu_setup(struct op_counter_config *cntr) } } - pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled); + /* + * The pm_rtas_activate_signals will return -EIO if the FW + * call failed. + */ + return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled); +} + +#define ENTRIES 303 +#define MAXLFSR 0xFFFFFF + +/* precomputed table of 24 bit LFSR values */ +static int initial_lfsr[] = { + 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424, + 15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716, + 4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547, + 3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392, + 9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026, + 2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556, + 3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769, + 14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893, + 11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017, + 6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756, + 15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558, + 7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401, + 16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720, + 15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042, + 15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955, + 10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934, + 3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783, + 3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278, + 8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051, + 8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741, + 4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972, + 16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302, + 2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384, + 14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469, + 1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697, + 6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398, + 10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140, + 10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214, + 14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386, + 7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087, + 9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130, + 14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300, + 13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475, + 5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950, + 3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003, + 6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375, + 7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426, + 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607 +}; + +/* + * The hardware uses an LFSR counting sequence to determine when to capture + * the SPU PCs. An LFSR sequence is like a puesdo random number sequence + * where each number occurs once in the sequence but the sequence is not in + * numerical order. The SPU PC capture is done when the LFSR sequence reaches + * the last value in the sequence. Hence the user specified value N + * corresponds to the LFSR number that is N from the end of the sequence. + * + * To avoid the time to compute the LFSR, a lookup table is used. The 24 bit + * LFSR sequence is broken into four ranges. The spacing of the precomputed + * values is adjusted in each range so the error between the user specifed + * number (N) of events between samples and the actual number of events based + * on the precomputed value will be les then about 6.2%. Note, if the user + * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used. + * This is to prevent the loss of samples because the trace buffer is full. + * + * User specified N Step between Index in + * precomputed values precomputed + * table + * 0 to 2^16-1 ---- 0 + * 2^16 to 2^16+2^19-1 2^12 1 to 128 + * 2^16+2^19 to 2^16+2^19+2^22-1 2^15 129 to 256 + * 2^16+2^19+2^22 to 2^24-1 2^18 257 to 302 + * + * + * For example, the LFSR values in the second range are computed for 2^16, + * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indicies + * 1, 2,..., 127, 128. + * + * The 24 bit LFSR value for the nth number in the sequence can be + * calculated using the following code: + * + * #define size 24 + * int calculate_lfsr(int n) + * { + * int i; + * unsigned int newlfsr0; + * unsigned int lfsr = 0xFFFFFF; + * unsigned int howmany = n; + * + * for (i = 2; i < howmany + 2; i++) { + * newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^ + * ((lfsr >> (size - 1 - 1)) & 1) ^ + * (((lfsr >> (size - 1 - 6)) & 1) ^ + * ((lfsr >> (size - 1 - 23)) & 1))); + * + * lfsr >>= 1; + * lfsr = lfsr | (newlfsr0 << (size - 1)); + * } + * return lfsr; + * } + */ + +#define V2_16 (0x1 << 16) +#define V2_19 (0x1 << 19) +#define V2_22 (0x1 << 22) + +static int calculate_lfsr(int n) +{ + /* + * The ranges and steps are in powers of 2 so the calculations + * can be done using shifts rather then divide. + */ + int index; + + if ((n >> 16) == 0) + index = 0; + else if (((n - V2_16) >> 19) == 0) + index = ((n - V2_16) >> 12) + 1; + else if (((n - V2_16 - V2_19) >> 22) == 0) + index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128; + else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0) + index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256; + else + index = ENTRIES-1; + + /* make sure index is valid */ + if ((index > ENTRIES) || (index < 0)) + index = ENTRIES-1; + + return initial_lfsr[index]; +} + +static int pm_rtas_activate_spu_profiling(u32 node) +{ + int ret, i; + struct pm_signal pm_signal_local[NR_PHYS_CTRS]; + + /* + * Set up the rtas call to configure the debug bus to + * route the SPU PCs. Setup the pm_signal for each SPU + */ + for (i = 0; i < NUM_SPUS_PER_NODE; i++) { + pm_signal_local[i].cpu = node; + pm_signal_local[i].signal_group = 41; + /* spu i on word (i/2) */ + pm_signal_local[i].bus_word = 1 << i / 2; + /* spu i */ + pm_signal_local[i].sub_unit = i; + pm_signal_local[i].bit = 63; + } + + ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE, + PASSTHRU_ENABLE, pm_signal_local, + (NUM_SPUS_PER_NODE + * sizeof(struct pm_signal))); + + if (unlikely(ret)) { + printk(KERN_WARNING "%s: rtas returned: %d\n", + __FUNCTION__, ret); + return -EIO; + } + + return 0; +} + +#ifdef CONFIG_CPU_FREQ +static int +oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data) +{ + int ret = 0; + struct cpufreq_freqs *frq = data; + if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) || + (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) || + (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) + set_spu_profiling_frequency(frq->new, spu_cycle_reset); + return ret; +} + +static struct notifier_block cpu_freq_notifier_block = { + .notifier_call = oprof_cpufreq_notify +}; +#endif + +static int cell_global_start_spu(struct op_counter_config *ctr) +{ + int subfunc; + unsigned int lfsr_value; + int cpu; + int ret; + int rtas_error; + unsigned int cpu_khzfreq = 0; + + /* The SPU profiling uses time-based profiling based on + * cpu frequency, so if configured with the CPU_FREQ + * option, we should detect frequency changes and react + * accordingly. + */ +#ifdef CONFIG_CPU_FREQ + ret = cpufreq_register_notifier(&cpu_freq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (ret < 0) + /* this is not a fatal error */ + printk(KERN_ERR "CPU freq change registration failed: %d\n", + ret); + + else + cpu_khzfreq = cpufreq_quick_get(smp_processor_id()); +#endif + + set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset); + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + /* + * Setup SPU cycle-based profiling. + * Set perf_mon_control bit 0 to a zero before + * enabling spu collection hardware. + */ + cbe_write_pm(cpu, pm_control, 0); + + if (spu_cycle_reset > MAX_SPU_COUNT) + /* use largest possible value */ + lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1); + else + lfsr_value = calculate_lfsr(spu_cycle_reset); + + /* must use a non zero value. Zero disables data collection. */ + if (lfsr_value == 0) + lfsr_value = calculate_lfsr(1); + + lfsr_value = lfsr_value << 8; /* shift lfsr to correct + * register location + */ + + /* debug bus setup */ + ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu)); + + if (unlikely(ret)) { + rtas_error = ret; + goto out; + } + + + subfunc = 2; /* 2 - activate SPU tracing, 3 - deactivate */ + + /* start profiling */ + ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc, + cbe_cpu_to_node(cpu), lfsr_value); + + if (unlikely(ret != 0)) { + printk(KERN_ERR + "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n", + __FUNCTION__, ret); + rtas_error = -EIO; + goto out; + } + } + + rtas_error = start_spu_profiling(spu_cycle_reset); + if (rtas_error) + goto out_stop; + + oprofile_running = 1; + return 0; + +out_stop: + cell_global_stop_spu(); /* clean up the PMU/debug bus */ out: - ; + return rtas_error; } -static void cell_global_start(struct op_counter_config *ctr) +static int cell_global_start_ppu(struct op_counter_config *ctr) { - u32 cpu; + u32 cpu, i; u32 interrupt_mask = 0; - u32 i; /* This routine gets called once for the system. * There is one performance monitor per node, so we @@ -651,19 +1002,79 @@ static void cell_global_start(struct op_counter_config *ctr) oprofile_running = 1; smp_wmb(); - /* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being - * executed which manipulates the PMU. We start the "virtual counter" + /* + * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being + * executed which manipulates the PMU. We start the "virtual counter" * here so that we do not need to synchronize access to the PMU in * the above for-loop. */ start_virt_cntrs(); + + return 0; } -static void cell_global_stop(void) +static int cell_global_start(struct op_counter_config *ctr) +{ + if (spu_cycle_reset) + return cell_global_start_spu(ctr); + else + return cell_global_start_ppu(ctr); +} + +/* + * Note the generic OProfile stop calls do not support returning + * an error on stop. Hence, will not return an error if the FW + * calls fail on stop. Failure to reset the debug bus is not an issue. + * Failure to disable the SPU profiling is not an issue. The FW calls + * to enable the performance counters and debug bus will work even if + * the hardware was not cleanly reset. + */ +static void cell_global_stop_spu(void) +{ + int subfunc, rtn_value; + unsigned int lfsr_value; + int cpu; + + oprofile_running = 0; + +#ifdef CONFIG_CPU_FREQ + cpufreq_unregister_notifier(&cpu_freq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); +#endif + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + subfunc = 3; /* + * 2 - activate SPU tracing, + * 3 - deactivate + */ + lfsr_value = 0x8f100000; + + rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, + subfunc, cbe_cpu_to_node(cpu), + lfsr_value); + + if (unlikely(rtn_value != 0)) { + printk(KERN_ERR + "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n", + __FUNCTION__, rtn_value); + } + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + } + + stop_spu_profiling(); +} + +static void cell_global_stop_ppu(void) { int cpu; - /* This routine will be called once for the system. + /* + * This routine will be called once for the system. * There is one performance monitor per node, so we * only need to perform this function once per node. */ @@ -687,8 +1098,16 @@ static void cell_global_stop(void) } } -static void -cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) +static void cell_global_stop(void) +{ + if (spu_cycle_reset) + cell_global_stop_spu(); + else + cell_global_stop_ppu(); +} + +static void cell_handle_interrupt(struct pt_regs *regs, + struct op_counter_config *ctr) { u32 cpu; u64 pc; @@ -699,13 +1118,15 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) cpu = smp_processor_id(); - /* Need to make sure the interrupt handler and the virt counter + /* + * Need to make sure the interrupt handler and the virt counter * routine are not running at the same time. See the * cell_virtual_cntr() routine for additional comments. */ spin_lock_irqsave(&virt_cntr_lock, flags); - /* Need to disable and reenable the performance counters + /* + * Need to disable and reenable the performance counters * to get the desired behavior from the hardware. This * is hardware specific. */ @@ -714,7 +1135,8 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu); - /* If the interrupt mask has been cleared, then the virt cntr + /* + * If the interrupt mask has been cleared, then the virt cntr * has cleared the interrupt. When the thread that generated * the interrupt is restored, the data count will be restored to * 0xffffff0 to cause the interrupt to be regenerated. @@ -732,18 +1154,20 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) } } - /* The counters were frozen by the interrupt. + /* + * The counters were frozen by the interrupt. * Reenable the interrupt and restart the counters. * If there was a race between the interrupt handler and - * the virtual counter routine. The virutal counter + * the virtual counter routine. The virutal counter * routine may have cleared the interrupts. Hence must * use the virt_cntr_inter_mask to re-enable the interrupts. */ cbe_enable_pm_interrupts(cpu, hdw_thread, virt_cntr_inter_mask); - /* The writes to the various performance counters only writes - * to a latch. The new values (interrupt setting bits, reset + /* + * The writes to the various performance counters only writes + * to a latch. The new values (interrupt setting bits, reset * counter value etc.) are not copied to the actual registers * until the performance monitor is enabled. In order to get * this to work as desired, the permormance monitor needs to @@ -755,10 +1179,33 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) spin_unlock_irqrestore(&virt_cntr_lock, flags); } +/* + * This function is called from the generic OProfile + * driver. When profiling PPUs, we need to do the + * generic sync start; otherwise, do spu_sync_start. + */ +static int cell_sync_start(void) +{ + if (spu_cycle_reset) + return spu_sync_start(); + else + return DO_GENERIC_SYNC; +} + +static int cell_sync_stop(void) +{ + if (spu_cycle_reset) + return spu_sync_stop(); + else + return 1; +} + struct op_powerpc_model op_model_cell = { .reg_setup = cell_reg_setup, .cpu_setup = cell_cpu_setup, .global_start = cell_global_start, .global_stop = cell_global_stop, + .sync_start = cell_sync_start, + .sync_stop = cell_sync_stop, .handle_interrupt = cell_handle_interrupt, }; diff --git a/arch/powerpc/oprofile/op_model_fsl_booke.c b/arch/powerpc/oprofile/op_model_fsl_booke.c index 2267eb8..183a28b 100644 --- a/arch/powerpc/oprofile/op_model_fsl_booke.c +++ b/arch/powerpc/oprofile/op_model_fsl_booke.c @@ -244,7 +244,7 @@ static void dump_pmcs(void) mfpmr(PMRN_PMLCA3), mfpmr(PMRN_PMLCB3)); } -static void fsl_booke_cpu_setup(struct op_counter_config *ctr) +static int fsl_booke_cpu_setup(struct op_counter_config *ctr) { int i; @@ -258,9 +258,11 @@ static void fsl_booke_cpu_setup(struct op_counter_config *ctr) set_pmc_user_kernel(i, ctr[i].user, ctr[i].kernel); } + + return 0; } -static void fsl_booke_reg_setup(struct op_counter_config *ctr, +static int fsl_booke_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { @@ -276,9 +278,10 @@ static void fsl_booke_reg_setup(struct op_counter_config *ctr, for (i = 0; i < num_counters; ++i) reset_value[i] = 0x80000000UL - ctr[i].count; + return 0; } -static void fsl_booke_start(struct op_counter_config *ctr) +static int fsl_booke_start(struct op_counter_config *ctr) { int i; @@ -308,6 +311,8 @@ static void fsl_booke_start(struct op_counter_config *ctr) pr_debug("start on cpu %d, pmgc0 %x\n", smp_processor_id(), mfpmr(PMRN_PMGC0)); + + return 0; } static void fsl_booke_stop(void) diff --git a/arch/powerpc/oprofile/op_model_pa6t.c b/arch/powerpc/oprofile/op_model_pa6t.c index e8a56b0..c40de46 100644 --- a/arch/powerpc/oprofile/op_model_pa6t.c +++ b/arch/powerpc/oprofile/op_model_pa6t.c @@ -89,7 +89,7 @@ static inline void ctr_write(unsigned int i, u64 val) /* precompute the values to stuff in the hardware registers */ -static void pa6t_reg_setup(struct op_counter_config *ctr, +static int pa6t_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { @@ -135,10 +135,12 @@ static void pa6t_reg_setup(struct op_counter_config *ctr, pr_debug("reset_value for pmc%u inited to 0x%lx\n", pmc, reset_value[pmc]); } + + return 0; } /* configure registers on this cpu */ -static void pa6t_cpu_setup(struct op_counter_config *ctr) +static int pa6t_cpu_setup(struct op_counter_config *ctr) { u64 mmcr0 = mmcr0_val; u64 mmcr1 = mmcr1_val; @@ -154,9 +156,11 @@ static void pa6t_cpu_setup(struct op_counter_config *ctr) mfspr(SPRN_PA6T_MMCR0)); pr_debug("setup on cpu %d, mmcr1 %016lx\n", smp_processor_id(), mfspr(SPRN_PA6T_MMCR1)); + + return 0; } -static void pa6t_start(struct op_counter_config *ctr) +static int pa6t_start(struct op_counter_config *ctr) { int i; @@ -174,6 +178,8 @@ static void pa6t_start(struct op_counter_config *ctr) oprofile_running = 1; pr_debug("start on cpu %d, mmcr0 %lx\n", smp_processor_id(), mmcr0); + + return 0; } static void pa6t_stop(void) diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c index a7c206b..cddc250 100644 --- a/arch/powerpc/oprofile/op_model_power4.c +++ b/arch/powerpc/oprofile/op_model_power4.c @@ -32,7 +32,7 @@ static u32 mmcr0_val; static u64 mmcr1_val; static u64 mmcra_val; -static void power4_reg_setup(struct op_counter_config *ctr, +static int power4_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { @@ -60,6 +60,8 @@ static void power4_reg_setup(struct op_counter_config *ctr, mmcr0_val &= ~MMCR0_PROBLEM_DISABLE; else mmcr0_val |= MMCR0_PROBLEM_DISABLE; + + return 0; } extern void ppc64_enable_pmcs(void); @@ -84,7 +86,7 @@ static inline int mmcra_must_set_sample(void) return 0; } -static void power4_cpu_setup(struct op_counter_config *ctr) +static int power4_cpu_setup(struct op_counter_config *ctr) { unsigned int mmcr0 = mmcr0_val; unsigned long mmcra = mmcra_val; @@ -111,9 +113,11 @@ static void power4_cpu_setup(struct op_counter_config *ctr) mfspr(SPRN_MMCR1)); dbg("setup on cpu %d, mmcra %lx\n", smp_processor_id(), mfspr(SPRN_MMCRA)); + + return 0; } -static void power4_start(struct op_counter_config *ctr) +static int power4_start(struct op_counter_config *ctr) { int i; unsigned int mmcr0; @@ -148,6 +152,7 @@ static void power4_start(struct op_counter_config *ctr) oprofile_running = 1; dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0); + return 0; } static void power4_stop(void) diff --git a/arch/powerpc/oprofile/op_model_rs64.c b/arch/powerpc/oprofile/op_model_rs64.c index c731acb..a20afe4 100644 --- a/arch/powerpc/oprofile/op_model_rs64.c +++ b/arch/powerpc/oprofile/op_model_rs64.c @@ -88,7 +88,7 @@ static unsigned long reset_value[OP_MAX_COUNTER]; static int num_counters; -static void rs64_reg_setup(struct op_counter_config *ctr, +static int rs64_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { @@ -100,9 +100,10 @@ static void rs64_reg_setup(struct op_counter_config *ctr, reset_value[i] = 0x80000000UL - ctr[i].count; /* XXX setup user and kernel profiling */ + return 0; } -static void rs64_cpu_setup(struct op_counter_config *ctr) +static int rs64_cpu_setup(struct op_counter_config *ctr) { unsigned int mmcr0; @@ -125,9 +126,11 @@ static void rs64_cpu_setup(struct op_counter_config *ctr) mfspr(SPRN_MMCR0)); dbg("setup on cpu %d, mmcr1 %lx\n", smp_processor_id(), mfspr(SPRN_MMCR1)); + + return 0; } -static void rs64_start(struct op_counter_config *ctr) +static int rs64_start(struct op_counter_config *ctr) { int i; unsigned int mmcr0; @@ -155,6 +158,7 @@ static void rs64_start(struct op_counter_config *ctr) mtspr(SPRN_MMCR0, mmcr0); dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0); + return 0; } static void rs64_stop(void) diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 33545d3..932538a 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -272,4 +272,14 @@ config CPM2 you wish to build a kernel for a machine with a CPM2 coprocessor on it (826x, 827x, 8560). +config AXON_RAM + tristate "Axon DDR2 memory device driver" + depends on PPC_IBM_CELL_BLADE + default m + help + It registers one block device per Axon's DDR2 memory bank found + on a system. Block devices are called axonram?, their major and + minor numbers are available in /proc/devices, /proc/partitions or + in /sys/block/axonram?/dev. + endmenu diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index 9b2b386..ac80320 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -73,4 +73,14 @@ config CBE_CPUFREQ For details, take a look at <file:Documentation/cpu-freq/>. If you don't have such processor, say N +config CBE_CPUFREQ_PMI + tristate "CBE frequency scaling using PMI interface" + depends on CBE_CPUFREQ && PPC_PMI && EXPERIMENTAL + default n + help + Select this, if you want to use the PMI interface + to switch frequencies. Using PMI, the + processor will not only be able to run at lower speed, + but also at lower core voltage. + endmenu diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile index 869af89..f88a7c7 100644 --- a/arch/powerpc/platforms/cell/Makefile +++ b/arch/powerpc/platforms/cell/Makefile @@ -4,7 +4,9 @@ obj-$(CONFIG_PPC_CELL_NATIVE) += interrupt.o iommu.o setup.o \ obj-$(CONFIG_CBE_RAS) += ras.o obj-$(CONFIG_CBE_THERM) += cbe_thermal.o -obj-$(CONFIG_CBE_CPUFREQ) += cbe_cpufreq.o +obj-$(CONFIG_CBE_CPUFREQ_PMI) += cbe_cpufreq_pmi.o +obj-$(CONFIG_CBE_CPUFREQ) += cbe-cpufreq.o +cbe-cpufreq-y += cbe_cpufreq_pervasive.o cbe_cpufreq.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_PPC_CELL_NATIVE) += smp.o @@ -23,3 +25,5 @@ obj-$(CONFIG_SPU_BASE) += spu_callbacks.o spu_base.o \ $(spu-priv1-y) \ $(spu-manage-y) \ spufs/ + +obj-$(CONFIG_PCI_MSI) += axon_msi.o diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c new file mode 100644 index 0000000..4c9ab5b --- /dev/null +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -0,0 +1,445 @@ +/* + * Copyright 2007, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/msi.h> +#include <linux/reboot.h> + +#include <asm/dcr.h> +#include <asm/machdep.h> +#include <asm/prom.h> + + +/* + * MSIC registers, specified as offsets from dcr_base + */ +#define MSIC_CTRL_REG 0x0 + +/* Base Address registers specify FIFO location in BE memory */ +#define MSIC_BASE_ADDR_HI_REG 0x3 +#define MSIC_BASE_ADDR_LO_REG 0x4 + +/* Hold the read/write offsets into the FIFO */ +#define MSIC_READ_OFFSET_REG 0x5 +#define MSIC_WRITE_OFFSET_REG 0x6 + + +/* MSIC control register flags */ +#define MSIC_CTRL_ENABLE 0x0001 +#define MSIC_CTRL_FIFO_FULL_ENABLE 0x0002 +#define MSIC_CTRL_IRQ_ENABLE 0x0008 +#define MSIC_CTRL_FULL_STOP_ENABLE 0x0010 + +/* + * The MSIC can be configured to use a FIFO of 32KB, 64KB, 128KB or 256KB. + * Currently we're using a 64KB FIFO size. + */ +#define MSIC_FIFO_SIZE_SHIFT 16 +#define MSIC_FIFO_SIZE_BYTES (1 << MSIC_FIFO_SIZE_SHIFT) + +/* + * To configure the FIFO size as (1 << n) bytes, we write (n - 15) into bits + * 8-9 of the MSIC control reg. + */ +#define MSIC_CTRL_FIFO_SIZE (((MSIC_FIFO_SIZE_SHIFT - 15) << 8) & 0x300) + +/* + * We need to mask the read/write offsets to make sure they stay within + * the bounds of the FIFO. Also they should always be 16-byte aligned. + */ +#define MSIC_FIFO_SIZE_MASK ((MSIC_FIFO_SIZE_BYTES - 1) & ~0xFu) + +/* Each entry in the FIFO is 16 bytes, the first 4 bytes hold the irq # */ +#define MSIC_FIFO_ENTRY_SIZE 0x10 + + +struct axon_msic { + struct device_node *dn; + struct irq_host *irq_host; + __le32 *fifo; + dcr_host_t dcr_host; + struct list_head list; + u32 read_offset; + u32 dcr_base; +}; + +static LIST_HEAD(axon_msic_list); + +static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val) +{ + pr_debug("axon_msi: dcr_write(0x%x, 0x%x)\n", val, dcr_n); + + dcr_write(msic->dcr_host, msic->dcr_base + dcr_n, val); +} + +static u32 msic_dcr_read(struct axon_msic *msic, unsigned int dcr_n) +{ + return dcr_read(msic->dcr_host, msic->dcr_base + dcr_n); +} + +static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc) +{ + struct axon_msic *msic = get_irq_data(irq); + u32 write_offset, msi; + int idx; + + write_offset = msic_dcr_read(msic, MSIC_WRITE_OFFSET_REG); + pr_debug("axon_msi: original write_offset 0x%x\n", write_offset); + + /* write_offset doesn't wrap properly, so we have to mask it */ + write_offset &= MSIC_FIFO_SIZE_MASK; + + while (msic->read_offset != write_offset) { + idx = msic->read_offset / sizeof(__le32); + msi = le32_to_cpu(msic->fifo[idx]); + msi &= 0xFFFF; + + pr_debug("axon_msi: woff %x roff %x msi %x\n", + write_offset, msic->read_offset, msi); + + msic->read_offset += MSIC_FIFO_ENTRY_SIZE; + msic->read_offset &= MSIC_FIFO_SIZE_MASK; + + if (msi < NR_IRQS && irq_map[msi].host == msic->irq_host) + generic_handle_irq(msi); + else + pr_debug("axon_msi: invalid irq 0x%x!\n", msi); + } + + desc->chip->eoi(irq); +} + +static struct axon_msic *find_msi_translator(struct pci_dev *dev) +{ + struct irq_host *irq_host; + struct device_node *dn, *tmp; + const phandle *ph; + struct axon_msic *msic = NULL; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + dev_dbg(&dev->dev, "axon_msi: no pci_dn found\n"); + return NULL; + } + + for (; dn; tmp = of_get_parent(dn), of_node_put(dn), dn = tmp) { + ph = of_get_property(dn, "msi-translator", NULL); + if (ph) + break; + } + + if (!ph) { + dev_dbg(&dev->dev, + "axon_msi: no msi-translator property found\n"); + goto out_error; + } + + tmp = dn; + dn = of_find_node_by_phandle(*ph); + if (!dn) { + dev_dbg(&dev->dev, + "axon_msi: msi-translator doesn't point to a node\n"); + goto out_error; + } + + irq_host = irq_find_host(dn); + if (!irq_host) { + dev_dbg(&dev->dev, "axon_msi: no irq_host found for node %s\n", + dn->full_name); + goto out_error; + } + + msic = irq_host->host_data; + +out_error: + of_node_put(dn); + of_node_put(tmp); + + return msic; +} + +static int axon_msi_check_device(struct pci_dev *dev, int nvec, int type) +{ + if (!find_msi_translator(dev)) + return -ENODEV; + + return 0; +} + +static int setup_msi_msg_address(struct pci_dev *dev, struct msi_msg *msg) +{ + struct device_node *dn, *tmp; + struct msi_desc *entry; + int len; + const u32 *prop; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + dev_dbg(&dev->dev, "axon_msi: no pci_dn found\n"); + return -ENODEV; + } + + entry = list_first_entry(&dev->msi_list, struct msi_desc, list); + + for (; dn; tmp = of_get_parent(dn), of_node_put(dn), dn = tmp) { + if (entry->msi_attrib.is_64) { + prop = of_get_property(dn, "msi-address-64", &len); + if (prop) + break; + } + + prop = of_get_property(dn, "msi-address-32", &len); + if (prop) + break; + } + + if (!prop) { + dev_dbg(&dev->dev, + "axon_msi: no msi-address-(32|64) properties found\n"); + return -ENOENT; + } + + switch (len) { + case 8: + msg->address_hi = prop[0]; + msg->address_lo = prop[1]; + break; + case 4: + msg->address_hi = 0; + msg->address_lo = prop[0]; + break; + default: + dev_dbg(&dev->dev, + "axon_msi: malformed msi-address-(32|64) property\n"); + of_node_put(dn); + return -EINVAL; + } + + of_node_put(dn); + + return 0; +} + +static int axon_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int virq, rc; + struct msi_desc *entry; + struct msi_msg msg; + struct axon_msic *msic; + + msic = find_msi_translator(dev); + if (!msic) + return -ENODEV; + + rc = setup_msi_msg_address(dev, &msg); + if (rc) + return rc; + + /* We rely on being able to stash a virq in a u16 */ + BUILD_BUG_ON(NR_IRQS > 65536); + + list_for_each_entry(entry, &dev->msi_list, list) { + virq = irq_create_direct_mapping(msic->irq_host); + if (virq == NO_IRQ) { + dev_warn(&dev->dev, + "axon_msi: virq allocation failed!\n"); + return -1; + } + dev_dbg(&dev->dev, "axon_msi: allocated virq 0x%x\n", virq); + + set_irq_msi(virq, entry); + msg.data = virq; + write_msi_msg(virq, &msg); + } + + return 0; +} + +static void axon_msi_teardown_msi_irqs(struct pci_dev *dev) +{ + struct msi_desc *entry; + + dev_dbg(&dev->dev, "axon_msi: tearing down msi irqs\n"); + + list_for_each_entry(entry, &dev->msi_list, list) { + if (entry->irq == NO_IRQ) + continue; + + set_irq_msi(entry->irq, NULL); + irq_dispose_mapping(entry->irq); + } +} + +static struct irq_chip msic_irq_chip = { + .mask = mask_msi_irq, + .unmask = unmask_msi_irq, + .shutdown = unmask_msi_irq, + .typename = "AXON-MSI", +}; + +static int msic_host_map(struct irq_host *h, unsigned int virq, + irq_hw_number_t hw) +{ + set_irq_chip_and_handler(virq, &msic_irq_chip, handle_simple_irq); + + return 0; +} + +static int msic_host_match(struct irq_host *host, struct device_node *dn) +{ + struct axon_msic *msic = host->host_data; + + return msic->dn == dn; +} + +static struct irq_host_ops msic_host_ops = { + .match = msic_host_match, + .map = msic_host_map, +}; + +static int axon_msi_notify_reboot(struct notifier_block *nb, + unsigned long code, void *data) +{ + struct axon_msic *msic; + u32 tmp; + + list_for_each_entry(msic, &axon_msic_list, list) { + pr_debug("axon_msi: disabling %s\n", msic->dn->full_name); + tmp = msic_dcr_read(msic, MSIC_CTRL_REG); + tmp &= ~MSIC_CTRL_ENABLE & ~MSIC_CTRL_IRQ_ENABLE; + msic_dcr_write(msic, MSIC_CTRL_REG, tmp); + } + + return 0; +} + +static struct notifier_block axon_msi_reboot_notifier = { + .notifier_call = axon_msi_notify_reboot +}; + +static int axon_msi_setup_one(struct device_node *dn) +{ + struct page *page; + struct axon_msic *msic; + unsigned int virq; + int dcr_len; + + pr_debug("axon_msi: setting up dn %s\n", dn->full_name); + + msic = kzalloc(sizeof(struct axon_msic), GFP_KERNEL); + if (!msic) { + printk(KERN_ERR "axon_msi: couldn't allocate msic for %s\n", + dn->full_name); + goto out; + } + + msic->dcr_base = dcr_resource_start(dn, 0); + dcr_len = dcr_resource_len(dn, 0); + + if (msic->dcr_base == 0 || dcr_len == 0) { + printk(KERN_ERR + "axon_msi: couldn't parse dcr properties on %s\n", + dn->full_name); + goto out; + } + + msic->dcr_host = dcr_map(dn, msic->dcr_base, dcr_len); + if (!DCR_MAP_OK(msic->dcr_host)) { + printk(KERN_ERR "axon_msi: dcr_map failed for %s\n", + dn->full_name); + goto out_free_msic; + } + + page = alloc_pages_node(of_node_to_nid(dn), GFP_KERNEL, + get_order(MSIC_FIFO_SIZE_BYTES)); + if (!page) { + printk(KERN_ERR "axon_msi: couldn't allocate fifo for %s\n", + dn->full_name); + goto out_free_msic; + } + + msic->fifo = page_address(page); + + msic->irq_host = irq_alloc_host(IRQ_HOST_MAP_NOMAP, NR_IRQS, + &msic_host_ops, 0); + if (!msic->irq_host) { + printk(KERN_ERR "axon_msi: couldn't allocate irq_host for %s\n", + dn->full_name); + goto out_free_fifo; + } + + msic->irq_host->host_data = msic; + + virq = irq_of_parse_and_map(dn, 0); + if (virq == NO_IRQ) { + printk(KERN_ERR "axon_msi: irq parse and map failed for %s\n", + dn->full_name); + goto out_free_host; + } + + msic->dn = of_node_get(dn); + + set_irq_data(virq, msic); + set_irq_chained_handler(virq, axon_msi_cascade); + pr_debug("axon_msi: irq 0x%x setup for axon_msi\n", virq); + + /* Enable the MSIC hardware */ + msic_dcr_write(msic, MSIC_BASE_ADDR_HI_REG, (u64)msic->fifo >> 32); + msic_dcr_write(msic, MSIC_BASE_ADDR_LO_REG, + (u64)msic->fifo & 0xFFFFFFFF); + msic_dcr_write(msic, MSIC_CTRL_REG, + MSIC_CTRL_IRQ_ENABLE | MSIC_CTRL_ENABLE | + MSIC_CTRL_FIFO_SIZE); + + list_add(&msic->list, &axon_msic_list); + + printk(KERN_DEBUG "axon_msi: setup MSIC on %s\n", dn->full_name); + + return 0; + +out_free_host: + kfree(msic->irq_host); +out_free_fifo: + __free_pages(virt_to_page(msic->fifo), get_order(MSIC_FIFO_SIZE_BYTES)); +out_free_msic: + kfree(msic); +out: + + return -1; +} + +static int axon_msi_init(void) +{ + struct device_node *dn; + int found = 0; + + pr_debug("axon_msi: initialising ...\n"); + + for_each_compatible_node(dn, NULL, "ibm,axon-msic") { + if (axon_msi_setup_one(dn) == 0) + found++; + } + + if (found) { + ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs; + ppc_md.teardown_msi_irqs = axon_msi_teardown_msi_irqs; + ppc_md.msi_check_device = axon_msi_check_device; + + register_reboot_notifier(&axon_msi_reboot_notifier); + + pr_debug("axon_msi: registered callbacks!\n"); + } + + return 0; +} +arch_initcall(axon_msi_init); diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c index ab511d5..0b6e8ee 100644 --- a/arch/powerpc/platforms/cell/cbe_cpufreq.c +++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c @@ -1,7 +1,7 @@ /* * cpufreq driver for the cell processor * - * (C) Copyright IBM Deutschland Entwicklung GmbH 2005 + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007 * * Author: Christian Krafft <krafft@de.ibm.com> * @@ -21,18 +21,11 @@ */ #include <linux/cpufreq.h> -#include <linux/timer.h> - -#include <asm/hw_irq.h> -#include <asm/io.h> #include <asm/machdep.h> -#include <asm/processor.h> -#include <asm/prom.h> -#include <asm/time.h> -#include <asm/pmi.h> #include <asm/of_platform.h> - +#include <asm/prom.h> #include "cbe_regs.h" +#include "cbe_cpufreq.h" static DEFINE_MUTEX(cbe_switch_mutex); @@ -50,159 +43,24 @@ static struct cpufreq_frequency_table cbe_freqs[] = { {0, CPUFREQ_TABLE_END}, }; -/* to write to MIC register */ -static u64 MIC_Slow_Fast_Timer_table[] = { - [0 ... 7] = 0x007fc00000000000ull, -}; - -/* more values for the MIC */ -static u64 MIC_Slow_Next_Timer_table[] = { - 0x0000240000000000ull, - 0x0000268000000000ull, - 0x000029C000000000ull, - 0x00002D0000000000ull, - 0x0000300000000000ull, - 0x0000334000000000ull, - 0x000039C000000000ull, - 0x00003FC000000000ull, -}; - -static unsigned int pmi_frequency_limit = 0; /* * hardware specific functions */ -static struct of_device *pmi_dev; - -#ifdef CONFIG_PPC_PMI -static int set_pmode_pmi(int cpu, unsigned int pmode) -{ - int ret; - pmi_message_t pmi_msg; -#ifdef DEBUG - u64 time; -#endif - - pmi_msg.type = PMI_TYPE_FREQ_CHANGE; - pmi_msg.data1 = cbe_cpu_to_node(cpu); - pmi_msg.data2 = pmode; - -#ifdef DEBUG - time = (u64) get_cycles(); -#endif - - pmi_send_message(pmi_dev, pmi_msg); - ret = pmi_msg.data2; - - pr_debug("PMI returned slow mode %d\n", ret); - -#ifdef DEBUG - time = (u64) get_cycles() - time; /* actual cycles (not cpu cycles!) */ - time = 1000000000 * time / CLOCK_TICK_RATE; /* time in ns (10^-9) */ - pr_debug("had to wait %lu ns for a transition\n", time); -#endif - return ret; -} -#endif - -static int get_pmode(int cpu) +static int set_pmode(unsigned int cpu, unsigned int slow_mode) { - int ret; - struct cbe_pmd_regs __iomem *pmd_regs; - - pmd_regs = cbe_get_cpu_pmd_regs(cpu); - ret = in_be64(&pmd_regs->pmsr) & 0x07; - - return ret; -} - -static int set_pmode_reg(int cpu, unsigned int pmode) -{ - struct cbe_pmd_regs __iomem *pmd_regs; - struct cbe_mic_tm_regs __iomem *mic_tm_regs; - u64 flags; - u64 value; - - local_irq_save(flags); - - mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu); - pmd_regs = cbe_get_cpu_pmd_regs(cpu); - - pr_debug("pm register is mapped at %p\n", &pmd_regs->pmcr); - pr_debug("mic register is mapped at %p\n", &mic_tm_regs->slow_fast_timer_0); - - out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]); - out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]); - - out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]); - out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]); - - value = in_be64(&pmd_regs->pmcr); - /* set bits to zero */ - value &= 0xFFFFFFFFFFFFFFF8ull; - /* set bits to next pmode */ - value |= pmode; - - out_be64(&pmd_regs->pmcr, value); - - /* wait until new pmode appears in status register */ - value = in_be64(&pmd_regs->pmsr) & 0x07; - while(value != pmode) { - cpu_relax(); - value = in_be64(&pmd_regs->pmsr) & 0x07; - } - - local_irq_restore(flags); - - return 0; -} + int rc; -static int set_pmode(int cpu, unsigned int slow_mode) { -#ifdef CONFIG_PPC_PMI - if (pmi_dev) - return set_pmode_pmi(cpu, slow_mode); + if (cbe_cpufreq_has_pmi) + rc = cbe_cpufreq_set_pmode_pmi(cpu, slow_mode); else -#endif - return set_pmode_reg(cpu, slow_mode); -} - -static void cbe_cpufreq_handle_pmi(struct of_device *dev, pmi_message_t pmi_msg) -{ - u8 cpu; - u8 cbe_pmode_new; - - BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE); + rc = cbe_cpufreq_set_pmode(cpu, slow_mode); - cpu = cbe_node_to_cpu(pmi_msg.data1); - cbe_pmode_new = pmi_msg.data2; + pr_debug("register contains slow mode %d\n", cbe_cpufreq_get_pmode(cpu)); - pmi_frequency_limit = cbe_freqs[cbe_pmode_new].frequency; - - pr_debug("cbe_handle_pmi: max freq=%d\n", pmi_frequency_limit); -} - -static int pmi_notifier(struct notifier_block *nb, - unsigned long event, void *data) -{ - struct cpufreq_policy *policy = data; - - if (event != CPUFREQ_INCOMPATIBLE) - return 0; - - cpufreq_verify_within_limits(policy, 0, pmi_frequency_limit); - return 0; + return rc; } -static struct notifier_block pmi_notifier_block = { - .notifier_call = pmi_notifier, -}; - -static struct pmi_handler cbe_pmi_handler = { - .type = PMI_TYPE_FREQ_CHANGE, - .handle_pmi_message = cbe_cpufreq_handle_pmi, -}; - - /* * cpufreq functions */ @@ -221,8 +79,19 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy) pr_debug("init cpufreq on CPU %d\n", policy->cpu); + /* + * Let's check we can actually get to the CELL regs + */ + if (!cbe_get_cpu_pmd_regs(policy->cpu) || + !cbe_get_cpu_mic_tm_regs(policy->cpu)) { + pr_info("invalid CBE regs pointers for cpufreq\n"); + return -EINVAL; + } + max_freqp = of_get_property(cpu, "clock-frequency", NULL); + of_node_put(cpu); + if (!max_freqp) return -EINVAL; @@ -239,10 +108,12 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy) } policy->governor = CPUFREQ_DEFAULT_GOVERNOR; - /* if DEBUG is enabled set_pmode() measures the correct latency of a transition */ + + /* if DEBUG is enabled set_pmode() measures the latency + * of a transition */ policy->cpuinfo.transition_latency = 25000; - cur_pmode = get_pmode(policy->cpu); + cur_pmode = cbe_cpufreq_get_pmode(policy->cpu); pr_debug("current pmode is at %d\n",cur_pmode); policy->cur = cbe_freqs[cur_pmode].frequency; @@ -253,21 +124,13 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy) cpufreq_frequency_table_get_attr(cbe_freqs, policy->cpu); - if (pmi_dev) { - /* frequency might get limited later, initialize limit with max_freq */ - pmi_frequency_limit = max_freq; - cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER); - } - - /* this ensures that policy->cpuinfo_min and policy->cpuinfo_max are set correctly */ + /* this ensures that policy->cpuinfo_min + * and policy->cpuinfo_max are set correctly */ return cpufreq_frequency_table_cpuinfo(policy, cbe_freqs); } static int cbe_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - if (pmi_dev) - cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER); - cpufreq_frequency_table_put_attr(policy->cpu); return 0; } @@ -277,13 +140,13 @@ static int cbe_cpufreq_verify(struct cpufreq_policy *policy) return cpufreq_frequency_table_verify(policy, cbe_freqs); } - -static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, - unsigned int relation) +static int cbe_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) { int rc; struct cpufreq_freqs freqs; - int cbe_pmode_new; + unsigned int cbe_pmode_new; cpufreq_frequency_table_target(policy, cbe_freqs, @@ -298,12 +161,14 @@ static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target mutex_lock(&cbe_switch_mutex); cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - pr_debug("setting frequency for cpu %d to %d kHz, 1/%d of max frequency\n", + pr_debug("setting frequency for cpu %d to %d kHz, " \ + "1/%d of max frequency\n", policy->cpu, cbe_freqs[cbe_pmode_new].frequency, cbe_freqs[cbe_pmode_new].index); rc = set_pmode(policy->cpu, cbe_pmode_new); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); mutex_unlock(&cbe_switch_mutex); @@ -326,28 +191,14 @@ static struct cpufreq_driver cbe_cpufreq_driver = { static int __init cbe_cpufreq_init(void) { -#ifdef CONFIG_PPC_PMI - struct device_node *np; -#endif if (!machine_is(cell)) return -ENODEV; -#ifdef CONFIG_PPC_PMI - np = of_find_node_by_type(NULL, "ibm,pmi"); - - pmi_dev = of_find_device_by_node(np); - if (pmi_dev) - pmi_register_handler(pmi_dev, &cbe_pmi_handler); -#endif return cpufreq_register_driver(&cbe_cpufreq_driver); } static void __exit cbe_cpufreq_exit(void) { -#ifdef CONFIG_PPC_PMI - if (pmi_dev) - pmi_unregister_handler(pmi_dev, &cbe_pmi_handler); -#endif cpufreq_unregister_driver(&cbe_cpufreq_driver); } diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.h b/arch/powerpc/platforms/cell/cbe_cpufreq.h new file mode 100644 index 0000000..c1d86bf --- /dev/null +++ b/arch/powerpc/platforms/cell/cbe_cpufreq.h @@ -0,0 +1,24 @@ +/* + * cbe_cpufreq.h + * + * This file contains the definitions used by the cbe_cpufreq driver. + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007 + * + * Author: Christian Krafft <krafft@de.ibm.com> + * + */ + +#include <linux/cpufreq.h> +#include <linux/types.h> + +int cbe_cpufreq_set_pmode(int cpu, unsigned int pmode); +int cbe_cpufreq_get_pmode(int cpu); + +int cbe_cpufreq_set_pmode_pmi(int cpu, unsigned int pmode); + +#if defined(CONFIG_CBE_CPUFREQ_PMI) || defined(CONFIG_CBE_CPUFREQ_PMI_MODULE) +extern bool cbe_cpufreq_has_pmi; +#else +#define cbe_cpufreq_has_pmi (0) +#endif diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c new file mode 100644 index 0000000..163263b --- /dev/null +++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c @@ -0,0 +1,115 @@ +/* + * pervasive backend for the cbe_cpufreq driver + * + * This driver makes use of the pervasive unit to + * engage the desired frequency. + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007 + * + * Author: Christian Krafft <krafft@de.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/time.h> +#include <asm/machdep.h> +#include <asm/hw_irq.h> + +#include "cbe_regs.h" +#include "cbe_cpufreq.h" + +/* to write to MIC register */ +static u64 MIC_Slow_Fast_Timer_table[] = { + [0 ... 7] = 0x007fc00000000000ull, +}; + +/* more values for the MIC */ +static u64 MIC_Slow_Next_Timer_table[] = { + 0x0000240000000000ull, + 0x0000268000000000ull, + 0x000029C000000000ull, + 0x00002D0000000000ull, + 0x0000300000000000ull, + 0x0000334000000000ull, + 0x000039C000000000ull, + 0x00003FC000000000ull, +}; + + +int cbe_cpufreq_set_pmode(int cpu, unsigned int pmode) +{ + struct cbe_pmd_regs __iomem *pmd_regs; + struct cbe_mic_tm_regs __iomem *mic_tm_regs; + u64 flags; + u64 value; +#ifdef DEBUG + long time; +#endif + + local_irq_save(flags); + + mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu); + pmd_regs = cbe_get_cpu_pmd_regs(cpu); + +#ifdef DEBUG + time = jiffies; +#endif + + out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]); + out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]); + + out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]); + out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]); + + value = in_be64(&pmd_regs->pmcr); + /* set bits to zero */ + value &= 0xFFFFFFFFFFFFFFF8ull; + /* set bits to next pmode */ + value |= pmode; + + out_be64(&pmd_regs->pmcr, value); + +#ifdef DEBUG + /* wait until new pmode appears in status register */ + value = in_be64(&pmd_regs->pmsr) & 0x07; + while (value != pmode) { + cpu_relax(); + value = in_be64(&pmd_regs->pmsr) & 0x07; + } + + time = jiffies - time; + time = jiffies_to_msecs(time); + pr_debug("had to wait %lu ms for a transition using " \ + "pervasive unit\n", time); +#endif + local_irq_restore(flags); + + return 0; +} + + +int cbe_cpufreq_get_pmode(int cpu) +{ + int ret; + struct cbe_pmd_regs __iomem *pmd_regs; + + pmd_regs = cbe_get_cpu_pmd_regs(cpu); + ret = in_be64(&pmd_regs->pmsr) & 0x07; + + return ret; +} + diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c new file mode 100644 index 0000000..fc6f389 --- /dev/null +++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c @@ -0,0 +1,148 @@ +/* + * pmi backend for the cbe_cpufreq driver + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007 + * + * Author: Christian Krafft <krafft@de.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <asm/of_platform.h> +#include <asm/processor.h> +#include <asm/prom.h> +#include <asm/pmi.h> + +#ifdef DEBUG +#include <asm/time.h> +#endif + +#include "cbe_regs.h" +#include "cbe_cpufreq.h" + +static u8 pmi_slow_mode_limit[MAX_CBE]; + +bool cbe_cpufreq_has_pmi = false; +EXPORT_SYMBOL_GPL(cbe_cpufreq_has_pmi); + +/* + * hardware specific functions + */ + +int cbe_cpufreq_set_pmode_pmi(int cpu, unsigned int pmode) +{ + int ret; + pmi_message_t pmi_msg; +#ifdef DEBUG + long time; +#endif + pmi_msg.type = PMI_TYPE_FREQ_CHANGE; + pmi_msg.data1 = cbe_cpu_to_node(cpu); + pmi_msg.data2 = pmode; + +#ifdef DEBUG + time = jiffies; +#endif + pmi_send_message(pmi_msg); + +#ifdef DEBUG + time = jiffies - time; + time = jiffies_to_msecs(time); + pr_debug("had to wait %lu ms for a transition using " \ + "PMI\n", time); +#endif + ret = pmi_msg.data2; + pr_debug("PMI returned slow mode %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_GPL(cbe_cpufreq_set_pmode_pmi); + + +static void cbe_cpufreq_handle_pmi(pmi_message_t pmi_msg) +{ + u8 node, slow_mode; + + BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE); + + node = pmi_msg.data1; + slow_mode = pmi_msg.data2; + + pmi_slow_mode_limit[node] = slow_mode; + + pr_debug("cbe_handle_pmi: node: %d max_freq: %d\n", node, slow_mode); +} + +static int pmi_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct cpufreq_policy *policy = data; + struct cpufreq_frequency_table *cbe_freqs; + u8 node; + + cbe_freqs = cpufreq_frequency_get_table(policy->cpu); + node = cbe_cpu_to_node(policy->cpu); + + pr_debug("got notified, event=%lu, node=%u\n", event, node); + + if (pmi_slow_mode_limit[node] != 0) { + pr_debug("limiting node %d to slow mode %d\n", + node, pmi_slow_mode_limit[node]); + + cpufreq_verify_within_limits(policy, 0, + + cbe_freqs[pmi_slow_mode_limit[node]].frequency); + } + + return 0; +} + +static struct notifier_block pmi_notifier_block = { + .notifier_call = pmi_notifier, +}; + +static struct pmi_handler cbe_pmi_handler = { + .type = PMI_TYPE_FREQ_CHANGE, + .handle_pmi_message = cbe_cpufreq_handle_pmi, +}; + + + +static int __init cbe_cpufreq_pmi_init(void) +{ + cbe_cpufreq_has_pmi = pmi_register_handler(&cbe_pmi_handler) == 0; + + if (!cbe_cpufreq_has_pmi) + return -ENODEV; + + cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER); + + return 0; +} + +static void __exit cbe_cpufreq_pmi_exit(void) +{ + cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER); + pmi_unregister_handler(&cbe_pmi_handler); +} + +module_init(cbe_cpufreq_pmi_init); +module_exit(cbe_cpufreq_pmi_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>"); diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c index 12c9674..c8f7f00 100644 --- a/arch/powerpc/platforms/cell/cbe_regs.c +++ b/arch/powerpc/platforms/cell/cbe_regs.c @@ -174,6 +174,13 @@ static struct device_node *cbe_get_be_node(int cpu_id) cpu_handle = of_get_property(np, "cpus", &len); + /* + * the CAB SLOF tree is non compliant, so we just assume + * there is only one node + */ + if (WARN_ON_ONCE(!cpu_handle)) + return np; + for (i=0; i<len; i++) if (of_find_node_by_phandle(cpu_handle[i]) == of_get_cpu_node(cpu_id, NULL)) return np; diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c index f370f0f..e4132f8 100644 --- a/arch/powerpc/platforms/cell/cbe_thermal.c +++ b/arch/powerpc/platforms/cell/cbe_thermal.c @@ -292,7 +292,7 @@ static struct attribute_group ppe_attribute_group = { /* * initialize throttling with default values */ -static void __init init_default_values(void) +static int __init init_default_values(void) { int cpu; struct cbe_pmd_regs __iomem *pmd_regs; @@ -339,25 +339,40 @@ static void __init init_default_values(void) for_each_possible_cpu (cpu) { pr_debug("processing cpu %d\n", cpu); sysdev = get_cpu_sysdev(cpu); + + if (!sysdev) { + pr_info("invalid sysdev pointer for cbe_thermal\n"); + return -EINVAL; + } + pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id); + if (!pmd_regs) { + pr_info("invalid CBE regs pointer for cbe_thermal\n"); + return -EINVAL; + } + out_be64(&pmd_regs->tm_str2, str2); out_be64(&pmd_regs->tm_str1.val, str1.val); out_be64(&pmd_regs->tm_tpr.val, tpr.val); out_be64(&pmd_regs->tm_cr1.val, cr1.val); out_be64(&pmd_regs->tm_cr2, cr2); } + + return 0; } static int __init thermal_init(void) { - init_default_values(); + int rc = init_default_values(); - spu_add_sysdev_attr_group(&spu_attribute_group); - cpu_add_sysdev_attr_group(&ppe_attribute_group); + if (rc == 0) { + spu_add_sysdev_attr_group(&spu_attribute_group); + cpu_add_sysdev_attr_group(&ppe_attribute_group); + } - return 0; + return rc; } module_init(thermal_init); diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 96a8f60..9012422 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -35,18 +35,37 @@ #include <asm/spu.h> #include <asm/spu_priv1.h> #include <asm/xmon.h> +#include <asm/prom.h> +#include "spu_priv1_mmio.h" const struct spu_management_ops *spu_management_ops; EXPORT_SYMBOL_GPL(spu_management_ops); const struct spu_priv1_ops *spu_priv1_ops; +EXPORT_SYMBOL_GPL(spu_priv1_ops); -static struct list_head spu_list[MAX_NUMNODES]; -static LIST_HEAD(spu_full_list); -static DEFINE_MUTEX(spu_mutex); -static DEFINE_SPINLOCK(spu_list_lock); +struct cbe_spu_info cbe_spu_info[MAX_NUMNODES]; +EXPORT_SYMBOL_GPL(cbe_spu_info); -EXPORT_SYMBOL_GPL(spu_priv1_ops); +/* + * Protects cbe_spu_info and spu->number. + */ +static DEFINE_SPINLOCK(spu_lock); + +/* + * List of all spus in the system. + * + * This list is iterated by callers from irq context and callers that + * want to sleep. Thus modifications need to be done with both + * spu_full_list_lock and spu_full_list_mutex held, while iterating + * through it requires either of these locks. + * + * In addition spu_full_list_lock protects all assignmens to + * spu->mm. + */ +static LIST_HEAD(spu_full_list); +static DEFINE_SPINLOCK(spu_full_list_lock); +static DEFINE_MUTEX(spu_full_list_mutex); void spu_invalidate_slbs(struct spu *spu) { @@ -65,12 +84,12 @@ void spu_flush_all_slbs(struct mm_struct *mm) struct spu *spu; unsigned long flags; - spin_lock_irqsave(&spu_list_lock, flags); + spin_lock_irqsave(&spu_full_list_lock, flags); list_for_each_entry(spu, &spu_full_list, full_list) { if (spu->mm == mm) spu_invalidate_slbs(spu); } - spin_unlock_irqrestore(&spu_list_lock, flags); + spin_unlock_irqrestore(&spu_full_list_lock, flags); } /* The hack below stinks... try to do something better one of @@ -88,9 +107,9 @@ void spu_associate_mm(struct spu *spu, struct mm_struct *mm) { unsigned long flags; - spin_lock_irqsave(&spu_list_lock, flags); + spin_lock_irqsave(&spu_full_list_lock, flags); spu->mm = mm; - spin_unlock_irqrestore(&spu_list_lock, flags); + spin_unlock_irqrestore(&spu_full_list_lock, flags); if (mm) mm_needs_global_tlbie(mm); } @@ -390,7 +409,7 @@ static void spu_free_irqs(struct spu *spu) free_irq(spu->irqs[2], spu); } -static void spu_init_channels(struct spu *spu) +void spu_init_channels(struct spu *spu) { static const struct { unsigned channel; @@ -423,46 +442,7 @@ static void spu_init_channels(struct spu *spu) out_be64(&priv2->spu_chnlcnt_RW, count_list[i].count); } } - -struct spu *spu_alloc_node(int node) -{ - struct spu *spu = NULL; - - mutex_lock(&spu_mutex); - if (!list_empty(&spu_list[node])) { - spu = list_entry(spu_list[node].next, struct spu, list); - list_del_init(&spu->list); - pr_debug("Got SPU %d %d\n", spu->number, spu->node); - } - mutex_unlock(&spu_mutex); - - if (spu) - spu_init_channels(spu); - return spu; -} -EXPORT_SYMBOL_GPL(spu_alloc_node); - -struct spu *spu_alloc(void) -{ - struct spu *spu = NULL; - int node; - - for (node = 0; node < MAX_NUMNODES; node++) { - spu = spu_alloc_node(node); - if (spu) - break; - } - - return spu; -} - -void spu_free(struct spu *spu) -{ - mutex_lock(&spu_mutex); - list_add_tail(&spu->list, &spu_list[spu->node]); - mutex_unlock(&spu_mutex); -} -EXPORT_SYMBOL_GPL(spu_free); +EXPORT_SYMBOL_GPL(spu_init_channels); static int spu_shutdown(struct sys_device *sysdev) { @@ -481,12 +461,12 @@ struct sysdev_class spu_sysdev_class = { int spu_add_sysdev_attr(struct sysdev_attribute *attr) { struct spu *spu; - mutex_lock(&spu_mutex); + mutex_lock(&spu_full_list_mutex); list_for_each_entry(spu, &spu_full_list, full_list) sysdev_create_file(&spu->sysdev, attr); + mutex_unlock(&spu_full_list_mutex); - mutex_unlock(&spu_mutex); return 0; } EXPORT_SYMBOL_GPL(spu_add_sysdev_attr); @@ -494,12 +474,12 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr); int spu_add_sysdev_attr_group(struct attribute_group *attrs) { struct spu *spu; - mutex_lock(&spu_mutex); + mutex_lock(&spu_full_list_mutex); list_for_each_entry(spu, &spu_full_list, full_list) sysfs_create_group(&spu->sysdev.kobj, attrs); + mutex_unlock(&spu_full_list_mutex); - mutex_unlock(&spu_mutex); return 0; } EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group); @@ -508,24 +488,22 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group); void spu_remove_sysdev_attr(struct sysdev_attribute *attr) { struct spu *spu; - mutex_lock(&spu_mutex); + mutex_lock(&spu_full_list_mutex); list_for_each_entry(spu, &spu_full_list, full_list) sysdev_remove_file(&spu->sysdev, attr); - - mutex_unlock(&spu_mutex); + mutex_unlock(&spu_full_list_mutex); } EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr); void spu_remove_sysdev_attr_group(struct attribute_group *attrs) { struct spu *spu; - mutex_lock(&spu_mutex); + mutex_lock(&spu_full_list_mutex); list_for_each_entry(spu, &spu_full_list, full_list) sysfs_remove_group(&spu->sysdev.kobj, attrs); - - mutex_unlock(&spu_mutex); + mutex_unlock(&spu_full_list_mutex); } EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr_group); @@ -553,16 +531,19 @@ static int __init create_spu(void *data) int ret; static int number; unsigned long flags; + struct timespec ts; ret = -ENOMEM; spu = kzalloc(sizeof (*spu), GFP_KERNEL); if (!spu) goto out; + spu->alloc_state = SPU_FREE; + spin_lock_init(&spu->register_lock); - mutex_lock(&spu_mutex); + spin_lock(&spu_lock); spu->number = number++; - mutex_unlock(&spu_mutex); + spin_unlock(&spu_lock); ret = spu_create_spu(spu, data); @@ -579,15 +560,22 @@ static int __init create_spu(void *data) if (ret) goto out_free_irqs; - mutex_lock(&spu_mutex); - spin_lock_irqsave(&spu_list_lock, flags); - list_add(&spu->list, &spu_list[spu->node]); + mutex_lock(&cbe_spu_info[spu->node].list_mutex); + list_add(&spu->cbe_list, &cbe_spu_info[spu->node].spus); + cbe_spu_info[spu->node].n_spus++; + mutex_unlock(&cbe_spu_info[spu->node].list_mutex); + + mutex_lock(&spu_full_list_mutex); + spin_lock_irqsave(&spu_full_list_lock, flags); list_add(&spu->full_list, &spu_full_list); - spin_unlock_irqrestore(&spu_list_lock, flags); - mutex_unlock(&spu_mutex); + spin_unlock_irqrestore(&spu_full_list_lock, flags); + mutex_unlock(&spu_full_list_mutex); + + spu->stats.util_state = SPU_UTIL_IDLE_LOADED; + ktime_get_ts(&ts); + spu->stats.tstamp = timespec_to_ns(&ts); - spu->stats.utilization_state = SPU_UTIL_IDLE; - spu->stats.tstamp = jiffies; + INIT_LIST_HEAD(&spu->aff_list); goto out; @@ -608,12 +596,20 @@ static const char *spu_state_names[] = { static unsigned long long spu_acct_time(struct spu *spu, enum spu_utilization_state state) { + struct timespec ts; unsigned long long time = spu->stats.times[state]; - if (spu->stats.utilization_state == state) - time += jiffies - spu->stats.tstamp; + /* + * If the spu is idle or the context is stopped, utilization + * statistics are not updated. Apply the time delta from the + * last recorded state of the spu. + */ + if (spu->stats.util_state == state) { + ktime_get_ts(&ts); + time += timespec_to_ns(&ts) - spu->stats.tstamp; + } - return jiffies_to_msecs(time); + return time / NSEC_PER_MSEC; } @@ -623,11 +619,11 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf) return sprintf(buf, "%s %llu %llu %llu %llu " "%llu %llu %llu %llu %llu %llu %llu %llu\n", - spu_state_names[spu->stats.utilization_state], + spu_state_names[spu->stats.util_state], spu_acct_time(spu, SPU_UTIL_USER), spu_acct_time(spu, SPU_UTIL_SYSTEM), spu_acct_time(spu, SPU_UTIL_IOWAIT), - spu_acct_time(spu, SPU_UTIL_IDLE), + spu_acct_time(spu, SPU_UTIL_IDLE_LOADED), spu->stats.vol_ctx_switch, spu->stats.invol_ctx_switch, spu->stats.slb_flt, @@ -640,12 +636,146 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf) static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL); +/* Hardcoded affinity idxs for QS20 */ +#define SPES_PER_BE 8 +static int QS20_reg_idxs[SPES_PER_BE] = { 0, 2, 4, 6, 7, 5, 3, 1 }; +static int QS20_reg_memory[SPES_PER_BE] = { 1, 1, 0, 0, 0, 0, 0, 0 }; + +static struct spu *spu_lookup_reg(int node, u32 reg) +{ + struct spu *spu; + + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) { + if (*(u32 *)get_property(spu_devnode(spu), "reg", NULL) == reg) + return spu; + } + return NULL; +} + +static void init_aff_QS20_harcoded(void) +{ + int node, i; + struct spu *last_spu, *spu; + u32 reg; + + for (node = 0; node < MAX_NUMNODES; node++) { + last_spu = NULL; + for (i = 0; i < SPES_PER_BE; i++) { + reg = QS20_reg_idxs[i]; + spu = spu_lookup_reg(node, reg); + if (!spu) + continue; + spu->has_mem_affinity = QS20_reg_memory[reg]; + if (last_spu) + list_add_tail(&spu->aff_list, + &last_spu->aff_list); + last_spu = spu; + } + } +} + +static int of_has_vicinity(void) +{ + struct spu* spu; + + spu = list_entry(cbe_spu_info[0].spus.next, struct spu, cbe_list); + return of_find_property(spu_devnode(spu), "vicinity", NULL) != NULL; +} + +static struct spu *aff_devnode_spu(int cbe, struct device_node *dn) +{ + struct spu *spu; + + list_for_each_entry(spu, &cbe_spu_info[cbe].spus, cbe_list) + if (spu_devnode(spu) == dn) + return spu; + return NULL; +} + +static struct spu * +aff_node_next_to(int cbe, struct device_node *target, struct device_node *avoid) +{ + struct spu *spu; + const phandle *vic_handles; + int lenp, i; + + list_for_each_entry(spu, &cbe_spu_info[cbe].spus, cbe_list) { + if (spu_devnode(spu) == avoid) + continue; + vic_handles = get_property(spu_devnode(spu), "vicinity", &lenp); + for (i=0; i < (lenp / sizeof(phandle)); i++) { + if (vic_handles[i] == target->linux_phandle) + return spu; + } + } + return NULL; +} + +static void init_aff_fw_vicinity_node(int cbe) +{ + struct spu *spu, *last_spu; + struct device_node *vic_dn, *last_spu_dn; + phandle avoid_ph; + const phandle *vic_handles; + const char *name; + int lenp, i, added, mem_aff; + + last_spu = list_entry(cbe_spu_info[cbe].spus.next, struct spu, cbe_list); + avoid_ph = 0; + for (added = 1; added < cbe_spu_info[cbe].n_spus; added++) { + last_spu_dn = spu_devnode(last_spu); + vic_handles = get_property(last_spu_dn, "vicinity", &lenp); + + for (i = 0; i < (lenp / sizeof(phandle)); i++) { + if (vic_handles[i] == avoid_ph) + continue; + + vic_dn = of_find_node_by_phandle(vic_handles[i]); + if (!vic_dn) + continue; + + name = get_property(vic_dn, "name", NULL); + if (strcmp(name, "spe") == 0) { + spu = aff_devnode_spu(cbe, vic_dn); + avoid_ph = last_spu_dn->linux_phandle; + } + else { + mem_aff = strcmp(name, "mic-tm") == 0; + spu = aff_node_next_to(cbe, vic_dn, last_spu_dn); + if (!spu) + continue; + if (mem_aff) { + last_spu->has_mem_affinity = 1; + spu->has_mem_affinity = 1; + } + avoid_ph = vic_dn->linux_phandle; + } + list_add_tail(&spu->aff_list, &last_spu->aff_list); + last_spu = spu; + break; + } + } +} + +static void init_aff_fw_vicinity(void) +{ + int cbe; + + /* sets has_mem_affinity for each spu, as long as the + * spu->aff_list list, linking each spu to its neighbors + */ + for (cbe = 0; cbe < MAX_NUMNODES; cbe++) + init_aff_fw_vicinity_node(cbe); +} + static int __init init_spu_base(void) { int i, ret = 0; - for (i = 0; i < MAX_NUMNODES; i++) - INIT_LIST_HEAD(&spu_list[i]); + for (i = 0; i < MAX_NUMNODES; i++) { + mutex_init(&cbe_spu_info[i].list_mutex); + INIT_LIST_HEAD(&cbe_spu_info[i].spus); + } if (!spu_management_ops) goto out; @@ -675,16 +805,25 @@ static int __init init_spu_base(void) fb_append_extra_logo(&logo_spe_clut224, ret); } + mutex_lock(&spu_full_list_mutex); xmon_register_spus(&spu_full_list); - + crash_register_spus(&spu_full_list); + mutex_unlock(&spu_full_list_mutex); spu_add_sysdev_attr(&attr_stat); + if (of_has_vicinity()) { + init_aff_fw_vicinity(); + } else { + long root = of_get_flat_dt_root(); + if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0")) + init_aff_QS20_harcoded(); + } + return 0; out_unregister_sysdev_class: sysdev_class_unregister(&spu_sysdev_class); out: - return ret; } module_init(init_spu_base); diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c index 261b507..dd2c668 100644 --- a/arch/powerpc/platforms/cell/spu_syscalls.c +++ b/arch/powerpc/platforms/cell/spu_syscalls.c @@ -34,14 +34,27 @@ struct spufs_calls spufs_calls = { * this file is not used and the syscalls directly enter the fs code */ asmlinkage long sys_spu_create(const char __user *name, - unsigned int flags, mode_t mode) + unsigned int flags, mode_t mode, int neighbor_fd) { long ret; struct module *owner = spufs_calls.owner; + struct file *neighbor; + int fput_needed; ret = -ENOSYS; if (owner && try_module_get(owner)) { - ret = spufs_calls.create_thread(name, flags, mode); + if (flags & SPU_CREATE_AFFINITY_SPU) { + neighbor = fget_light(neighbor_fd, &fput_needed); + if (neighbor) { + ret = spufs_calls.create_thread(name, flags, + mode, neighbor); + fput_light(neighbor, fput_needed); + } + } + else { + ret = spufs_calls.create_thread(name, flags, + mode, NULL); + } module_put(owner); } return ret; diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index 6d7bd60..6694f86 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/module.h> #include <linux/slab.h> #include <asm/atomic.h> #include <asm/spu.h> @@ -55,12 +56,12 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang) ctx->ops = &spu_backing_ops; ctx->owner = get_task_mm(current); INIT_LIST_HEAD(&ctx->rq); + INIT_LIST_HEAD(&ctx->aff_list); if (gang) spu_gang_add_ctx(gang, ctx); ctx->cpus_allowed = current->cpus_allowed; spu_set_timeslice(ctx); - ctx->stats.execution_state = SPUCTX_UTIL_USER; - ctx->stats.tstamp = jiffies; + ctx->stats.util_state = SPU_UTIL_IDLE_LOADED; atomic_inc(&nr_spu_contexts); goto out; @@ -81,6 +82,8 @@ void destroy_spu_context(struct kref *kref) spu_fini_csa(&ctx->csa); if (ctx->gang) spu_gang_remove_ctx(ctx->gang, ctx); + if (ctx->prof_priv_kref) + kref_put(ctx->prof_priv_kref, ctx->prof_priv_release); BUG_ON(!list_empty(&ctx->rq)); atomic_dec(&nr_spu_contexts); kfree(ctx); @@ -166,6 +169,39 @@ int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags) void spu_acquire_saved(struct spu_context *ctx) { spu_acquire(ctx); - if (ctx->state != SPU_STATE_SAVED) + if (ctx->state != SPU_STATE_SAVED) { + set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags); spu_deactivate(ctx); + } +} + +/** + * spu_release_saved - unlock spu context and return it to the runqueue + * @ctx: context to unlock + */ +void spu_release_saved(struct spu_context *ctx) +{ + BUG_ON(ctx->state != SPU_STATE_SAVED); + + if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags)) + spu_activate(ctx, 0); + + spu_release(ctx); } + +void spu_set_profile_private_kref(struct spu_context *ctx, + struct kref *prof_info_kref, + void ( * prof_info_release) (struct kref *kref)) +{ + ctx->prof_priv_kref = prof_info_kref; + ctx->prof_priv_release = prof_info_release; +} +EXPORT_SYMBOL_GPL(spu_set_profile_private_kref); + +void *spu_get_profile_private_kref(struct spu_context *ctx) +{ + return ctx->prof_priv_kref; +} +EXPORT_SYMBOL_GPL(spu_get_profile_private_kref); + + diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 5d9ad5a..5e31799 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -226,7 +226,7 @@ static void spufs_arch_write_notes(struct file *file) spu_acquire_saved(ctx_info->ctx); for (j = 0; j < spufs_coredump_num_notes; j++) spufs_arch_write_note(ctx_info, j, file); - spu_release(ctx_info->ctx); + spu_release_saved(ctx_info->ctx); list_del(&ctx_info->list); kfree(ctx_info); } diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index f53a074..917eab4 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -179,16 +179,14 @@ int spufs_handle_class1(struct spu_context *ctx) if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))) return 0; - spuctx_switch_state(ctx, SPUCTX_UTIL_IOWAIT); + spuctx_switch_state(ctx, SPU_UTIL_IOWAIT); pr_debug("ctx %p: ea %016lx, dsisr %016lx state %d\n", ctx, ea, dsisr, ctx->state); ctx->stats.hash_flt++; - if (ctx->state == SPU_STATE_RUNNABLE) { + if (ctx->state == SPU_STATE_RUNNABLE) ctx->spu->stats.hash_flt++; - spu_switch_state(ctx->spu, SPU_UTIL_IOWAIT); - } /* we must not hold the lock when entering spu_handle_mm_fault */ spu_release(ctx); @@ -226,7 +224,7 @@ int spufs_handle_class1(struct spu_context *ctx) } else spufs_handle_dma_error(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE); - spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); + spuctx_switch_state(ctx, SPU_UTIL_SYSTEM); return ret; } EXPORT_SYMBOL_GPL(spufs_handle_class1); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index c2814ea..7de4e91 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -370,7 +370,7 @@ spufs_regs_read(struct file *file, char __user *buffer, spu_acquire_saved(ctx); ret = __spufs_regs_read(ctx, buffer, size, pos); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -392,7 +392,7 @@ spufs_regs_write(struct file *file, const char __user *buffer, ret = copy_from_user(lscsa->gprs + *pos - size, buffer, size) ? -EFAULT : size; - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -421,7 +421,7 @@ spufs_fpcr_read(struct file *file, char __user * buffer, spu_acquire_saved(ctx); ret = __spufs_fpcr_read(ctx, buffer, size, pos); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -443,7 +443,7 @@ spufs_fpcr_write(struct file *file, const char __user * buffer, ret = copy_from_user((char *)&lscsa->fpcr + *pos - size, buffer, size) ? -EFAULT : size; - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -868,7 +868,7 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf, spu_acquire_saved(ctx); ret = __spufs_signal1_read(ctx, buf, len, pos); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -934,6 +934,13 @@ static const struct file_operations spufs_signal1_fops = { .mmap = spufs_signal1_mmap, }; +static const struct file_operations spufs_signal1_nosched_fops = { + .open = spufs_signal1_open, + .release = spufs_signal1_release, + .write = spufs_signal1_write, + .mmap = spufs_signal1_mmap, +}; + static int spufs_signal2_open(struct inode *inode, struct file *file) { struct spufs_inode_info *i = SPUFS_I(inode); @@ -992,7 +999,7 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf, spu_acquire_saved(ctx); ret = __spufs_signal2_read(ctx, buf, len, pos); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -1062,6 +1069,13 @@ static const struct file_operations spufs_signal2_fops = { .mmap = spufs_signal2_mmap, }; +static const struct file_operations spufs_signal2_nosched_fops = { + .open = spufs_signal2_open, + .release = spufs_signal2_release, + .write = spufs_signal2_write, + .mmap = spufs_signal2_mmap, +}; + static void spufs_signal1_type_set(void *data, u64 val) { struct spu_context *ctx = data; @@ -1612,7 +1626,7 @@ static void spufs_decr_set(void *data, u64 val) struct spu_lscsa *lscsa = ctx->csa.lscsa; spu_acquire_saved(ctx); lscsa->decr.slot[0] = (u32) val; - spu_release(ctx); + spu_release_saved(ctx); } static u64 __spufs_decr_get(void *data) @@ -1628,7 +1642,7 @@ static u64 spufs_decr_get(void *data) u64 ret; spu_acquire_saved(ctx); ret = __spufs_decr_get(data); - spu_release(ctx); + spu_release_saved(ctx); return ret; } DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set, @@ -1637,17 +1651,21 @@ DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set, static void spufs_decr_status_set(void *data, u64 val) { struct spu_context *ctx = data; - struct spu_lscsa *lscsa = ctx->csa.lscsa; spu_acquire_saved(ctx); - lscsa->decr_status.slot[0] = (u32) val; - spu_release(ctx); + if (val) + ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING; + else + ctx->csa.priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING; + spu_release_saved(ctx); } static u64 __spufs_decr_status_get(void *data) { struct spu_context *ctx = data; - struct spu_lscsa *lscsa = ctx->csa.lscsa; - return lscsa->decr_status.slot[0]; + if (ctx->csa.priv2.mfc_control_RW & MFC_CNTL_DECREMENTER_RUNNING) + return SPU_DECR_STATUS_RUNNING; + else + return 0; } static u64 spufs_decr_status_get(void *data) @@ -1656,7 +1674,7 @@ static u64 spufs_decr_status_get(void *data) u64 ret; spu_acquire_saved(ctx); ret = __spufs_decr_status_get(data); - spu_release(ctx); + spu_release_saved(ctx); return ret; } DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get, @@ -1668,7 +1686,7 @@ static void spufs_event_mask_set(void *data, u64 val) struct spu_lscsa *lscsa = ctx->csa.lscsa; spu_acquire_saved(ctx); lscsa->event_mask.slot[0] = (u32) val; - spu_release(ctx); + spu_release_saved(ctx); } static u64 __spufs_event_mask_get(void *data) @@ -1684,7 +1702,7 @@ static u64 spufs_event_mask_get(void *data) u64 ret; spu_acquire_saved(ctx); ret = __spufs_event_mask_get(data); - spu_release(ctx); + spu_release_saved(ctx); return ret; } DEFINE_SIMPLE_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get, @@ -1708,7 +1726,7 @@ static u64 spufs_event_status_get(void *data) spu_acquire_saved(ctx); ret = __spufs_event_status_get(data); - spu_release(ctx); + spu_release_saved(ctx); return ret; } DEFINE_SIMPLE_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get, @@ -1720,7 +1738,7 @@ static void spufs_srr0_set(void *data, u64 val) struct spu_lscsa *lscsa = ctx->csa.lscsa; spu_acquire_saved(ctx); lscsa->srr0.slot[0] = (u32) val; - spu_release(ctx); + spu_release_saved(ctx); } static u64 spufs_srr0_get(void *data) @@ -1730,7 +1748,7 @@ static u64 spufs_srr0_get(void *data) u64 ret; spu_acquire_saved(ctx); ret = lscsa->srr0.slot[0]; - spu_release(ctx); + spu_release_saved(ctx); return ret; } DEFINE_SIMPLE_ATTRIBUTE(spufs_srr0_ops, spufs_srr0_get, spufs_srr0_set, @@ -1786,7 +1804,7 @@ static u64 spufs_lslr_get(void *data) spu_acquire_saved(ctx); ret = __spufs_lslr_get(data); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -1850,7 +1868,7 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf, spin_lock(&ctx->csa.register_lock); ret = __spufs_mbox_info_read(ctx, buf, len, pos); spin_unlock(&ctx->csa.register_lock); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -1888,7 +1906,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf, spin_lock(&ctx->csa.register_lock); ret = __spufs_ibox_info_read(ctx, buf, len, pos); spin_unlock(&ctx->csa.register_lock); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -1929,7 +1947,7 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf, spin_lock(&ctx->csa.register_lock); ret = __spufs_wbox_info_read(ctx, buf, len, pos); spin_unlock(&ctx->csa.register_lock); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -1979,7 +1997,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf, spin_lock(&ctx->csa.register_lock); ret = __spufs_dma_info_read(ctx, buf, len, pos); spin_unlock(&ctx->csa.register_lock); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -2030,7 +2048,7 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf, spin_lock(&ctx->csa.register_lock); ret = __spufs_proxydma_info_read(ctx, buf, len, pos); spin_unlock(&ctx->csa.register_lock); - spu_release(ctx); + spu_release_saved(ctx); return ret; } @@ -2065,14 +2083,26 @@ static const char *ctx_state_names[] = { }; static unsigned long long spufs_acct_time(struct spu_context *ctx, - enum spuctx_execution_state state) + enum spu_utilization_state state) { - unsigned long time = ctx->stats.times[state]; + struct timespec ts; + unsigned long long time = ctx->stats.times[state]; - if (ctx->stats.execution_state == state) - time += jiffies - ctx->stats.tstamp; + /* + * In general, utilization statistics are updated by the controlling + * thread as the spu context moves through various well defined + * state transitions, but if the context is lazily loaded its + * utilization statistics are not updated as the controlling thread + * is not tightly coupled with the execution of the spu context. We + * calculate and apply the time delta from the last recorded state + * of the spu context. + */ + if (ctx->spu && ctx->stats.util_state == state) { + ktime_get_ts(&ts); + time += timespec_to_ns(&ts) - ctx->stats.tstamp; + } - return jiffies_to_msecs(time); + return time / NSEC_PER_MSEC; } static unsigned long long spufs_slb_flts(struct spu_context *ctx) @@ -2107,11 +2137,11 @@ static int spufs_show_stat(struct seq_file *s, void *private) spu_acquire(ctx); seq_printf(s, "%s %llu %llu %llu %llu " "%llu %llu %llu %llu %llu %llu %llu %llu\n", - ctx_state_names[ctx->stats.execution_state], - spufs_acct_time(ctx, SPUCTX_UTIL_USER), - spufs_acct_time(ctx, SPUCTX_UTIL_SYSTEM), - spufs_acct_time(ctx, SPUCTX_UTIL_IOWAIT), - spufs_acct_time(ctx, SPUCTX_UTIL_LOADED), + ctx_state_names[ctx->stats.util_state], + spufs_acct_time(ctx, SPU_UTIL_USER), + spufs_acct_time(ctx, SPU_UTIL_SYSTEM), + spufs_acct_time(ctx, SPU_UTIL_IOWAIT), + spufs_acct_time(ctx, SPU_UTIL_IDLE_LOADED), ctx->stats.vol_ctx_switch, ctx->stats.invol_ctx_switch, spufs_slb_flts(ctx), @@ -2184,8 +2214,8 @@ struct tree_descr spufs_dir_nosched_contents[] = { { "mbox_stat", &spufs_mbox_stat_fops, 0444, }, { "ibox_stat", &spufs_ibox_stat_fops, 0444, }, { "wbox_stat", &spufs_wbox_stat_fops, 0444, }, - { "signal1", &spufs_signal1_fops, 0666, }, - { "signal2", &spufs_signal2_fops, 0666, }, + { "signal1", &spufs_signal1_nosched_fops, 0222, }, + { "signal2", &spufs_signal2_nosched_fops, 0222, }, { "signal1_type", &spufs_signal1_type, 0666, }, { "signal2_type", &spufs_signal2_type, 0666, }, { "mss", &spufs_mss_fops, 0666, }, diff --git a/arch/powerpc/platforms/cell/spufs/gang.c b/arch/powerpc/platforms/cell/spufs/gang.c index 212ea78..71a4432 100644 --- a/arch/powerpc/platforms/cell/spufs/gang.c +++ b/arch/powerpc/platforms/cell/spufs/gang.c @@ -35,7 +35,9 @@ struct spu_gang *alloc_spu_gang(void) kref_init(&gang->kref); mutex_init(&gang->mutex); + mutex_init(&gang->aff_mutex); INIT_LIST_HEAD(&gang->list); + INIT_LIST_HEAD(&gang->aff_list_head); out: return gang; @@ -73,6 +75,10 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx) { mutex_lock(&gang->mutex); WARN_ON(ctx->gang != gang); + if (!list_empty(&ctx->aff_list)) { + list_del_init(&ctx->aff_list); + gang->aff_flags &= ~AFF_OFFSETS_SET; + } list_del_init(&ctx->gang_list); gang->contexts--; mutex_unlock(&gang->mutex); diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 7eb4d6c..b3d0dd1 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -316,11 +316,107 @@ out: return ret; } -static int spufs_create_context(struct inode *inode, - struct dentry *dentry, - struct vfsmount *mnt, int flags, int mode) +static struct spu_context * +spufs_assert_affinity(unsigned int flags, struct spu_gang *gang, + struct file *filp) +{ + struct spu_context *tmp, *neighbor; + int count, node; + int aff_supp; + + aff_supp = !list_empty(&(list_entry(cbe_spu_info[0].spus.next, + struct spu, cbe_list))->aff_list); + + if (!aff_supp) + return ERR_PTR(-EINVAL); + + if (flags & SPU_CREATE_GANG) + return ERR_PTR(-EINVAL); + + if (flags & SPU_CREATE_AFFINITY_MEM && + gang->aff_ref_ctx && + gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM) + return ERR_PTR(-EEXIST); + + if (gang->aff_flags & AFF_MERGED) + return ERR_PTR(-EBUSY); + + neighbor = NULL; + if (flags & SPU_CREATE_AFFINITY_SPU) { + if (!filp || filp->f_op != &spufs_context_fops) + return ERR_PTR(-EINVAL); + + neighbor = get_spu_context( + SPUFS_I(filp->f_dentry->d_inode)->i_ctx); + + if (!list_empty(&neighbor->aff_list) && !(neighbor->aff_head) && + !list_is_last(&neighbor->aff_list, &gang->aff_list_head) && + !list_entry(neighbor->aff_list.next, struct spu_context, + aff_list)->aff_head) + return ERR_PTR(-EEXIST); + + if (gang != neighbor->gang) + return ERR_PTR(-EINVAL); + + count = 1; + list_for_each_entry(tmp, &gang->aff_list_head, aff_list) + count++; + if (list_empty(&neighbor->aff_list)) + count++; + + for (node = 0; node < MAX_NUMNODES; node++) { + if ((cbe_spu_info[node].n_spus - atomic_read( + &cbe_spu_info[node].reserved_spus)) >= count) + break; + } + + if (node == MAX_NUMNODES) + return ERR_PTR(-EEXIST); + } + + return neighbor; +} + +static void +spufs_set_affinity(unsigned int flags, struct spu_context *ctx, + struct spu_context *neighbor) +{ + if (flags & SPU_CREATE_AFFINITY_MEM) + ctx->gang->aff_ref_ctx = ctx; + + if (flags & SPU_CREATE_AFFINITY_SPU) { + if (list_empty(&neighbor->aff_list)) { + list_add_tail(&neighbor->aff_list, + &ctx->gang->aff_list_head); + neighbor->aff_head = 1; + } + + if (list_is_last(&neighbor->aff_list, &ctx->gang->aff_list_head) + || list_entry(neighbor->aff_list.next, struct spu_context, + aff_list)->aff_head) { + list_add(&ctx->aff_list, &neighbor->aff_list); + } else { + list_add_tail(&ctx->aff_list, &neighbor->aff_list); + if (neighbor->aff_head) { + neighbor->aff_head = 0; + ctx->aff_head = 1; + } + } + + if (!ctx->gang->aff_ref_ctx) + ctx->gang->aff_ref_ctx = ctx; + } +} + +static int +spufs_create_context(struct inode *inode, struct dentry *dentry, + struct vfsmount *mnt, int flags, int mode, + struct file *aff_filp) { int ret; + int affinity; + struct spu_gang *gang; + struct spu_context *neighbor; ret = -EPERM; if ((flags & SPU_CREATE_NOSCHED) && @@ -336,9 +432,29 @@ static int spufs_create_context(struct inode *inode, if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader) goto out_unlock; + gang = NULL; + neighbor = NULL; + affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU); + if (affinity) { + gang = SPUFS_I(inode)->i_gang; + ret = -EINVAL; + if (!gang) + goto out_unlock; + mutex_lock(&gang->aff_mutex); + neighbor = spufs_assert_affinity(flags, gang, aff_filp); + if (IS_ERR(neighbor)) { + ret = PTR_ERR(neighbor); + goto out_aff_unlock; + } + } + ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO); if (ret) - goto out_unlock; + goto out_aff_unlock; + + if (affinity) + spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx, + neighbor); /* * get references for dget and mntget, will be released @@ -352,6 +468,9 @@ static int spufs_create_context(struct inode *inode, goto out; } +out_aff_unlock: + if (affinity) + mutex_unlock(&gang->aff_mutex); out_unlock: mutex_unlock(&inode->i_mutex); out: @@ -450,7 +569,8 @@ out: static struct file_system_type spufs_type; -long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode) +long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode, + struct file *filp) { struct dentry *dentry; int ret; @@ -487,7 +607,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode) dentry, nd->mnt, mode); else return spufs_create_context(nd->dentry->d_inode, - dentry, nd->mnt, flags, mode); + dentry, nd->mnt, flags, mode, filp); out_dput: dput(dentry); diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c index 58ae13b..0b50fa5 100644 --- a/arch/powerpc/platforms/cell/spufs/run.c +++ b/arch/powerpc/platforms/cell/spufs/run.c @@ -18,15 +18,17 @@ void spufs_stop_callback(struct spu *spu) wake_up_all(&ctx->stop_wq); } -static inline int spu_stopped(struct spu_context *ctx, u32 * stat) +static inline int spu_stopped(struct spu_context *ctx, u32 *stat) { struct spu *spu; u64 pte_fault; *stat = ctx->ops->status_read(ctx); - if (ctx->state != SPU_STATE_RUNNABLE) - return 1; + spu = ctx->spu; + if (ctx->state != SPU_STATE_RUNNABLE || + test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags)) + return 1; pte_fault = spu->dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED); return (!(*stat & SPU_STATUS_RUNNING) || pte_fault || spu->class_0_pending) ? @@ -124,8 +126,10 @@ out: return ret; } -static int spu_run_init(struct spu_context *ctx, u32 * npc) +static int spu_run_init(struct spu_context *ctx, u32 *npc) { + spuctx_switch_state(ctx, SPU_UTIL_SYSTEM); + if (ctx->flags & SPU_CREATE_ISOLATE) { unsigned long runcntl; @@ -151,16 +155,20 @@ static int spu_run_init(struct spu_context *ctx, u32 * npc) ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE); } + spuctx_switch_state(ctx, SPU_UTIL_USER); + return 0; } -static int spu_run_fini(struct spu_context *ctx, u32 * npc, - u32 * status) +static int spu_run_fini(struct spu_context *ctx, u32 *npc, + u32 *status) { int ret = 0; *status = ctx->ops->status_read(ctx); *npc = ctx->ops->npc_read(ctx); + + spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED); spu_release(ctx); if (signal_pending(current)) @@ -289,10 +297,10 @@ static inline int spu_process_events(struct spu_context *ctx) return ret; } -long spufs_run_spu(struct file *file, struct spu_context *ctx, - u32 *npc, u32 *event) +long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event) { int ret; + struct spu *spu; u32 status; if (mutex_lock_interruptible(&ctx->run_mutex)) @@ -328,6 +336,17 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx, ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, &status)); if (unlikely(ret)) break; + spu = ctx->spu; + if (unlikely(test_and_clear_bit(SPU_SCHED_NOTIFY_ACTIVE, + &ctx->sched_flags))) { + if (!(status & SPU_STATUS_STOPPED_BY_STOP)) { + spu_switch_notify(spu, ctx); + continue; + } + } + + spuctx_switch_state(ctx, SPU_UTIL_SYSTEM); + if ((status & SPU_STATUS_STOPPED_BY_STOP) && (status >> SPU_STOP_STATUS_SHIFT == 0x2104)) { ret = spu_process_callback(ctx); @@ -356,6 +375,7 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx, (ctx->state == SPU_STATE_RUNNABLE)) ctx->stats.libassist++; + ctx->ops->master_stop(ctx); ret = spu_run_fini(ctx, npc, &status); spu_yield(ctx); diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index e5b4dd1..227968b 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -51,9 +51,6 @@ struct spu_prio_array { DECLARE_BITMAP(bitmap, MAX_PRIO); struct list_head runq[MAX_PRIO]; spinlock_t runq_lock; - struct list_head active_list[MAX_NUMNODES]; - struct mutex active_mutex[MAX_NUMNODES]; - int nr_active[MAX_NUMNODES]; int nr_waiting; }; @@ -127,7 +124,7 @@ void __spu_update_sched_info(struct spu_context *ctx) ctx->policy = current->policy; /* - * A lot of places that don't hold active_mutex poke into + * A lot of places that don't hold list_mutex poke into * cpus_allowed, including grab_runnable_context which * already holds the runq_lock. So abuse runq_lock * to protect this field aswell. @@ -141,9 +138,9 @@ void spu_update_sched_info(struct spu_context *ctx) { int node = ctx->spu->node; - mutex_lock(&spu_prio->active_mutex[node]); + mutex_lock(&cbe_spu_info[node].list_mutex); __spu_update_sched_info(ctx); - mutex_unlock(&spu_prio->active_mutex[node]); + mutex_unlock(&cbe_spu_info[node].list_mutex); } static int __node_allowed(struct spu_context *ctx, int node) @@ -169,56 +166,56 @@ static int node_allowed(struct spu_context *ctx, int node) return rval; } -/** - * spu_add_to_active_list - add spu to active list - * @spu: spu to add to the active list - */ -static void spu_add_to_active_list(struct spu *spu) -{ - int node = spu->node; - - mutex_lock(&spu_prio->active_mutex[node]); - spu_prio->nr_active[node]++; - list_add_tail(&spu->list, &spu_prio->active_list[node]); - mutex_unlock(&spu_prio->active_mutex[node]); -} +static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier); -static void __spu_remove_from_active_list(struct spu *spu) +void spu_switch_notify(struct spu *spu, struct spu_context *ctx) { - list_del_init(&spu->list); - spu_prio->nr_active[spu->node]--; + blocking_notifier_call_chain(&spu_switch_notifier, + ctx ? ctx->object_id : 0, spu); } -/** - * spu_remove_from_active_list - remove spu from active list - * @spu: spu to remove from the active list - */ -static void spu_remove_from_active_list(struct spu *spu) +static void notify_spus_active(void) { - int node = spu->node; - - mutex_lock(&spu_prio->active_mutex[node]); - __spu_remove_from_active_list(spu); - mutex_unlock(&spu_prio->active_mutex[node]); -} + int node; -static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier); + /* + * Wake up the active spu_contexts. + * + * When the awakened processes see their "notify_active" flag is set, + * they will call spu_switch_notify(); + */ + for_each_online_node(node) { + struct spu *spu; -static void spu_switch_notify(struct spu *spu, struct spu_context *ctx) -{ - blocking_notifier_call_chain(&spu_switch_notifier, - ctx ? ctx->object_id : 0, spu); + mutex_lock(&cbe_spu_info[node].list_mutex); + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) { + if (spu->alloc_state != SPU_FREE) { + struct spu_context *ctx = spu->ctx; + set_bit(SPU_SCHED_NOTIFY_ACTIVE, + &ctx->sched_flags); + mb(); + wake_up_all(&ctx->stop_wq); + } + } + mutex_unlock(&cbe_spu_info[node].list_mutex); + } } int spu_switch_event_register(struct notifier_block * n) { - return blocking_notifier_chain_register(&spu_switch_notifier, n); + int ret; + ret = blocking_notifier_chain_register(&spu_switch_notifier, n); + if (!ret) + notify_spus_active(); + return ret; } +EXPORT_SYMBOL_GPL(spu_switch_event_register); int spu_switch_event_unregister(struct notifier_block * n) { return blocking_notifier_chain_unregister(&spu_switch_notifier, n); } +EXPORT_SYMBOL_GPL(spu_switch_event_unregister); /** * spu_bind_context - bind spu context to physical spu @@ -229,6 +226,12 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx) { pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid, spu->number, spu->node); + spuctx_switch_state(ctx, SPU_UTIL_SYSTEM); + + if (ctx->flags & SPU_CREATE_NOSCHED) + atomic_inc(&cbe_spu_info[spu->node].reserved_spus); + if (!list_empty(&ctx->aff_list)) + atomic_inc(&ctx->gang->aff_sched_count); ctx->stats.slb_flt_base = spu->stats.slb_flt; ctx->stats.class2_intr_base = spu->stats.class2_intr; @@ -238,6 +241,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx) ctx->spu = spu; ctx->ops = &spu_hw_ops; spu->pid = current->pid; + spu->tgid = current->tgid; spu_associate_mm(spu, ctx->owner); spu->ibox_callback = spufs_ibox_callback; spu->wbox_callback = spufs_wbox_callback; @@ -251,7 +255,153 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx) spu_cpu_affinity_set(spu, raw_smp_processor_id()); spu_switch_notify(spu, ctx); ctx->state = SPU_STATE_RUNNABLE; - spu_switch_state(spu, SPU_UTIL_SYSTEM); + + spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED); +} + +/* + * Must be used with the list_mutex held. + */ +static inline int sched_spu(struct spu *spu) +{ + BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex)); + + return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED)); +} + +static void aff_merge_remaining_ctxs(struct spu_gang *gang) +{ + struct spu_context *ctx; + + list_for_each_entry(ctx, &gang->aff_list_head, aff_list) { + if (list_empty(&ctx->aff_list)) + list_add(&ctx->aff_list, &gang->aff_list_head); + } + gang->aff_flags |= AFF_MERGED; +} + +static void aff_set_offsets(struct spu_gang *gang) +{ + struct spu_context *ctx; + int offset; + + offset = -1; + list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list, + aff_list) { + if (&ctx->aff_list == &gang->aff_list_head) + break; + ctx->aff_offset = offset--; + } + + offset = 0; + list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) { + if (&ctx->aff_list == &gang->aff_list_head) + break; + ctx->aff_offset = offset++; + } + + gang->aff_flags |= AFF_OFFSETS_SET; +} + +static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff, + int group_size, int lowest_offset) +{ + struct spu *spu; + int node, n; + + /* + * TODO: A better algorithm could be used to find a good spu to be + * used as reference location for the ctxs chain. + */ + node = cpu_to_node(raw_smp_processor_id()); + for (n = 0; n < MAX_NUMNODES; n++, node++) { + node = (node < MAX_NUMNODES) ? node : 0; + if (!node_allowed(ctx, node)) + continue; + mutex_lock(&cbe_spu_info[node].list_mutex); + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) { + if ((!mem_aff || spu->has_mem_affinity) && + sched_spu(spu)) { + mutex_unlock(&cbe_spu_info[node].list_mutex); + return spu; + } + } + mutex_unlock(&cbe_spu_info[node].list_mutex); + } + return NULL; +} + +static void aff_set_ref_point_location(struct spu_gang *gang) +{ + int mem_aff, gs, lowest_offset; + struct spu_context *ctx; + struct spu *tmp; + + mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM; + lowest_offset = 0; + gs = 0; + + list_for_each_entry(tmp, &gang->aff_list_head, aff_list) + gs++; + + list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list, + aff_list) { + if (&ctx->aff_list == &gang->aff_list_head) + break; + lowest_offset = ctx->aff_offset; + } + + gang->aff_ref_spu = aff_ref_location(ctx, mem_aff, gs, lowest_offset); +} + +static struct spu *ctx_location(struct spu *ref, int offset, int node) +{ + struct spu *spu; + + spu = NULL; + if (offset >= 0) { + list_for_each_entry(spu, ref->aff_list.prev, aff_list) { + BUG_ON(spu->node != node); + if (offset == 0) + break; + if (sched_spu(spu)) + offset--; + } + } else { + list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) { + BUG_ON(spu->node != node); + if (offset == 0) + break; + if (sched_spu(spu)) + offset++; + } + } + + return spu; +} + +/* + * affinity_check is called each time a context is going to be scheduled. + * It returns the spu ptr on which the context must run. + */ +static int has_affinity(struct spu_context *ctx) +{ + struct spu_gang *gang = ctx->gang; + + if (list_empty(&ctx->aff_list)) + return 0; + + mutex_lock(&gang->aff_mutex); + if (!gang->aff_ref_spu) { + if (!(gang->aff_flags & AFF_MERGED)) + aff_merge_remaining_ctxs(gang); + if (!(gang->aff_flags & AFF_OFFSETS_SET)) + aff_set_offsets(gang); + aff_set_ref_point_location(gang); + } + mutex_unlock(&gang->aff_mutex); + + return gang->aff_ref_spu != NULL; } /** @@ -263,9 +413,13 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx) { pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__, spu->pid, spu->number, spu->node); + spuctx_switch_state(ctx, SPU_UTIL_SYSTEM); - spu_switch_state(spu, SPU_UTIL_IDLE); - + if (spu->ctx->flags & SPU_CREATE_NOSCHED) + atomic_dec(&cbe_spu_info[spu->node].reserved_spus); + if (!list_empty(&ctx->aff_list)) + if (atomic_dec_and_test(&ctx->gang->aff_sched_count)) + ctx->gang->aff_ref_spu = NULL; spu_switch_notify(spu, NULL); spu_unmap_mappings(ctx); spu_save(&ctx->csa, spu); @@ -278,8 +432,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx) spu->dma_callback = NULL; spu_associate_mm(spu, NULL); spu->pid = 0; + spu->tgid = 0; ctx->ops = &spu_backing_ops; - ctx->spu = NULL; spu->flags = 0; spu->ctx = NULL; @@ -287,6 +441,10 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx) (spu->stats.slb_flt - ctx->stats.slb_flt_base); ctx->stats.class2_intr += (spu->stats.class2_intr - ctx->stats.class2_intr_base); + + /* This maps the underlying spu state to idle */ + spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED); + ctx->spu = NULL; } /** @@ -352,18 +510,41 @@ static void spu_prio_wait(struct spu_context *ctx) static struct spu *spu_get_idle(struct spu_context *ctx) { - struct spu *spu = NULL; - int node = cpu_to_node(raw_smp_processor_id()); - int n; + struct spu *spu; + int node, n; + + if (has_affinity(ctx)) { + node = ctx->gang->aff_ref_spu->node; + mutex_lock(&cbe_spu_info[node].list_mutex); + spu = ctx_location(ctx->gang->aff_ref_spu, ctx->aff_offset, node); + if (spu && spu->alloc_state == SPU_FREE) + goto found; + mutex_unlock(&cbe_spu_info[node].list_mutex); + return NULL; + } + + node = cpu_to_node(raw_smp_processor_id()); for (n = 0; n < MAX_NUMNODES; n++, node++) { node = (node < MAX_NUMNODES) ? node : 0; if (!node_allowed(ctx, node)) continue; - spu = spu_alloc_node(node); - if (spu) - break; + + mutex_lock(&cbe_spu_info[node].list_mutex); + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) { + if (spu->alloc_state == SPU_FREE) + goto found; + } + mutex_unlock(&cbe_spu_info[node].list_mutex); } + + return NULL; + + found: + spu->alloc_state = SPU_USED; + mutex_unlock(&cbe_spu_info[node].list_mutex); + pr_debug("Got SPU %d %d\n", spu->number, spu->node); + spu_init_channels(spu); return spu; } @@ -393,15 +574,15 @@ static struct spu *find_victim(struct spu_context *ctx) if (!node_allowed(ctx, node)) continue; - mutex_lock(&spu_prio->active_mutex[node]); - list_for_each_entry(spu, &spu_prio->active_list[node], list) { + mutex_lock(&cbe_spu_info[node].list_mutex); + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) { struct spu_context *tmp = spu->ctx; if (tmp->prio > ctx->prio && (!victim || tmp->prio > victim->prio)) victim = spu->ctx; } - mutex_unlock(&spu_prio->active_mutex[node]); + mutex_unlock(&cbe_spu_info[node].list_mutex); if (victim) { /* @@ -426,7 +607,11 @@ static struct spu *find_victim(struct spu_context *ctx) victim = NULL; goto restart; } - spu_remove_from_active_list(spu); + + mutex_lock(&cbe_spu_info[node].list_mutex); + cbe_spu_info[node].nr_active--; + mutex_unlock(&cbe_spu_info[node].list_mutex); + spu_unbind_context(spu, victim); victim->stats.invol_ctx_switch++; spu->stats.invol_ctx_switch++; @@ -455,8 +640,6 @@ static struct spu *find_victim(struct spu_context *ctx) */ int spu_activate(struct spu_context *ctx, unsigned long flags) { - spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); - do { struct spu *spu; @@ -477,8 +660,12 @@ int spu_activate(struct spu_context *ctx, unsigned long flags) if (!spu && rt_prio(ctx->prio)) spu = find_victim(ctx); if (spu) { + int node = spu->node; + + mutex_lock(&cbe_spu_info[node].list_mutex); spu_bind_context(spu, ctx); - spu_add_to_active_list(spu); + cbe_spu_info[node].nr_active++; + mutex_unlock(&cbe_spu_info[node].list_mutex); return 0; } @@ -500,7 +687,7 @@ static struct spu_context *grab_runnable_context(int prio, int node) int best; spin_lock(&spu_prio->runq_lock); - best = sched_find_first_bit(spu_prio->bitmap); + best = find_first_bit(spu_prio->bitmap, prio); while (best < prio) { struct list_head *rq = &spu_prio->runq[best]; @@ -527,11 +714,17 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio) if (spu) { new = grab_runnable_context(max_prio, spu->node); if (new || force) { - spu_remove_from_active_list(spu); + int node = spu->node; + + mutex_lock(&cbe_spu_info[node].list_mutex); spu_unbind_context(spu, ctx); + spu->alloc_state = SPU_FREE; + cbe_spu_info[node].nr_active--; + mutex_unlock(&cbe_spu_info[node].list_mutex); + ctx->stats.vol_ctx_switch++; spu->stats.vol_ctx_switch++; - spu_free(spu); + if (new) wake_up(&new->stop_wq); } @@ -550,21 +743,11 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio) */ void spu_deactivate(struct spu_context *ctx) { - /* - * We must never reach this for a nosched context, - * but handle the case gracefull instead of panicing. - */ - if (ctx->flags & SPU_CREATE_NOSCHED) { - WARN_ON(1); - return; - } - __spu_deactivate(ctx, 1, MAX_PRIO); - spuctx_switch_state(ctx, SPUCTX_UTIL_USER); } /** - * spu_yield - yield a physical spu if others are waiting + * spu_yield - yield a physical spu if others are waiting * @ctx: spu context to yield * * Check if there is a higher priority context waiting and if yes @@ -575,17 +758,12 @@ void spu_yield(struct spu_context *ctx) { if (!(ctx->flags & SPU_CREATE_NOSCHED)) { mutex_lock(&ctx->state_mutex); - if (__spu_deactivate(ctx, 0, MAX_PRIO)) - spuctx_switch_state(ctx, SPUCTX_UTIL_USER); - else { - spuctx_switch_state(ctx, SPUCTX_UTIL_LOADED); - spu_switch_state(ctx->spu, SPU_UTIL_USER); - } + __spu_deactivate(ctx, 0, MAX_PRIO); mutex_unlock(&ctx->state_mutex); } } -static void spusched_tick(struct spu_context *ctx) +static noinline void spusched_tick(struct spu_context *ctx) { if (ctx->flags & SPU_CREATE_NOSCHED) return; @@ -596,7 +774,7 @@ static void spusched_tick(struct spu_context *ctx) return; /* - * Unfortunately active_mutex ranks outside of state_mutex, so + * Unfortunately list_mutex ranks outside of state_mutex, so * we have to trylock here. If we fail give the context another * tick and try again. */ @@ -606,12 +784,11 @@ static void spusched_tick(struct spu_context *ctx) new = grab_runnable_context(ctx->prio + 1, spu->node); if (new) { - - __spu_remove_from_active_list(spu); spu_unbind_context(spu, ctx); ctx->stats.invol_ctx_switch++; spu->stats.invol_ctx_switch++; - spu_free(spu); + spu->alloc_state = SPU_FREE; + cbe_spu_info[spu->node].nr_active--; wake_up(&new->stop_wq); /* * We need to break out of the wait loop in @@ -632,7 +809,7 @@ static void spusched_tick(struct spu_context *ctx) * * Return the number of tasks currently running or waiting to run. * - * Note that we don't take runq_lock / active_mutex here. Reading + * Note that we don't take runq_lock / list_mutex here. Reading * a single 32bit value is atomic on powerpc, and we don't care * about memory ordering issues here. */ @@ -641,7 +818,7 @@ static unsigned long count_active_contexts(void) int nr_active = 0, node; for (node = 0; node < MAX_NUMNODES; node++) - nr_active += spu_prio->nr_active[node]; + nr_active += cbe_spu_info[node].nr_active; nr_active += spu_prio->nr_waiting; return nr_active; @@ -681,19 +858,18 @@ static void spusched_wake(unsigned long data) static int spusched_thread(void *unused) { - struct spu *spu, *next; + struct spu *spu; int node; while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); schedule(); for (node = 0; node < MAX_NUMNODES; node++) { - mutex_lock(&spu_prio->active_mutex[node]); - list_for_each_entry_safe(spu, next, - &spu_prio->active_list[node], - list) - spusched_tick(spu->ctx); - mutex_unlock(&spu_prio->active_mutex[node]); + mutex_lock(&cbe_spu_info[node].list_mutex); + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) + if (spu->ctx) + spusched_tick(spu->ctx); + mutex_unlock(&cbe_spu_info[node].list_mutex); } } @@ -751,10 +927,9 @@ int __init spu_sched_init(void) INIT_LIST_HEAD(&spu_prio->runq[i]); __clear_bit(i, spu_prio->bitmap); } - __set_bit(MAX_PRIO, spu_prio->bitmap); for (i = 0; i < MAX_NUMNODES; i++) { - mutex_init(&spu_prio->active_mutex[i]); - INIT_LIST_HEAD(&spu_prio->active_list[i]); + mutex_init(&cbe_spu_info[i].list_mutex); + INIT_LIST_HEAD(&cbe_spu_info[i].spus); } spin_lock_init(&spu_prio->runq_lock); @@ -783,9 +958,9 @@ int __init spu_sched_init(void) return err; } -void __exit spu_sched_exit(void) +void spu_sched_exit(void) { - struct spu *spu, *tmp; + struct spu *spu; int node; remove_proc_entry("spu_loadavg", NULL); @@ -794,13 +969,11 @@ void __exit spu_sched_exit(void) kthread_stop(spusched_task); for (node = 0; node < MAX_NUMNODES; node++) { - mutex_lock(&spu_prio->active_mutex[node]); - list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node], - list) { - list_del_init(&spu->list); - spu_free(spu); - } - mutex_unlock(&spu_prio->active_mutex[node]); + mutex_lock(&cbe_spu_info[node].list_mutex); + list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) + if (spu->alloc_state != SPU_FREE) + spu->alloc_state = SPU_FREE; + mutex_unlock(&cbe_spu_info[node].list_mutex); } kfree(spu_prio); } diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore.c b/arch/powerpc/platforms/cell/spufs/spu_restore.c index 4e19ed7..21a9c95 100644 --- a/arch/powerpc/platforms/cell/spufs/spu_restore.c +++ b/arch/powerpc/platforms/cell/spufs/spu_restore.c @@ -84,13 +84,13 @@ static inline void restore_decr(void) unsigned int decr_running; unsigned int decr; - /* Restore, Step 6: + /* Restore, Step 6(moved): * If the LSCSA "decrementer running" flag is set * then write the SPU_WrDec channel with the * decrementer value from LSCSA. */ offset = LSCSA_QW_OFFSET(decr_status); - decr_running = regs_spill[offset].slot[0]; + decr_running = regs_spill[offset].slot[0] & SPU_DECR_STATUS_RUNNING; if (decr_running) { offset = LSCSA_QW_OFFSET(decr); decr = regs_spill[offset].slot[0]; @@ -318,10 +318,10 @@ int main() build_dma_list(lscsa_ea); /* Step 3. */ restore_upper_240kb(lscsa_ea); /* Step 4. */ /* Step 5: done by 'exit'. */ - restore_decr(); /* Step 6. */ enqueue_putllc(lscsa_ea); /* Step 7. */ set_tag_update(); /* Step 8. */ read_tag_status(); /* Step 9. */ + restore_decr(); /* moved Step 6. */ read_llar_status(); /* Step 10. */ write_ppu_mb(); /* Step 11. */ write_ppuint_mb(); /* Step 12. */ diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped index 15183d2..f383b02 100644 --- a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped +++ b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped @@ -10,7 +10,7 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = { 0x24fd8081, 0x1cd80081, 0x33001180, -0x42030003, +0x42034003, 0x33800284, 0x1c010204, 0x40200000, @@ -24,22 +24,22 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = { 0x23fffd84, 0x1c100183, 0x217ffa85, -0x3080a000, -0x3080a201, -0x3080a402, -0x3080a603, -0x3080a804, -0x3080aa05, -0x3080ac06, -0x3080ae07, -0x3080b008, -0x3080b209, -0x3080b40a, -0x3080b60b, -0x3080b80c, -0x3080ba0d, -0x3080bc0e, -0x3080be0f, +0x3080b000, +0x3080b201, +0x3080b402, +0x3080b603, +0x3080b804, +0x3080ba05, +0x3080bc06, +0x3080be07, +0x3080c008, +0x3080c209, +0x3080c40a, +0x3080c60b, +0x3080c80c, +0x3080ca0d, +0x3080cc0e, +0x3080ce0f, 0x00003ffc, 0x00000000, 0x00000000, @@ -48,19 +48,18 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = { 0x3ec00083, 0xb0a14103, 0x01a00204, -0x3ec10082, -0x4202800e, -0x04000703, -0xb0a14202, -0x21a00803, -0x3fbf028d, -0x3f20068d, -0x3fbe0682, +0x3ec10083, +0x4202c002, +0xb0a14203, +0x21a00802, +0x3fbf028a, +0x3f20050a, +0x3fbe0502, 0x3fe30102, 0x21a00882, -0x3f82028f, -0x3fe3078f, -0x3fbf0784, +0x3f82028b, +0x3fe3058b, +0x3fbf0584, 0x3f200204, 0x3fbe0204, 0x3fe30204, @@ -75,252 +74,285 @@ static unsigned int spu_restore_code[] __attribute__((__aligned__(128))) = { 0x21a00083, 0x40800082, 0x21a00b02, -0x10002818, -0x42a00002, -0x32800007, -0x4207000c, -0x18008208, -0x40a0000b, -0x4080020a, -0x40800709, -0x00200000, -0x42070002, -0x3ac30384, +0x10002612, +0x42a00003, +0x42074006, +0x1800c204, +0x40a00008, +0x40800789, +0x1c010305, +0x34000302, 0x1cffc489, -0x00200000, -0x18008383, -0x38830382, -0x4cffc486, -0x3ac28185, -0xb0408584, -0x28830382, -0x1c020387, -0x38828182, -0xb0408405, -0x1802c408, -0x28828182, -0x217ff886, -0x04000583, -0x21a00803, -0x3fbe0682, -0x3fe30102, -0x04000106, -0x21a00886, -0x04000603, -0x21a00903, -0x40803c02, -0x21a00982, -0x40800003, -0x04000184, -0x21a00a04, +0x3ec00303, +0x3ec00287, +0xb0408403, +0x24000302, +0x34000282, +0x1c020306, +0xb0408207, +0x18020204, +0x24000282, +0x217ffa09, +0x04000402, +0x21a00802, +0x3fbe0504, +0x3fe30204, +0x21a00884, +0x42074002, +0x21a00902, +0x40803c03, +0x21a00983, +0x04000485, +0x21a00a05, 0x40802202, 0x21a00a82, -0x42028005, -0x34208702, -0x21002282, -0x21a00804, -0x21a00886, -0x3fbf0782, +0x21a00805, +0x21a00884, +0x3fbf0582, 0x3f200102, 0x3fbe0102, 0x3fe30102, 0x21a00902, 0x40804003, 0x21a00983, -0x21a00a04, +0x21a00a05, 0x40805a02, 0x21a00a82, 0x40800083, 0x21a00b83, 0x01a00c02, -0x01a00d83, -0x3420c282, +0x30809c03, +0x34000182, +0x14004102, +0x21002082, +0x01a00d82, +0x3080a003, +0x34000182, 0x21a00e02, -0x34210283, -0x21a00f03, -0x34200284, -0x77400200, -0x3421c282, +0x3080a203, +0x34000182, +0x21a00f02, +0x3080a403, +0x34000182, +0x77400100, +0x3080a603, +0x34000182, 0x21a00702, -0x34218283, -0x21a00083, -0x34214282, +0x3080a803, +0x34000182, +0x21a00082, +0x3080aa03, +0x34000182, 0x21a00b02, -0x4200480c, -0x00200000, -0x1c010286, -0x34220284, -0x34220302, -0x0f608203, -0x5c024204, -0x3b81810b, -0x42013c02, -0x00200000, -0x18008185, -0x38808183, -0x3b814182, -0x21004e84, +0x4020007f, +0x3080ae02, +0x42004805, +0x3080ac04, +0x34000103, +0x34000202, +0x1cffc183, +0x3b810106, +0x0f608184, +0x42013802, +0x5c020183, +0x38810102, +0x3b810102, +0x21000e83, 0x4020007f, 0x35000100, -0x000004e0, -0x000002a0, -0x000002e8, -0x00000428, +0x00000470, +0x000002f8, +0x00000430, 0x00000360, -0x000002e8, -0x000004a0, -0x00000468, +0x000002f8, 0x000003c8, +0x000004a8, +0x00000298, 0x00000360, +0x00200000, 0x409ffe02, 0x30801203, -0x40800204, -0x3ec40085, -0x10009c09, -0x3ac10606, -0xb060c105, -0x4020007f, -0x4020007f, +0x40800208, +0x3ec40084, +0x40800407, +0x3ac20289, +0xb060c104, +0x3ac1c284, 0x20801203, -0x38810602, -0xb0408586, -0x28810602, -0x32004180, -0x34204702, +0x38820282, +0x41004003, +0xb0408189, +0x28820282, +0x3881c282, +0xb0408304, +0x2881c282, +0x00400000, +0x40800003, +0x35000000, +0x30809e03, +0x34000182, 0x21a00382, 0x4020007f, -0x327fdc80, +0x327fde00, 0x409ffe02, 0x30801203, -0x40800204, -0x3ec40087, -0x40800405, -0x00200000, -0x40800606, -0x3ac10608, -0x3ac14609, -0x3ac1860a, -0xb060c107, +0x40800206, +0x3ec40084, +0x40800407, +0x40800608, +0x3ac1828a, +0x3ac20289, +0xb060c104, +0x3ac1c284, 0x20801203, +0x38818282, 0x41004003, -0x38810602, -0x4020007f, -0xb0408188, -0x4020007f, -0x28810602, -0x41201002, -0x38814603, -0x10009c09, -0xb060c109, -0x4020007f, -0x28814603, +0xb040818a, +0x10005b0b, +0x41201003, +0x28818282, +0x3881c282, +0xb0408184, 0x41193f83, -0x38818602, 0x60ffc003, -0xb040818a, -0x28818602, -0x32003080, +0x2881c282, +0x38820282, +0xb0408189, +0x28820282, +0x327fef80, 0x409ffe02, 0x30801203, -0x40800204, -0x3ec40087, -0x41201008, -0x10009c14, -0x40800405, -0x3ac10609, -0x40800606, -0x3ac1460a, -0xb060c107, -0x3ac1860b, +0x40800207, +0x3ec40086, +0x4120100b, +0x10005b14, +0x40800404, +0x3ac1c289, +0x40800608, +0xb060c106, +0x3ac10286, +0x3ac2028a, 0x20801203, -0x38810602, -0xb0408409, -0x28810602, -0x38814603, -0xb060c40a, -0x4020007f, -0x28814603, +0x3881c282, 0x41193f83, -0x38818602, 0x60ffc003, -0xb040818b, -0x28818602, -0x32002380, -0x409ffe02, -0x30801204, -0x40800205, -0x3ec40083, -0x40800406, -0x3ac14607, -0x3ac18608, -0xb0810103, -0x41004002, -0x20801204, -0x4020007f, -0x38814603, -0x10009c0b, -0xb060c107, -0x4020007f, -0x4020007f, -0x28814603, -0x38818602, -0x4020007f, +0xb0408589, +0x2881c282, +0x38810282, +0xb0408586, +0x28810282, +0x38820282, +0xb040818a, +0x28820282, 0x4020007f, -0xb0408588, -0x28818602, +0x327fe280, +0x409ffe02, +0x30801203, +0x40800207, +0x3ec40084, +0x40800408, +0x10005b14, +0x40800609, +0x3ac1c28a, +0x3ac2028b, +0xb060c104, +0x3ac24284, +0x20801203, +0x41201003, +0x3881c282, +0xb040830a, +0x2881c282, +0x38820282, +0xb040818b, +0x41193f83, +0x60ffc003, +0x28820282, +0x38824282, +0xb0408184, +0x28824282, 0x4020007f, -0x32001780, +0x327fd580, 0x409ffe02, -0x1000640e, -0x40800204, +0x1000658e, +0x40800206, 0x30801203, -0x40800405, -0x3ec40087, -0x40800606, -0x3ac10608, -0x3ac14609, -0x3ac1860a, -0xb060c107, +0x40800407, +0x3ec40084, +0x40800608, +0x3ac1828a, +0x3ac20289, +0xb060c104, +0x3ac1c284, 0x20801203, 0x413d8003, -0x38810602, +0x38818282, 0x4020007f, -0x327fd780, -0x409ffe02, -0x10007f0c, -0x40800205, -0x30801204, -0x40800406, -0x3ec40083, -0x3ac14607, -0x3ac18608, -0xb0810103, -0x413d8002, -0x20801204, -0x38814603, +0x327fd800, +0x409ffe03, +0x30801202, +0x40800207, +0x3ec40084, +0x10005b09, +0x3ac1c288, +0xb0408184, 0x4020007f, -0x327feb80, +0x4020007f, +0x20801202, +0x3881c282, +0xb0408308, +0x2881c282, +0x327fc680, 0x409ffe02, +0x1000588b, +0x40800208, 0x30801203, -0x40800204, -0x3ec40087, -0x40800405, -0x1000650a, -0x40800606, -0x3ac10608, -0x3ac14609, -0x3ac1860a, -0xb060c107, +0x40800407, +0x3ec40084, +0x3ac20289, +0xb060c104, +0x3ac1c284, 0x20801203, -0x38810602, -0xb0408588, -0x4020007f, -0x327fc980, -0x00400000, -0x40800003, -0x4020007f, -0x35000000, +0x413d8003, +0x38820282, +0x327fbd80, +0x00200000, +0x00000da0, +0x00000000, +0x00000000, +0x00000000, +0x00000d90, +0x00000000, +0x00000000, +0x00000000, +0x00000db0, +0x00000000, +0x00000000, +0x00000000, +0x00000dc0, +0x00000000, +0x00000000, +0x00000000, +0x00000d80, +0x00000000, +0x00000000, +0x00000000, +0x00000df0, +0x00000000, +0x00000000, +0x00000000, +0x00000de0, +0x00000000, +0x00000000, +0x00000000, +0x00000dd0, +0x00000000, +0x00000000, +0x00000000, +0x00000e04, +0x00000000, +0x00000000, 0x00000000, +0x00000e00, 0x00000000, 0x00000000, 0x00000000, diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h index 08b3530..8b20c0c 100644 --- a/arch/powerpc/platforms/cell/spufs/spufs.h +++ b/arch/powerpc/platforms/cell/spufs/spufs.h @@ -40,17 +40,13 @@ enum { struct spu_context_ops; struct spu_gang; -/* - * This is the state for spu utilization reporting to userspace. - * Because this state is visible to userspace it must never change and needs - * to be kept strictly separate from any internal state kept by the kernel. - */ -enum spuctx_execution_state { - SPUCTX_UTIL_USER = 0, - SPUCTX_UTIL_SYSTEM, - SPUCTX_UTIL_IOWAIT, - SPUCTX_UTIL_LOADED, - SPUCTX_UTIL_MAX +enum { + SPU_SCHED_WAS_ACTIVE, /* was active upon spu_acquire_saved() */ +}; + +/* ctx->sched_flags */ +enum { + SPU_SCHED_NOTIFY_ACTIVE, }; struct spu_context { @@ -89,6 +85,8 @@ struct spu_context { struct list_head gang_list; struct spu_gang *gang; + struct kref *prof_priv_kref; + void ( * prof_priv_release) (struct kref *kref); /* owner thread */ pid_t tid; @@ -104,9 +102,9 @@ struct spu_context { /* statistics */ struct { /* updates protected by ctx->state_mutex */ - enum spuctx_execution_state execution_state; - unsigned long tstamp; /* time of last ctx switch */ - unsigned long times[SPUCTX_UTIL_MAX]; + enum spu_utilization_state util_state; + unsigned long long tstamp; /* time of last state switch */ + unsigned long long times[SPU_UTIL_MAX]; unsigned long long vol_ctx_switch; unsigned long long invol_ctx_switch; unsigned long long min_flt; @@ -118,6 +116,10 @@ struct spu_context { unsigned long long class2_intr_base; /* # at last ctx switch */ unsigned long long libassist; } stats; + + struct list_head aff_list; + int aff_head; + int aff_offset; }; struct spu_gang { @@ -125,8 +127,19 @@ struct spu_gang { struct mutex mutex; struct kref kref; int contexts; + + struct spu_context *aff_ref_ctx; + struct list_head aff_list_head; + struct mutex aff_mutex; + int aff_flags; + struct spu *aff_ref_spu; + atomic_t aff_sched_count; }; +/* Flag bits for spu_gang aff_flags */ +#define AFF_OFFSETS_SET 1 +#define AFF_MERGED 2 + struct mfc_dma_command { int32_t pad; /* reserved */ uint32_t lsa; /* local storage address */ @@ -190,10 +203,9 @@ extern struct tree_descr spufs_dir_contents[]; extern struct tree_descr spufs_dir_nosched_contents[]; /* system call implementation */ -long spufs_run_spu(struct file *file, - struct spu_context *ctx, u32 *npc, u32 *status); -long spufs_create(struct nameidata *nd, - unsigned int flags, mode_t mode); +long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status); +long spufs_create(struct nameidata *nd, unsigned int flags, + mode_t mode, struct file *filp); extern const struct file_operations spufs_context_fops; /* gang management */ @@ -206,6 +218,9 @@ void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx); /* fault handling */ int spufs_handle_class1(struct spu_context *ctx); +/* affinity */ +struct spu *affinity_check(struct spu_context *ctx); + /* context management */ extern atomic_t nr_spu_contexts; static inline void spu_acquire(struct spu_context *ctx) @@ -227,15 +242,17 @@ void spu_unmap_mappings(struct spu_context *ctx); void spu_forget(struct spu_context *ctx); int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags); void spu_acquire_saved(struct spu_context *ctx); +void spu_release_saved(struct spu_context *ctx); int spu_activate(struct spu_context *ctx, unsigned long flags); void spu_deactivate(struct spu_context *ctx); void spu_yield(struct spu_context *ctx); +void spu_switch_notify(struct spu *spu, struct spu_context *ctx); void spu_set_timeslice(struct spu_context *ctx); void spu_update_sched_info(struct spu_context *ctx); void __spu_update_sched_info(struct spu_context *ctx); int __init spu_sched_init(void); -void __exit spu_sched_exit(void); +void spu_sched_exit(void); extern char *isolated_loader; @@ -293,30 +310,34 @@ extern int spufs_coredump_num_notes; * line. */ static inline void spuctx_switch_state(struct spu_context *ctx, - enum spuctx_execution_state new_state) + enum spu_utilization_state new_state) { - WARN_ON(!mutex_is_locked(&ctx->state_mutex)); - - if (ctx->stats.execution_state != new_state) { - unsigned long curtime = jiffies; - - ctx->stats.times[ctx->stats.execution_state] += - curtime - ctx->stats.tstamp; - ctx->stats.tstamp = curtime; - ctx->stats.execution_state = new_state; - } -} + unsigned long long curtime; + signed long long delta; + struct timespec ts; + struct spu *spu; + enum spu_utilization_state old_state; -static inline void spu_switch_state(struct spu *spu, - enum spuctx_execution_state new_state) -{ - if (spu->stats.utilization_state != new_state) { - unsigned long curtime = jiffies; + ktime_get_ts(&ts); + curtime = timespec_to_ns(&ts); + delta = curtime - ctx->stats.tstamp; - spu->stats.times[spu->stats.utilization_state] += - curtime - spu->stats.tstamp; + WARN_ON(!mutex_is_locked(&ctx->state_mutex)); + WARN_ON(delta < 0); + + spu = ctx->spu; + old_state = ctx->stats.util_state; + ctx->stats.util_state = new_state; + ctx->stats.tstamp = curtime; + + /* + * Update the physical SPU utilization statistics. + */ + if (spu) { + ctx->stats.times[old_state] += delta; + spu->stats.times[old_state] += delta; + spu->stats.util_state = new_state; spu->stats.tstamp = curtime; - spu->stats.utilization_state = new_state; } } diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c index 9c506ba..27ffdae 100644 --- a/arch/powerpc/platforms/cell/spufs/switch.c +++ b/arch/powerpc/platforms/cell/spufs/switch.c @@ -180,7 +180,7 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu) case MFC_CNTL_SUSPEND_COMPLETE: if (csa) { csa->priv2.mfc_control_RW = - in_be64(&priv2->mfc_control_RW) | + MFC_CNTL_SUSPEND_MASK | MFC_CNTL_SUSPEND_DMA_QUEUE; } break; @@ -190,9 +190,7 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu) MFC_CNTL_SUSPEND_DMA_STATUS_MASK) == MFC_CNTL_SUSPEND_COMPLETE); if (csa) { - csa->priv2.mfc_control_RW = - in_be64(&priv2->mfc_control_RW) & - ~MFC_CNTL_SUSPEND_DMA_QUEUE; + csa->priv2.mfc_control_RW = 0; } break; } @@ -251,16 +249,8 @@ static inline void save_mfc_decr(struct spu_state *csa, struct spu *spu) * Read MFC_CNTL[Ds]. Update saved copy of * CSA.MFC_CNTL[Ds]. */ - if (in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING) { - csa->priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING; - csa->suspend_time = get_cycles(); - out_be64(&priv2->spu_chnlcntptr_RW, 7ULL); - eieio(); - csa->spu_chnldata_RW[7] = in_be64(&priv2->spu_chnldata_RW); - eieio(); - } else { - csa->priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING; - } + csa->priv2.mfc_control_RW |= + in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING; } static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu) @@ -271,7 +261,8 @@ static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu) * Write MFC_CNTL[Dh] set to a '1' to halt * the decrementer. */ - out_be64(&priv2->mfc_control_RW, MFC_CNTL_DECREMENTER_HALTED); + out_be64(&priv2->mfc_control_RW, + MFC_CNTL_DECREMENTER_HALTED | MFC_CNTL_SUSPEND_MASK); eieio(); } @@ -615,7 +606,7 @@ static inline void save_ppuint_mb(struct spu_state *csa, struct spu *spu) static inline void save_ch_part1(struct spu_state *csa, struct spu *spu) { struct spu_priv2 __iomem *priv2 = spu->priv2; - u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL }; + u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL }; int i; /* Save, Step 42: @@ -626,7 +617,7 @@ static inline void save_ch_part1(struct spu_state *csa, struct spu *spu) csa->spu_chnldata_RW[1] = in_be64(&priv2->spu_chnldata_RW); /* Save the following CH: [0,3,4,24,25,27] */ - for (i = 0; i < 7; i++) { + for (i = 0; i < ARRAY_SIZE(ch_indices); i++) { idx = ch_indices[i]; out_be64(&priv2->spu_chnlcntptr_RW, idx); eieio(); @@ -983,13 +974,13 @@ static inline void terminate_spu_app(struct spu_state *csa, struct spu *spu) */ } -static inline void suspend_mfc(struct spu_state *csa, struct spu *spu) +static inline void suspend_mfc_and_halt_decr(struct spu_state *csa, + struct spu *spu) { struct spu_priv2 __iomem *priv2 = spu->priv2; /* Restore, Step 7: - * Restore, Step 47. - * Write MFC_Cntl[Dh,Sc]='1','1' to suspend + * Write MFC_Cntl[Dh,Sc,Sm]='1','1','0' to suspend * the queue and halt the decrementer. */ out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE | @@ -1090,7 +1081,7 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu) static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu) { struct spu_priv2 __iomem *priv2 = spu->priv2; - u64 ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL }; + u64 ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL }; u64 idx; int i; @@ -1102,7 +1093,7 @@ static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu) out_be64(&priv2->spu_chnldata_RW, 0UL); /* Reset the following CH: [0,3,4,24,25,27] */ - for (i = 0; i < 7; i++) { + for (i = 0; i < ARRAY_SIZE(ch_indices); i++) { idx = ch_indices[i]; out_be64(&priv2->spu_chnlcntptr_RW, idx); eieio(); @@ -1289,7 +1280,15 @@ static inline void setup_decr(struct spu_state *csa, struct spu *spu) cycles_t resume_time = get_cycles(); cycles_t delta_time = resume_time - csa->suspend_time; + csa->lscsa->decr_status.slot[0] = SPU_DECR_STATUS_RUNNING; + if (csa->lscsa->decr.slot[0] < delta_time) { + csa->lscsa->decr_status.slot[0] |= + SPU_DECR_STATUS_WRAPPED; + } + csa->lscsa->decr.slot[0] -= delta_time; + } else { + csa->lscsa->decr_status.slot[0] = 0; } } @@ -1398,6 +1397,18 @@ static inline void restore_ls_16kb(struct spu_state *csa, struct spu *spu) send_mfc_dma(spu, addr, ls_offset, size, tag, rclass, cmd); } +static inline void suspend_mfc(struct spu_state *csa, struct spu *spu) +{ + struct spu_priv2 __iomem *priv2 = spu->priv2; + + /* Restore, Step 47. + * Write MFC_Cntl[Sc,Sm]='1','0' to suspend + * the queue. + */ + out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE); + eieio(); +} + static inline void clear_interrupts(struct spu_state *csa, struct spu *spu) { /* Restore, Step 49: @@ -1548,10 +1559,10 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu) * "wrapped" flag is set, OR in a '1' to * CSA.SPU_Event_Status[Tm]. */ - if (csa->lscsa->decr_status.slot[0] == 1) { + if (csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED) { csa->spu_chnldata_RW[0] |= 0x20; } - if ((csa->lscsa->decr_status.slot[0] == 1) && + if ((csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED) && (csa->spu_chnlcnt_RW[0] == 0 && ((csa->spu_chnldata_RW[2] & 0x20) == 0x0) && ((csa->spu_chnldata_RW[0] & 0x20) != 0x1))) { @@ -1562,18 +1573,13 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu) static inline void restore_ch_part1(struct spu_state *csa, struct spu *spu) { struct spu_priv2 __iomem *priv2 = spu->priv2; - u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL }; + u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL }; int i; /* Restore, Step 59: + * Restore the following CH: [0,3,4,24,25,27] */ - - /* Restore CH 1 without count */ - out_be64(&priv2->spu_chnlcntptr_RW, 1); - out_be64(&priv2->spu_chnldata_RW, csa->spu_chnldata_RW[1]); - - /* Restore the following CH: [0,3,4,24,25,27] */ - for (i = 0; i < 7; i++) { + for (i = 0; i < ARRAY_SIZE(ch_indices); i++) { idx = ch_indices[i]; out_be64(&priv2->spu_chnlcntptr_RW, idx); eieio(); @@ -1932,7 +1938,7 @@ static void harvest(struct spu_state *prev, struct spu *spu) set_switch_pending(prev, spu); /* Step 5. */ stop_spu_isolate(spu); /* NEW. */ remove_other_spu_access(prev, spu); /* Step 6. */ - suspend_mfc(prev, spu); /* Step 7. */ + suspend_mfc_and_halt_decr(prev, spu); /* Step 7. */ wait_suspend_mfc_complete(prev, spu); /* Step 8. */ if (!suspend_spe(prev, spu)) /* Step 9. */ clear_spu_status(prev, spu); /* Step 10. */ diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 8e37bdf..43f0fb8 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -47,7 +47,7 @@ static long do_spu_run(struct file *filp, goto out; i = SPUFS_I(filp->f_path.dentry->d_inode); - ret = spufs_run_spu(filp, i->i_ctx, &npc, &status); + ret = spufs_run_spu(i->i_ctx, &npc, &status); if (put_user(npc, unpc)) ret = -EFAULT; @@ -76,8 +76,8 @@ asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus) } #endif -asmlinkage long sys_spu_create(const char __user *pathname, - unsigned int flags, mode_t mode) +asmlinkage long do_spu_create(const char __user *pathname, unsigned int flags, + mode_t mode, struct file *neighbor) { char *tmp; int ret; @@ -90,7 +90,7 @@ asmlinkage long sys_spu_create(const char __user *pathname, ret = path_lookup(tmp, LOOKUP_PARENT| LOOKUP_OPEN|LOOKUP_CREATE, &nd); if (!ret) { - ret = spufs_create(&nd, flags, mode); + ret = spufs_create(&nd, flags, mode, neighbor); path_release(&nd); } putname(tmp); @@ -99,8 +99,32 @@ asmlinkage long sys_spu_create(const char __user *pathname, return ret; } +#ifndef MODULE +asmlinkage long sys_spu_create(const char __user *pathname, unsigned int flags, + mode_t mode, int neighbor_fd) +{ + int fput_needed; + struct file *neighbor; + long ret; + + if (flags & SPU_CREATE_AFFINITY_SPU) { + ret = -EBADF; + neighbor = fget_light(neighbor_fd, &fput_needed); + if (neighbor) { + ret = do_spu_create(pathname, flags, mode, neighbor); + fput_light(neighbor, fput_needed); + } + } + else { + ret = do_spu_create(pathname, flags, mode, NULL); + } + + return ret; +} +#endif + struct spufs_calls spufs_calls = { - .create_thread = sys_spu_create, + .create_thread = do_spu_create, .spu_run = do_spu_run, .owner = THIS_MODULE, }; diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index f65078c..484eb4e 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_QUICC_ENGINE) += qe_lib/ mv64x60-$(CONFIG_PCI) += mv64x60_pci.o obj-$(CONFIG_MV64X60) += $(mv64x60-y) mv64x60_pic.o mv64x60_dev.o obj-$(CONFIG_RTC_DRV_CMOS) += rtc_cmos_setup.o +obj-$(CONFIG_AXON_RAM) += axonram.o # contains only the suspend handler for time ifeq ($(CONFIG_RTC_CLASS),) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c new file mode 100644 index 0000000..2326d5d --- /dev/null +++ b/arch/powerpc/sysdev/axonram.c @@ -0,0 +1,381 @@ +/* + * (C) Copyright IBM Deutschland Entwicklung GmbH 2006 + * + * Author: Maxim Shchetynin <maxim@de.ibm.com> + * + * Axon DDR2 device driver. + * It registers one block device per Axon's DDR2 memory bank found on a system. + * Block devices are called axonram?, their major and minor numbers are + * available in /proc/devices, /proc/partitions or in /sys/block/axonram?/dev. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/device.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/ioport.h> +#include <linux/irq.h> +#include <linux/irqreturn.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/types.h> +#include <asm/of_device.h> +#include <asm/of_platform.h> +#include <asm/page.h> +#include <asm/prom.h> + +#define AXON_RAM_MODULE_NAME "axonram" +#define AXON_RAM_DEVICE_NAME "axonram" +#define AXON_RAM_MINORS_PER_DISK 16 +#define AXON_RAM_BLOCK_SHIFT PAGE_SHIFT +#define AXON_RAM_BLOCK_SIZE 1 << AXON_RAM_BLOCK_SHIFT +#define AXON_RAM_SECTOR_SHIFT 9 +#define AXON_RAM_SECTOR_SIZE 1 << AXON_RAM_SECTOR_SHIFT +#define AXON_RAM_IRQ_FLAGS IRQF_SHARED | IRQF_TRIGGER_RISING + +struct axon_ram_bank { + struct of_device *device; + struct gendisk *disk; + unsigned int irq_correctable; + unsigned int irq_uncorrectable; + unsigned long ph_addr; + unsigned long io_addr; + unsigned long size; + unsigned long ecc_counter; +}; + +static ssize_t +axon_ram_sysfs_ecc(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct of_device *device = to_of_device(dev); + struct axon_ram_bank *bank = device->dev.platform_data; + + BUG_ON(!bank); + + return sprintf(buf, "%ld\n", bank->ecc_counter); +} + +static DEVICE_ATTR(ecc, S_IRUGO, axon_ram_sysfs_ecc, NULL); + +/** + * axon_ram_irq_handler - interrupt handler for Axon RAM ECC + * @irq: interrupt ID + * @dev: pointer to of_device + */ +static irqreturn_t +axon_ram_irq_handler(int irq, void *dev) +{ + struct of_device *device = dev; + struct axon_ram_bank *bank = device->dev.platform_data; + + BUG_ON(!bank); + + if (irq == bank->irq_correctable) { + dev_err(&device->dev, "Correctable memory error occured\n"); + bank->ecc_counter++; + return IRQ_HANDLED; + } else if (irq == bank->irq_uncorrectable) { + dev_err(&device->dev, "Uncorrectable memory error occured\n"); + panic("Critical ECC error on %s", device->node->full_name); + } + + return IRQ_NONE; +} + +/** + * axon_ram_make_request - make_request() method for block device + * @queue, @bio: see blk_queue_make_request() + */ +static int +axon_ram_make_request(struct request_queue *queue, struct bio *bio) +{ + struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data; + unsigned long phys_mem, phys_end; + void *user_mem; + struct bio_vec *vec; + unsigned int transfered; + unsigned short idx; + int rc = 0; + + phys_mem = bank->io_addr + (bio->bi_sector << AXON_RAM_SECTOR_SHIFT); + phys_end = bank->io_addr + bank->size; + transfered = 0; + bio_for_each_segment(vec, bio, idx) { + if (unlikely(phys_mem + vec->bv_len > phys_end)) { + bio_io_error(bio, bio->bi_size); + rc = -ERANGE; + break; + } + + user_mem = page_address(vec->bv_page) + vec->bv_offset; + if (bio_data_dir(bio) == READ) + memcpy(user_mem, (void *) phys_mem, vec->bv_len); + else + memcpy((void *) phys_mem, user_mem, vec->bv_len); + + phys_mem += vec->bv_len; + transfered += vec->bv_len; + } + bio_endio(bio, transfered, 0); + + return rc; +} + +/** + * axon_ram_direct_access - direct_access() method for block device + * @device, @sector, @data: see block_device_operations method + */ +static int +axon_ram_direct_access(struct block_device *device, sector_t sector, + unsigned long *data) +{ + struct axon_ram_bank *bank = device->bd_disk->private_data; + loff_t offset; + + offset = sector << AXON_RAM_SECTOR_SHIFT; + if (offset >= bank->size) { + dev_err(&bank->device->dev, "Access outside of address space\n"); + return -ERANGE; + } + + *data = bank->ph_addr + offset; + + return 0; +} + +static struct block_device_operations axon_ram_devops = { + .owner = THIS_MODULE, + .direct_access = axon_ram_direct_access +}; + +/** + * axon_ram_probe - probe() method for platform driver + * @device, @device_id: see of_platform_driver method + */ +static int +axon_ram_probe(struct of_device *device, const struct of_device_id *device_id) +{ + static int axon_ram_bank_id = -1; + struct axon_ram_bank *bank; + struct resource resource; + int rc = 0; + + axon_ram_bank_id++; + + dev_info(&device->dev, "Found memory controller on %s\n", + device->node->full_name); + + bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL); + if (bank == NULL) { + dev_err(&device->dev, "Out of memory\n"); + rc = -ENOMEM; + goto failed; + } + + device->dev.platform_data = bank; + + bank->device = device; + + if (of_address_to_resource(device->node, 0, &resource) != 0) { + dev_err(&device->dev, "Cannot access device tree\n"); + rc = -EFAULT; + goto failed; + } + + bank->size = resource.end - resource.start + 1; + + if (bank->size == 0) { + dev_err(&device->dev, "No DDR2 memory found for %s%d\n", + AXON_RAM_DEVICE_NAME, axon_ram_bank_id); + rc = -ENODEV; + goto failed; + } + + dev_info(&device->dev, "Register DDR2 memory device %s%d with %luMB\n", + AXON_RAM_DEVICE_NAME, axon_ram_bank_id, bank->size >> 20); + + bank->ph_addr = resource.start; + bank->io_addr = (unsigned long) ioremap_flags( + bank->ph_addr, bank->size, _PAGE_NO_CACHE); + if (bank->io_addr == 0) { + dev_err(&device->dev, "ioremap() failed\n"); + rc = -EFAULT; + goto failed; + } + + bank->disk = alloc_disk(AXON_RAM_MINORS_PER_DISK); + if (bank->disk == NULL) { + dev_err(&device->dev, "Cannot register disk\n"); + rc = -EFAULT; + goto failed; + } + + bank->disk->first_minor = 0; + bank->disk->fops = &axon_ram_devops; + bank->disk->private_data = bank; + bank->disk->driverfs_dev = &device->dev; + + sprintf(bank->disk->disk_name, "%s%d", + AXON_RAM_DEVICE_NAME, axon_ram_bank_id); + bank->disk->major = register_blkdev(0, bank->disk->disk_name); + if (bank->disk->major < 0) { + dev_err(&device->dev, "Cannot register block device\n"); + rc = -EFAULT; + goto failed; + } + + bank->disk->queue = blk_alloc_queue(GFP_KERNEL); + if (bank->disk->queue == NULL) { + dev_err(&device->dev, "Cannot register disk queue\n"); + rc = -EFAULT; + goto failed; + } + + set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT); + blk_queue_make_request(bank->disk->queue, axon_ram_make_request); + blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); + add_disk(bank->disk); + + bank->irq_correctable = irq_of_parse_and_map(device->node, 0); + bank->irq_uncorrectable = irq_of_parse_and_map(device->node, 1); + if ((bank->irq_correctable <= 0) || (bank->irq_uncorrectable <= 0)) { + dev_err(&device->dev, "Cannot access ECC interrupt ID\n"); + rc = -EFAULT; + goto failed; + } + + rc = request_irq(bank->irq_correctable, axon_ram_irq_handler, + AXON_RAM_IRQ_FLAGS, bank->disk->disk_name, device); + if (rc != 0) { + dev_err(&device->dev, "Cannot register ECC interrupt handler\n"); + bank->irq_correctable = bank->irq_uncorrectable = 0; + rc = -EFAULT; + goto failed; + } + + rc = request_irq(bank->irq_uncorrectable, axon_ram_irq_handler, + AXON_RAM_IRQ_FLAGS, bank->disk->disk_name, device); + if (rc != 0) { + dev_err(&device->dev, "Cannot register ECC interrupt handler\n"); + bank->irq_uncorrectable = 0; + rc = -EFAULT; + goto failed; + } + + rc = device_create_file(&device->dev, &dev_attr_ecc); + if (rc != 0) { + dev_err(&device->dev, "Cannot create sysfs file\n"); + rc = -EFAULT; + goto failed; + } + + return 0; + +failed: + if (bank != NULL) { + if (bank->irq_uncorrectable > 0) + free_irq(bank->irq_uncorrectable, device); + if (bank->irq_correctable > 0) + free_irq(bank->irq_correctable, device); + if (bank->disk != NULL) { + if (bank->disk->queue != NULL) + blk_cleanup_queue(bank->disk->queue); + if (bank->disk->major > 0) + unregister_blkdev(bank->disk->major, + bank->disk->disk_name); + del_gendisk(bank->disk); + } + device->dev.platform_data = NULL; + if (bank->io_addr != 0) + iounmap((void __iomem *) bank->io_addr); + kfree(bank); + } + + return rc; +} + +/** + * axon_ram_remove - remove() method for platform driver + * @device: see of_platform_driver method + */ +static int +axon_ram_remove(struct of_device *device) +{ + struct axon_ram_bank *bank = device->dev.platform_data; + + BUG_ON(!bank || !bank->disk); + + device_remove_file(&device->dev, &dev_attr_ecc); + free_irq(bank->irq_uncorrectable, device); + free_irq(bank->irq_correctable, device); + blk_cleanup_queue(bank->disk->queue); + unregister_blkdev(bank->disk->major, bank->disk->disk_name); + del_gendisk(bank->disk); + iounmap((void __iomem *) bank->io_addr); + kfree(bank); + + return 0; +} + +static struct of_device_id axon_ram_device_id[] = { + { + .type = "dma-memory" + }, + {} +}; + +static struct of_platform_driver axon_ram_driver = { + .owner = THIS_MODULE, + .name = AXON_RAM_MODULE_NAME, + .match_table = axon_ram_device_id, + .probe = axon_ram_probe, + .remove = axon_ram_remove +}; + +/** + * axon_ram_init + */ +static int __init +axon_ram_init(void) +{ + return of_register_platform_driver(&axon_ram_driver); +} + +/** + * axon_ram_exit + */ +static void __exit +axon_ram_exit(void) +{ + of_unregister_platform_driver(&axon_ram_driver); +} + +module_init(axon_ram_init); +module_exit(axon_ram_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Maxim Shchetynin <maxim@de.ibm.com>"); +MODULE_DESCRIPTION("Axon DDR2 RAM device driver for IBM Cell BE"); diff --git a/arch/powerpc/sysdev/pmi.c b/arch/powerpc/sysdev/pmi.c index 85a7c99..2f91b55 100644 --- a/arch/powerpc/sysdev/pmi.c +++ b/arch/powerpc/sysdev/pmi.c @@ -48,15 +48,13 @@ struct pmi_data { struct work_struct work; }; +static struct pmi_data *data; static int pmi_irq_handler(int irq, void *dev_id) { - struct pmi_data *data; u8 type; int rc; - data = dev_id; - spin_lock(&data->pmi_spinlock); type = ioread8(data->pmi_reg + PMI_READ_TYPE); @@ -111,16 +109,13 @@ MODULE_DEVICE_TABLE(of, pmi_match); static void pmi_notify_handlers(struct work_struct *work) { - struct pmi_data *data; struct pmi_handler *handler; - data = container_of(work, struct pmi_data, work); - spin_lock(&data->handler_spinlock); list_for_each_entry(handler, &data->handler, node) { pr_debug(KERN_INFO "pmi: notifying handler %p\n", handler); if (handler->type == data->msg.type) - handler->handle_pmi_message(data->dev, data->msg); + handler->handle_pmi_message(data->msg); } spin_unlock(&data->handler_spinlock); } @@ -129,9 +124,14 @@ static int pmi_of_probe(struct of_device *dev, const struct of_device_id *match) { struct device_node *np = dev->node; - struct pmi_data *data; int rc; + if (data) { + printk(KERN_ERR "pmi: driver has already been initialized.\n"); + rc = -EBUSY; + goto out; + } + data = kzalloc(sizeof(struct pmi_data), GFP_KERNEL); if (!data) { printk(KERN_ERR "pmi: could not allocate memory.\n"); @@ -154,7 +154,6 @@ static int pmi_of_probe(struct of_device *dev, INIT_WORK(&data->work, pmi_notify_handlers); - dev->dev.driver_data = data; data->dev = dev; data->irq = irq_of_parse_and_map(np, 0); @@ -164,7 +163,7 @@ static int pmi_of_probe(struct of_device *dev, goto error_cleanup_iomap; } - rc = request_irq(data->irq, pmi_irq_handler, 0, "pmi", data); + rc = request_irq(data->irq, pmi_irq_handler, 0, "pmi", NULL); if (rc) { printk(KERN_ERR "pmi: can't request IRQ %d: returned %d\n", data->irq, rc); @@ -187,12 +186,9 @@ out: static int pmi_of_remove(struct of_device *dev) { - struct pmi_data *data; struct pmi_handler *handler, *tmp; - data = dev->dev.driver_data; - - free_irq(data->irq, data); + free_irq(data->irq, NULL); iounmap(data->pmi_reg); spin_lock(&data->handler_spinlock); @@ -202,7 +198,8 @@ static int pmi_of_remove(struct of_device *dev) spin_unlock(&data->handler_spinlock); - kfree(dev->dev.driver_data); + kfree(data); + data = NULL; return 0; } @@ -226,13 +223,13 @@ static void __exit pmi_module_exit(void) } module_exit(pmi_module_exit); -void pmi_send_message(struct of_device *device, pmi_message_t msg) +int pmi_send_message(pmi_message_t msg) { - struct pmi_data *data; unsigned long flags; DECLARE_COMPLETION_ONSTACK(completion); - data = device->dev.driver_data; + if (!data) + return -ENODEV; mutex_lock(&data->msg_mutex); @@ -256,30 +253,26 @@ void pmi_send_message(struct of_device *device, pmi_message_t msg) data->completion = NULL; mutex_unlock(&data->msg_mutex); + + return 0; } EXPORT_SYMBOL_GPL(pmi_send_message); -void pmi_register_handler(struct of_device *device, - struct pmi_handler *handler) +int pmi_register_handler(struct pmi_handler *handler) { - struct pmi_data *data; - data = device->dev.driver_data; - if (!data) - return; + return -ENODEV; spin_lock(&data->handler_spinlock); list_add_tail(&handler->node, &data->handler); spin_unlock(&data->handler_spinlock); + + return 0; } EXPORT_SYMBOL_GPL(pmi_register_handler); -void pmi_unregister_handler(struct of_device *device, - struct pmi_handler *handler) +void pmi_unregister_handler(struct pmi_handler *handler) { - struct pmi_data *data; - data = device->dev.driver_data; - if (!data) return; |