author     Timothy Pearson <tpearson@raptorengineering.com>  2017-08-23 14:45:25 -0500
committer  Timothy Pearson <tpearson@raptorengineering.com>  2017-08-23 14:45:25 -0500
commit     fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204
tree       22962a4387943edc841c72a4e636a068c66d58fd /kernel
Initial import of modified Linux 2.6.28 tree
Original upstream URL: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 6
-rw-r--r--  kernel/Kconfig.freezer | 2
-rw-r--r--  kernel/Kconfig.hz | 58
-rw-r--r--  kernel/Kconfig.preempt | 79
-rw-r--r--  kernel/Makefile | 122
-rw-r--r--  kernel/acct.c | 676
-rw-r--r--  kernel/audit.c | 1520
-rw-r--r--  kernel/audit.h | 169
-rw-r--r--  kernel/audit_tree.c | 919
-rw-r--r--  kernel/auditfilter.c | 1854
-rw-r--r--  kernel/auditsc.c | 2489
-rw-r--r--  kernel/backtracetest.c | 91
-rw-r--r--  kernel/bounds.c | 19
-rw-r--r--  kernel/capability.c | 507
-rw-r--r--  kernel/cgroup.c | 3184
-rw-r--r--  kernel/cgroup_debug.c | 107
-rw-r--r--  kernel/cgroup_freezer.c | 379
-rw-r--r--  kernel/compat.c | 1116
-rw-r--r--  kernel/configs.c | 98
-rw-r--r--  kernel/cpu.c | 504
-rw-r--r--  kernel/cpuset.c | 2467
-rw-r--r--  kernel/delayacct.c | 183
-rw-r--r--  kernel/dma-coherent.c | 155
-rw-r--r--  kernel/dma.c | 161
-rw-r--r--  kernel/exec_domain.c | 228
-rw-r--r--  kernel/exit.c | 1841
-rw-r--r--  kernel/extable.c | 68
-rw-r--r--  kernel/fork.c | 1718
-rw-r--r--  kernel/freezer.c | 154
-rw-r--r--  kernel/futex.c | 2099
-rw-r--r--  kernel/futex_compat.c | 195
-rw-r--r--  kernel/hrtimer.c | 1924
-rw-r--r--  kernel/irq/Makefile | 5
-rw-r--r--  kernel/irq/autoprobe.c | 191
-rw-r--r--  kernel/irq/chip.c | 631
-rw-r--r--  kernel/irq/devres.c | 90
-rw-r--r--  kernel/irq/handle.c | 277
-rw-r--r--  kernel/irq/internals.h | 69
-rw-r--r--  kernel/irq/manage.c | 736
-rw-r--r--  kernel/irq/migration.c | 64
-rw-r--r--  kernel/irq/proc.c | 249
-rw-r--r--  kernel/irq/resend.c | 82
-rw-r--r--  kernel/irq/spurious.c | 300
-rw-r--r--  kernel/itimer.c | 282
-rw-r--r--  kernel/kallsyms.c | 487
-rw-r--r--  kernel/kexec.c | 1499
-rw-r--r--  kernel/kfifo.c | 197
-rw-r--r--  kernel/kgdb.c | 1736
-rw-r--r--  kernel/kmod.c | 510
-rw-r--r--  kernel/kprobes.c | 1355
-rw-r--r--  kernel/ksysfs.c | 197
-rw-r--r--  kernel/kthread.c | 267
-rw-r--r--  kernel/latencytop.c | 239
-rw-r--r--  kernel/lockdep.c | 3482
-rw-r--r--  kernel/lockdep_internals.h | 97
-rw-r--r--  kernel/lockdep_proc.c | 746
-rw-r--r--  kernel/marker.c | 868
-rw-r--r--  kernel/module.c | 2821
-rw-r--r--  kernel/mutex-debug.c | 116
-rw-r--r--  kernel/mutex-debug.h | 53
-rw-r--r--  kernel/mutex.c | 387
-rw-r--r--  kernel/mutex.h | 30
-rw-r--r--  kernel/notifier.c | 578
-rw-r--r--  kernel/ns_cgroup.c | 106
-rw-r--r--  kernel/nsproxy.c | 246
-rw-r--r--  kernel/panic.c | 376
-rw-r--r--  kernel/params.c | 761
-rw-r--r--  kernel/pid.c | 530
-rw-r--r--  kernel/pid_namespace.c | 192
-rw-r--r--  kernel/pm_qos_params.c | 423
-rw-r--r--  kernel/posix-cpu-timers.c | 1672
-rw-r--r--  kernel/posix-timers.c | 1000
-rw-r--r--  kernel/power/Kconfig | 206
-rw-r--r--  kernel/power/Makefile | 10
-rw-r--r--  kernel/power/console.c | 83
-rw-r--r--  kernel/power/disk.c | 901
-rw-r--r--  kernel/power/main.c | 732
-rw-r--r--  kernel/power/power.h | 225
-rw-r--r--  kernel/power/poweroff.c | 46
-rw-r--r--  kernel/power/process.c | 154
-rw-r--r--  kernel/power/snapshot.c | 1974
-rw-r--r--  kernel/power/swap.c | 647
-rw-r--r--  kernel/power/swsusp.c | 264
-rw-r--r--  kernel/power/user.c | 443
-rw-r--r--  kernel/printk.c | 1299
-rw-r--r--  kernel/profile.c | 616
-rw-r--r--  kernel/ptrace.c | 712
-rw-r--r--  kernel/rcuclassic.c | 786
-rw-r--r--  kernel/rcupdate.c | 170
-rw-r--r--  kernel/rcupreempt.c | 1482
-rw-r--r--  kernel/rcupreempt_trace.c | 334
-rw-r--r--  kernel/rcutorture.c | 1157
-rw-r--r--  kernel/relay.c | 1360
-rw-r--r--  kernel/res_counter.c | 139
-rw-r--r--  kernel/resource.c | 878
-rw-r--r--  kernel/rtmutex-debug.c | 239
-rw-r--r--  kernel/rtmutex-debug.h | 33
-rw-r--r--  kernel/rtmutex-tester.c | 443
-rw-r--r--  kernel/rtmutex.c | 1006
-rw-r--r--  kernel/rtmutex.h | 26
-rw-r--r--  kernel/rtmutex_common.h | 130
-rw-r--r--  kernel/rwsem.c | 148
-rw-r--r--  kernel/sched.c | 9390
-rw-r--r--  kernel/sched_clock.c | 266
-rw-r--r--  kernel/sched_cpupri.c | 174
-rw-r--r--  kernel/sched_cpupri.h | 36
-rw-r--r--  kernel/sched_debug.c | 469
-rw-r--r--  kernel/sched_fair.c | 1759
-rw-r--r--  kernel/sched_features.h | 15
-rw-r--r--  kernel/sched_idletask.c | 128
-rw-r--r--  kernel/sched_rt.c | 1546
-rw-r--r--  kernel/sched_stats.h | 372
-rw-r--r--  kernel/seccomp.c | 86
-rw-r--r--  kernel/semaphore.c | 263
-rw-r--r--  kernel/signal.c | 2592
-rw-r--r--  kernel/smp.c | 437
-rw-r--r--  kernel/softirq.c | 799
-rw-r--r--  kernel/softlockup.c | 371
-rw-r--r--  kernel/spinlock.c | 455
-rw-r--r--  kernel/srcu.c | 257
-rw-r--r--  kernel/stacktrace.c | 26
-rw-r--r--  kernel/stop_machine.c | 164
-rw-r--r--  kernel/sys.c | 1800
-rw-r--r--  kernel/sys_ni.c | 177
-rw-r--r--  kernel/sysctl.c | 3031
-rw-r--r--  kernel/sysctl_check.c | 1545
-rw-r--r--  kernel/taskstats.c | 627
-rw-r--r--  kernel/test_kprobes.c | 228
-rw-r--r--  kernel/time.c | 689
-rw-r--r--  kernel/time/Kconfig | 29
-rw-r--r--  kernel/time/Makefile | 8
-rw-r--r--  kernel/time/clockevents.c | 250
-rw-r--r--  kernel/time/clocksource.c | 554
-rw-r--r--  kernel/time/jiffies.c | 73
-rw-r--r--  kernel/time/ntp.c | 453
-rw-r--r--  kernel/time/tick-broadcast.c | 601
-rw-r--r--  kernel/time/tick-common.c | 394
-rw-r--r--  kernel/time/tick-internal.h | 135
-rw-r--r--  kernel/time/tick-oneshot.c | 141
-rw-r--r--  kernel/time/tick-sched.c | 770
-rw-r--r--  kernel/time/timekeeping.c | 607
-rw-r--r--  kernel/time/timer_list.c | 294
-rw-r--r--  kernel/time/timer_stats.c | 423
-rw-r--r--  kernel/timeconst.pl | 378
-rw-r--r--  kernel/timer.c | 1598
-rw-r--r--  kernel/trace/Kconfig | 198
-rw-r--r--  kernel/trace/Makefile | 28
-rw-r--r--  kernel/trace/ftrace.c | 1451
-rw-r--r--  kernel/trace/ring_buffer.c | 2201
-rw-r--r--  kernel/trace/trace.c | 3235
-rw-r--r--  kernel/trace/trace.h | 422
-rw-r--r--  kernel/trace/trace_boot.c | 126
-rw-r--r--  kernel/trace/trace_functions.c | 81
-rw-r--r--  kernel/trace/trace_irqsoff.c | 493
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 375
-rw-r--r--  kernel/trace/trace_nop.c | 64
-rw-r--r--  kernel/trace/trace_sched_switch.c | 211
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 393
-rw-r--r--  kernel/trace/trace_selftest.c | 524
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 7
-rw-r--r--  kernel/trace/trace_stack.c | 320
-rw-r--r--  kernel/trace/trace_sysprof.c | 363
-rw-r--r--  kernel/tracepoint.c | 485
-rw-r--r--  kernel/tsacct.c | 153
-rw-r--r--  kernel/uid16.c | 230
-rw-r--r--  kernel/user.c | 526
-rw-r--r--  kernel/user_namespace.c | 76
-rw-r--r--  kernel/utsname.c | 66
-rw-r--r--  kernel/utsname_sysctl.c | 145
-rw-r--r--  kernel/wait.c | 288
-rw-r--r--  kernel/workqueue.c | 1026
171 files changed, 113579 insertions, 0 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 0000000..ab4f109
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,6 @@
+#
+# Generated files
+#
+config_data.h
+config_data.gz
+timeconst.h
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 0000000..a3bb4cb
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
+config FREEZER
+ def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 0000000..94fabd5
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,58 @@
+#
+# Timer Interrupt Frequency Configuration
+#
+
+choice
+ prompt "Timer frequency"
+ default HZ_250
+ help
+ Allows the configuration of the timer frequency. It is customary
+ to have the timer interrupt run at 1000 Hz but 100 Hz may be more
+ beneficial for servers and NUMA systems that do not need to have
+ a fast response for user interaction and that may experience bus
+ contention and cacheline bounces as a result of timer interrupts.
+ Note that the timer interrupt occurs on each processor in an SMP
+ environment, leading to NR_CPUS * HZ timer interrupts
+ per second.
+
+
+ config HZ_100
+ bool "100 HZ"
+ help
+ 100 Hz is a typical choice for servers, SMP and NUMA systems
+ with lots of processors that may show reduced performance if
+ too many timer interrupts are occurring.
+
+ config HZ_250
+ bool "250 HZ"
+ help
+ 250 Hz is a good compromise choice allowing server performance
+ while also showing good interactive responsiveness even
+ on SMP and NUMA systems. If you are going to be using NTSC video
+ or multimedia, select 300 HZ instead.
+
+ config HZ_300
+ bool "300 HZ"
+ help
+ 300 Hz is a good compromise choice allowing server performance
+ while also showing good interactive responsiveness even
+ on SMP and NUMA systems and exactly dividing by both PAL and
+ NTSC frame rates for video and multimedia work.
+
+ config HZ_1000
+ bool "1000 HZ"
+ help
+ 1000 Hz is the preferred choice for desktop systems and other
+ systems requiring fast interactive responses to events.
+
+endchoice
+
+config HZ
+ int
+ default 100 if HZ_100
+ default 250 if HZ_250
+ default 300 if HZ_300
+ default 1000 if HZ_1000
+
+config SCHED_HRTICK
+ def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
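As a rough back-of-the-envelope illustration of the NR_CPUS * HZ figure cited in the Kconfig.hz help text above, a small user-space calculation (the CPU count below is an assumed example, not anything taken from this tree):

#include <stdio.h>

int main(void)
{
	const int nr_cpus = 8;	/* assumed example; the real figure depends on NR_CPUS/online CPUs */
	const int hz[] = { 100, 250, 300, 1000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("HZ=%4d -> about %d timer interrupts/second across %d CPUs\n",
		       hz[i], nr_cpus * hz[i], nr_cpus);
	return 0;
}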
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 0000000..9fdba03
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,79 @@
+
+choice
+ prompt "Preemption Model"
+ default PREEMPT_NONE
+
+config PREEMPT_NONE
+ bool "No Forced Preemption (Server)"
+ help
+ This is the traditional Linux preemption model, geared towards
+ throughput. It will still provide good latencies most of the
+ time, but there are no guarantees and occasional longer delays
+ are possible.
+
+ Select this option if you are building a kernel for a server or
+ scientific/computation system, or if you want to maximize the
+ raw processing power of the kernel, irrespective of scheduling
+ latencies.
+
+config PREEMPT_VOLUNTARY
+ bool "Voluntary Kernel Preemption (Desktop)"
+ help
+ This option reduces the latency of the kernel by adding more
+ "explicit preemption points" to the kernel code. These new
+ preemption points have been selected to reduce the maximum
+ latency of rescheduling, providing faster application reactions,
+ at the cost of slightly lower throughput.
+
+ This allows reaction to interactive events by allowing a
+ low priority process to voluntarily preempt itself even if it
+ is in kernel mode executing a system call. This allows
+ applications to run more 'smoothly' even when the system is
+ under load.
+
+ Select this if you are building a kernel for a desktop system.
+
+config PREEMPT
+ bool "Preemptible Kernel (Low-Latency Desktop)"
+ help
+ This option reduces the latency of the kernel by making
+ all kernel code (that is not executing in a critical section)
+ preemptible. This allows reaction to interactive events by
+ permitting a low priority process to be preempted involuntarily
+ even if it is in kernel mode executing a system call and would
+ otherwise not be about to reach a natural preemption point.
+ This allows applications to run more 'smoothly' even when the
+ system is under load, at the cost of slightly lower throughput
+ and a slight runtime overhead to kernel code.
+
+ Select this if you are building a kernel for a desktop or
+ embedded system with latency requirements in the milliseconds
+ range.
+
+endchoice
+
+config PREEMPT_RCU
+ bool "Preemptible RCU"
+ depends on PREEMPT
+ default n
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible; if
+ this option is selected, read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+ Say N if you are unsure.
+
+config RCU_TRACE
+ bool "Enable tracing for RCU - currently stats in debugfs"
+ depends on PREEMPT_RCU
+ select DEBUG_FS
+ default y
+ help
+ This option provides tracing in RCU which presents stats
+ in debugfs for debugging the RCU implementation.
+
+ Say Y here if you want to enable RCU tracing.
+ Say N if you are unsure.
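For a concrete sense of what an "explicit preemption point" is: PREEMPT_VOLUNTARY turns the kernel's existing might_sleep() annotations into rescheduling points, while cond_resched() is the form a driver can add by hand. A minimal kernel-style sketch (the helper function and loop are hypothetical and only compile in a kernel build context):

#include <linux/sched.h>

/* Hypothetical long-running helper: on a non-preemptible kernel, a loop
 * like this could hold the CPU for a long time without the explicit
 * rescheduling point below. */
static void example_scan(const unsigned char *buf, unsigned long len)
{
	unsigned long i;

	for (i = 0; i < len; i++) {
		/* ... per-byte work ... */
		if ((i & 0xffff) == 0)
			cond_resched();	/* explicit preemption/rescheduling point */
	}
}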
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..19fad00
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,122 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
+ cpu.o exit.o itimer.o time.o softirq.o resource.o \
+ sysctl.o capability.o ptrace.o timer.o user.o \
+ signal.o sys.o kmod.o workqueue.o pid.o \
+ rcupdate.o extable.o params.o posix-timers.o \
+ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
+ notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_sched.o = -pg
+endif
+
+obj-$(CONFIG_FREEZER) += freezer.o
+obj-$(CONFIG_PROFILING) += profile.o
+obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += time/
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
+obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
+obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_USER_NS) += user_namespace.o
+obj-$(CONFIG_PID_NS) += pid_namespace.o
+obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
+obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+ifeq ($(CONFIG_PREEMPT_RCU),y)
+obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
+endif
+obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace/
+obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_SMP) += sched_cpupri.o
+
+ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only. Why this used to be enabled for all architectures is beyond
+# me. I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+$(obj)/configs.o: $(obj)/config_data.h
+
+# config_data.h contains the same information as ikconfig.h but gzipped.
+# Info from config_data can be extracted from /proc/config*
+targets += config_data.gz
+$(obj)/config_data.gz: .config FORCE
+ $(call if_changed,gzip)
+
+quiet_cmd_ikconfiggz = IKCFG $@
+ cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+targets += config_data.h
+$(obj)/config_data.h: $(obj)/config_data.gz FORCE
+ $(call if_changed,ikconfiggz)
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_timeconst = TIMEC $@
+ cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+targets += timeconst.h
+$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
+ $(call if_changed,timeconst)
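The config_data.gz blob built above is what the kernel exposes as /proc/config.gz when the ikconfig /proc support (CONFIG_IKCONFIG_PROC) is enabled; the Makefile comment's "extracted from /proc/config*" refers to this. A minimal, purely illustrative user-space reader using zlib (build with -lz; assumes that option is set):

#include <stdio.h>
#include <zlib.h>

int main(void)
{
	gzFile f = gzopen("/proc/config.gz", "r");	/* gzip-compressed kernel .config */
	char line[512];

	if (!f) {
		fprintf(stderr, "cannot open /proc/config.gz\n");
		return 1;
	}
	while (gzgets(f, line, sizeof(line)))		/* decompress line by line */
		fputs(line, stdout);
	gzclose(f);
	return 0;
}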
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 0000000..8afbb31
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,676 @@
+/*
+ * linux/kernel/acct.c
+ *
+ * BSD Process Accounting for Linux
+ *
+ * Author: Marco van Wieringen <mvw@planets.elm.net>
+ *
+ * Some code based on ideas and code from:
+ * Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ *
+ * This file implements BSD-style process accounting. Whenever any
+ * process exits, an accounting record of type "struct acct" is
+ * written to the file specified with the acct() system call. It is
+ * up to user-level programs to do useful things with the accounting
+ * log. The kernel just provides the raw accounting information.
+ *
+ * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
+ *
+ * Plugged two leaks. 1) It didn't return acct_file into the free_filps if
+ * the file happened to be read-only. 2) If the accounting was suspended
+ * due to lack of space, it happily allowed it to be reopened and completely
+ * lost the old acct_file. 3/10/98, Al Viro.
+ *
+ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
+ * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
+ *
+ * Fixed a nasty interaction with sys_umount(). If the accounting
+ * was suspended, we failed to stop it on umount(). Messy.
+ * Another one: remount to readonly didn't stop accounting.
+ * Question: what should we do if we have CAP_SYS_ADMIN but not
+ * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
+ * unless we are messing with the root. In that case we are getting a
+ * real mess with do_remount_sb(). 9/11/98, AV.
+ *
+ * Fixed a bunch of races (and pair of leaks). Probably not the best way,
+ * but this one obviously doesn't introduce deadlocks. Later. BTW, found
+ * one race (and leak) in BSD implementation.
+ * OK, that's better. ANOTHER race and leak in BSD variant. There always
+ * is one more bug... 10/11/98, AV.
+ *
+ * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
+ * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * a struct file opened for write. Fixed. 2/6/2000, AV.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/acct.h>
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/tty.h>
+#include <linux/security.h>
+#include <linux/vfs.h>
+#include <linux/jiffies.h>
+#include <linux/times.h>
+#include <linux/syscalls.h>
+#include <linux/mount.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/blkdev.h> /* sector_div */
+#include <linux/pid_namespace.h>
+
+/*
+ * These constants control the amount of free space that suspends and
+ * resumes the process accounting system, and the time delay between
+ * each check.
+ * Turned into sysctl-controllable parameters. AV, 12/11/98
+ */
+
+int acct_parm[3] = {4, 2, 30};
+#define RESUME (acct_parm[0]) /* >foo% free space - resume */
+#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
+#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+
+/*
+ * External references and all of the globals.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct,
+ struct pid_namespace *ns, struct file *);
+
+/*
+ * This structure is used so that all the data protected by the lock
+ * can be placed in the same cache line as the lock. This primes
+ * the cache line to have the data after getting the lock.
+ */
+struct bsd_acct_struct {
+ volatile int active;
+ volatile int needcheck;
+ struct file *file;
+ struct pid_namespace *ns;
+ struct timer_list timer;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(acct_lock);
+static LIST_HEAD(acct_list);
+
+/*
+ * Called whenever the timer says to check the free space.
+ */
+static void acct_timeout(unsigned long x)
+{
+ struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
+ acct->needcheck = 1;
+}
+
+/*
+ * Check the amount of free space and suspend/resume accordingly.
+ */
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+{
+ struct kstatfs sbuf;
+ int res;
+ int act;
+ sector_t resume;
+ sector_t suspend;
+
+ spin_lock(&acct_lock);
+ res = acct->active;
+ if (!file || !acct->needcheck)
+ goto out;
+ spin_unlock(&acct_lock);
+
+ /* May block */
+ if (vfs_statfs(file->f_path.dentry, &sbuf))
+ return res;
+ suspend = sbuf.f_blocks * SUSPEND;
+ resume = sbuf.f_blocks * RESUME;
+
+ sector_div(suspend, 100);
+ sector_div(resume, 100);
+
+ if (sbuf.f_bavail <= suspend)
+ act = -1;
+ else if (sbuf.f_bavail >= resume)
+ act = 1;
+ else
+ act = 0;
+
+ /*
+ * If some joker switched acct->file under us we'd better be
+ * silent and _not_ touch anything.
+ */
+ spin_lock(&acct_lock);
+ if (file != acct->file) {
+ if (act)
+ res = act>0;
+ goto out;
+ }
+
+ if (acct->active) {
+ if (act < 0) {
+ acct->active = 0;
+ printk(KERN_INFO "Process accounting paused\n");
+ }
+ } else {
+ if (act > 0) {
+ acct->active = 1;
+ printk(KERN_INFO "Process accounting resumed\n");
+ }
+ }
+
+ del_timer(&acct->timer);
+ acct->needcheck = 0;
+ acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct->timer);
+ res = acct->active;
+out:
+ spin_unlock(&acct_lock);
+ return res;
+}
+
+/*
+ * Close the old accounting file (if currently open) and then replace
+ * it with file (if non-NULL).
+ *
+ * NOTE: acct_lock MUST be held on entry and exit.
+ */
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+ struct pid_namespace *ns)
+{
+ struct file *old_acct = NULL;
+ struct pid_namespace *old_ns = NULL;
+
+ if (acct->file) {
+ old_acct = acct->file;
+ old_ns = acct->ns;
+ del_timer(&acct->timer);
+ acct->active = 0;
+ acct->needcheck = 0;
+ acct->file = NULL;
+ acct->ns = NULL;
+ list_del(&acct->list);
+ }
+ if (file) {
+ acct->file = file;
+ acct->ns = ns;
+ acct->needcheck = 0;
+ acct->active = 1;
+ list_add(&acct->list, &acct_list);
+ /* It's been deleted if it was used before so this is safe */
+ setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
+ acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct->timer);
+ }
+ if (old_acct) {
+ mnt_unpin(old_acct->f_path.mnt);
+ spin_unlock(&acct_lock);
+ do_acct_process(acct, old_ns, old_acct);
+ filp_close(old_acct, NULL);
+ spin_lock(&acct_lock);
+ }
+}
+
+static int acct_on(char *name)
+{
+ struct file *file;
+ int error;
+ struct pid_namespace *ns;
+ struct bsd_acct_struct *acct = NULL;
+
+ /* Difference from BSD - they don't do O_APPEND */
+ file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ filp_close(file, NULL);
+ return -EACCES;
+ }
+
+ if (!file->f_op->write) {
+ filp_close(file, NULL);
+ return -EIO;
+ }
+
+ ns = task_active_pid_ns(current);
+ if (ns->bacct == NULL) {
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (acct == NULL) {
+ filp_close(file, NULL);
+ return -ENOMEM;
+ }
+ }
+
+ error = security_acct(file);
+ if (error) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return error;
+ }
+
+ spin_lock(&acct_lock);
+ if (ns->bacct == NULL) {
+ ns->bacct = acct;
+ acct = NULL;
+ }
+
+ mnt_pin(file->f_path.mnt);
+ acct_file_reopen(ns->bacct, file, ns);
+ spin_unlock(&acct_lock);
+
+ mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+ kfree(acct);
+
+ return 0;
+}
+
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
+ */
+SYSCALL_DEFINE1(acct, const char __user *, name)
+{
+ int error;
+
+ if (!capable(CAP_SYS_PACCT))
+ return -EPERM;
+
+ if (name) {
+ char *tmp = getname(name);
+ if (IS_ERR(tmp))
+ return (PTR_ERR(tmp));
+ error = acct_on(tmp);
+ putname(tmp);
+ } else {
+ struct bsd_acct_struct *acct;
+
+ acct = task_active_pid_ns(current)->bacct;
+ if (acct == NULL)
+ return 0;
+
+ error = security_acct(NULL);
+ if (!error) {
+ spin_lock(&acct_lock);
+ acct_file_reopen(acct, NULL, NULL);
+ spin_unlock(&acct_lock);
+ }
+ }
+ return error;
+}
+
+/**
+ * acct_auto_close_mnt - turn off a filesystem's accounting if it is on
+ * @m: vfsmount being shut down
+ *
+ * If the accounting is turned on for a file in the subtree pointed
+ * to by m, turn accounting off. Done when m is about to die.
+ */
+void acct_auto_close_mnt(struct vfsmount *m)
+{
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+restart:
+ list_for_each_entry(acct, &acct_list, list)
+ if (acct->file && acct->file->f_path.mnt == m) {
+ acct_file_reopen(acct, NULL, NULL);
+ goto restart;
+ }
+ spin_unlock(&acct_lock);
+}
+
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
+ */
+void acct_auto_close(struct super_block *sb)
+{
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+restart:
+ list_for_each_entry(acct, &acct_list, list)
+ if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+ acct_file_reopen(acct, NULL, NULL);
+ goto restart;
+ }
+ spin_unlock(&acct_lock);
+}
+
+void acct_exit_ns(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+ acct = ns->bacct;
+ if (acct != NULL) {
+ if (acct->file != NULL)
+ acct_file_reopen(acct, NULL, NULL);
+
+ kfree(acct);
+ }
+ spin_unlock(&acct_lock);
+}
+
+/*
+ * encode an unsigned long into a comp_t
+ *
+ * This routine has been adapted from the encode_comp_t() function in
+ * the kern_acct.c file of the FreeBSD operating system. The encoding
+ * is a 13-bit fraction with a 3-bit (base 8) exponent.
+ */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t encode_comp_t(unsigned long value)
+{
+ int exp, rnd;
+
+ exp = rnd = 0;
+ while (value > MAXFRACT) {
+ rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
+ value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT)) {
+ value >>= EXPSIZE;
+ exp++;
+ }
+
+ /*
+ * Clean it up and polish it off.
+ */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += value; /* and add on the mantissa. */
+ return exp;
+}
+
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+/*
+ * encode a u64 into a comp2_t (24 bits)
+ *
+ * Format: 5 bit base 2 exponent, 20 bits mantissa.
+ * The leading bit of the mantissa is not stored, but implied for
+ * non-zero exponents.
+ * Largest encodable value is 50 bits.
+ */
+
+#define MANTSIZE2 20 /* 20 bit mantissa. */
+#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
+#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
+#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
+
+static comp2_t encode_comp2_t(u64 value)
+{
+ int exp, rnd;
+
+ exp = (value > (MAXFRACT2>>1));
+ rnd = 0;
+ while (value > MAXFRACT2) {
+ rnd = value & 1;
+ value >>= 1;
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT2)) {
+ value >>= 1;
+ exp++;
+ }
+
+ if (exp > MAXEXP2) {
+ /* Overflow. Return largest representable number instead. */
+ return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
+ } else {
+ return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
+ }
+}
+#endif
+
+#if ACCT_VERSION==3
+/*
+ * encode a u64 into a 32-bit IEEE float
+ */
+static u32 encode_float(u64 value)
+{
+ unsigned exp = 190;
+ unsigned u;
+
+ if (value==0) return 0;
+ while ((s64)value > 0){
+ value <<= 1;
+ exp--;
+ }
+ u = (u32)(value >> 40) & 0x7fffffu;
+ return u | (exp << 23);
+}
+#endif
+
+/*
+ * Write an accounting entry for an exiting process
+ *
+ * The acct_process() call is the workhorse of the process
+ * accounting system. The struct acct is built here and then written
+ * into the accounting file. This function should only be called from
+ * do_exit() or when switching to a different output file.
+ */
+
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct,
+ struct pid_namespace *ns, struct file *file)
+{
+ struct pacct_struct *pacct = &current->signal->pacct;
+ acct_t ac;
+ mm_segment_t fs;
+ unsigned long flim;
+ u64 elapsed;
+ u64 run_time;
+ struct timespec uptime;
+ struct tty_struct *tty;
+
+ /*
+ * First check to see if there is enough free space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(acct, file))
+ return;
+
+ /*
+ * Fill the accounting struct with the needed info as recorded
+ * by the different kernel functions.
+ */
+ memset((caddr_t)&ac, 0, sizeof(acct_t));
+
+ ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+
+ /* calculate run_time in nsec*/
+ do_posix_clock_monotonic_gettime(&uptime);
+ run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
+ run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+ + current->group_leader->start_time.tv_nsec;
+ /* convert nsec -> AHZ */
+ elapsed = nsec_to_AHZ(run_time);
+#if ACCT_VERSION==3
+ ac.ac_etime = encode_float(elapsed);
+#else
+ ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
+#endif
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+ {
+ /* new enlarged etime field */
+ comp2_t etime = encode_comp2_t(elapsed);
+ ac.ac_etime_hi = etime >> 16;
+ ac.ac_etime_lo = (u16) etime;
+ }
+#endif
+ do_div(elapsed, AHZ);
+ ac.ac_btime = get_seconds() - elapsed;
+ /* we really need to bite the bullet and change layout */
+ ac.ac_uid = current->uid;
+ ac.ac_gid = current->gid;
+#if ACCT_VERSION==2
+ ac.ac_ahz = AHZ;
+#endif
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+ /* backward-compatible 16 bit fields */
+ ac.ac_uid16 = current->uid;
+ ac.ac_gid16 = current->gid;
+#endif
+#if ACCT_VERSION==3
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
+ rcu_read_unlock();
+#endif
+
+ spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty; /* Safe as we hold the siglock */
+ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+ ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac.ac_flag = pacct->ac_flag;
+ ac.ac_mem = encode_comp_t(pacct->ac_mem);
+ ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac.ac_exitcode = pacct->ac_exitcode;
+ spin_unlock_irq(&current->sighand->siglock);
+ ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
+ ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
+ ac.ac_swaps = encode_comp_t(0);
+
+ /*
+ * Kernel segment override to the data segment and write it
+ * to the accounting file.
+ */
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ /*
+ * Accounting records are not subject to resource limits.
+ */
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ file->f_op->write(file, (char *)&ac,
+ sizeof(acct_t), &file->f_pos);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
+ set_fs(fs);
+}
+
+/**
+ * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+ memset(pacct, 0, sizeof(struct pacct_struct));
+ pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: non-zero if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+ struct pacct_struct *pacct = &current->signal->pacct;
+ unsigned long vsize = 0;
+
+ if (group_dead && current->mm) {
+ struct vm_area_struct *vma;
+ down_read(&current->mm->mmap_sem);
+ vma = current->mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ up_read(&current->mm->mmap_sem);
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (group_dead)
+ pacct->ac_mem = vsize / 1024;
+ if (thread_group_leader(current)) {
+ pacct->ac_exitcode = exitcode;
+ if (current->flags & PF_FORKNOEXEC)
+ pacct->ac_flag |= AFORK;
+ }
+ if (current->flags & PF_SUPERPRIV)
+ pacct->ac_flag |= ASU;
+ if (current->flags & PF_DUMPCORE)
+ pacct->ac_flag |= ACORE;
+ if (current->flags & PF_SIGNALED)
+ pacct->ac_flag |= AXSIG;
+ pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+ pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ pacct->ac_minflt += current->min_flt;
+ pacct->ac_majflt += current->maj_flt;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void acct_process_in_ns(struct pid_namespace *ns)
+{
+ struct file *file = NULL;
+ struct bsd_acct_struct *acct;
+
+ acct = ns->bacct;
+ /*
+ * accelerate the common fastpath:
+ */
+ if (!acct || !acct->file)
+ return;
+
+ spin_lock(&acct_lock);
+ file = acct->file;
+ if (unlikely(!file)) {
+ spin_unlock(&acct_lock);
+ return;
+ }
+ get_file(file);
+ spin_unlock(&acct_lock);
+
+ do_acct_process(acct, ns, file);
+ fput(file);
+}
+
+/**
+ * acct_process - now just a wrapper around acct_process_in_ns,
+ * which in turn is a wrapper around do_acct_process.
+ *
+ * handles process accounting for an exiting task
+ */
+void acct_process(void)
+{
+ struct pid_namespace *ns;
+
+ /*
+ * This loop is safe lockless, since current is still
+ * alive and holds its namespace, which in turn holds
+ * its parent.
+ */
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
+ acct_process_in_ns(ns);
+}
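To show how the records written by do_acct_process() above are consumed, here is a small, purely illustrative user-space reader. It enables accounting via acct(2) (requires CAP_SYS_PACCT and an existing log file), then decodes the comp_t fields with the inverse of encode_comp_t() (13-bit mantissa, 3-bit base-8 exponent); the log path and helper name are hypothetical, and the decoded times are in AHZ ticks:

#include <stdio.h>
#include <unistd.h>
#include <sys/acct.h>

/* Inverse of the kernel's encode_comp_t(): 13-bit mantissa, 3-bit base-8 exponent. */
static unsigned long decode_comp_t(comp_t c)
{
	unsigned long mant = c & 0x1fff;	/* low 13 bits: mantissa */
	unsigned int exp = (c >> 13) & 7;	/* high 3 bits: base-8 exponent */

	return mant << (3 * exp);
}

int main(void)
{
	const char *log = "/var/log/account/pacct";	/* hypothetical path */
	struct acct rec;
	FILE *f;

	if (acct(log) != 0)		/* start accounting on the given file */
		perror("acct");

	f = fopen(log, "rb");
	if (!f)
		return 1;
	while (fread(&rec, sizeof(rec), 1, f) == 1)
		printf("%-16.16s utime=%lu stime=%lu (AHZ ticks)\n",
		       rec.ac_comm,
		       decode_comp_t(rec.ac_utime),
		       decode_comp_t(rec.ac_stime));
	fclose(f);
	return 0;
}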
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 0000000..ce6d8ea
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,1520 @@
+/* audit.c -- Auditing support
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with Security Modules.
+ * 2) Minimal run-time overhead:
+ * a) Minimal when syscall auditing is disabled (audit_enable=0).
+ * b) Small when syscall auditing is enabled and no audit record
+ * is generated (defer as much work as possible to record
+ * generation time):
+ * i) context is allocated,
+ * ii) names from getname are stored without a copy, and
+ * iii) inode information stored from path_lookup.
+ * 3) Ability to disable syscall auditing at boot time (audit=0).
+ * 4) Usable by other parts of the kernel (if audit_log* is called,
+ * then a syscall record will be generated automatically for the
+ * current syscall).
+ * 5) Netlink interface to user-space.
+ * 6) Support low-overhead kernel-based filtering to minimize the
+ * information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/inotify.h>
+#include <linux/freezer.h>
+#include <linux/tty.h>
+
+#include "audit.h"
+
+/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
+ * (Initialization happens after skb_init is called.) */
+#define AUDIT_DISABLED -1
+#define AUDIT_UNINITIALIZED 0
+#define AUDIT_INITIALIZED 1
+static int audit_initialized;
+
+#define AUDIT_OFF 0
+#define AUDIT_ON 1
+#define AUDIT_LOCKED 2
+int audit_enabled;
+int audit_ever_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int audit_failure = AUDIT_FAIL_PRINTK;
+
+/*
+ * If audit records are to be written to the netlink socket, audit_pid
+ * contains the pid of the auditd process and audit_nlk_pid contains
+ * the pid to use to send netlink messages to that process.
+ */
+int audit_pid;
+static int audit_nlk_pid;
+
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
+ * to that number per second. This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int audit_backlog_limit = 64;
+static int audit_backlog_wait_time = 60 * HZ;
+static int audit_backlog_wait_overflow = 0;
+
+/* The identity of the user shutting down the audit system. */
+uid_t audit_sig_uid = -1;
+pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
+
+/* Records can be lost in several ways:
+ 0) [suppressed in audit_alloc]
+ 1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+ 2) out of memory in audit_log_move [alloc_skb]
+ 3) suppressed due to audit_rate_limit
+ 4) suppressed due to audit_backlog_limit
+*/
+static atomic_t audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static DEFINE_SPINLOCK(audit_freelist_lock);
+static int audit_freelist_count;
+static LIST_HEAD(audit_freelist);
+
+static struct sk_buff_head audit_skb_queue;
+/* queue of skbs to send to auditd when/if it comes back */
+static struct sk_buff_head audit_skb_hold_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records. Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record. The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue. Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+ struct list_head list;
+ struct sk_buff *skb; /* formatted skb ready to send */
+ struct audit_context *ctx; /* NULL or associated context */
+ gfp_t gfp_mask;
+};
+
+struct audit_reply {
+ int pid;
+ struct sk_buff *skb;
+};
+
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+{
+ if (ab) {
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+ nlh->nlmsg_pid = pid;
+ }
+}
+
+void audit_panic(const char *message)
+{
+ switch (audit_failure)
+ {
+ case AUDIT_FAIL_SILENT:
+ break;
+ case AUDIT_FAIL_PRINTK:
+ if (printk_ratelimit())
+ printk(KERN_ERR "audit: %s\n", message);
+ break;
+ case AUDIT_FAIL_PANIC:
+ /* test audit_pid since printk is always lossy, why bother? */
+ if (audit_pid)
+ panic("audit: %s\n", message);
+ break;
+ }
+}
+
+static inline int audit_rate_check(void)
+{
+ static unsigned long last_check = 0;
+ static int messages = 0;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ unsigned long now;
+ unsigned long elapsed;
+ int retval = 0;
+
+ if (!audit_rate_limit) return 1;
+
+ spin_lock_irqsave(&lock, flags);
+ if (++messages < audit_rate_limit) {
+ retval = 1;
+ } else {
+ now = jiffies;
+ elapsed = now - last_check;
+ if (elapsed > HZ) {
+ last_check = now;
+ messages = 0;
+ retval = 1;
+ }
+ }
+ spin_unlock_irqrestore(&lock, flags);
+
+ return retval;
+}
+
+/**
+ * audit_log_lost - conditionally log lost audit message event
+ * @message: the message stating reason for lost audit message
+ *
+ * Emit at least 1 message per second, even if audit_rate_check is
+ * throttling.
+ * Always increment the lost messages counter.
+*/
+void audit_log_lost(const char *message)
+{
+ static unsigned long last_msg = 0;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ unsigned long now;
+ int print;
+
+ atomic_inc(&audit_lost);
+
+ print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+ if (!print) {
+ spin_lock_irqsave(&lock, flags);
+ now = jiffies;
+ if (now - last_msg > HZ) {
+ print = 1;
+ last_msg = now;
+ }
+ spin_unlock_irqrestore(&lock, flags);
+ }
+
+ if (print) {
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "audit: audit_lost=%d audit_rate_limit=%d "
+ "audit_backlog_limit=%d\n",
+ atomic_read(&audit_lost),
+ audit_rate_limit,
+ audit_backlog_limit);
+ audit_panic(message);
+ }
+}
+
+static int audit_log_config_change(char *function_name, int new, int old,
+ uid_t loginuid, u32 sessionid, u32 sid,
+ int allow_changes)
+{
+ struct audit_buffer *ab;
+ int rc = 0;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
+ old, loginuid, sessionid);
+ if (sid) {
+ char *ctx = NULL;
+ u32 len;
+
+ rc = security_secid_to_secctx(sid, &ctx, &len);
+ if (rc) {
+ audit_log_format(ab, " sid=%u", sid);
+ allow_changes = 0; /* Something weird, deny request */
+ } else {
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+ audit_log_format(ab, " res=%d", allow_changes);
+ audit_log_end(ab);
+ return rc;
+}
+
+static int audit_do_config_change(char *function_name, int *to_change,
+ int new, uid_t loginuid, u32 sessionid,
+ u32 sid)
+{
+ int allow_changes, rc = 0, old = *to_change;
+
+ /* check if we are locked */
+ if (audit_enabled == AUDIT_LOCKED)
+ allow_changes = 0;
+ else
+ allow_changes = 1;
+
+ if (audit_enabled != AUDIT_OFF) {
+ rc = audit_log_config_change(function_name, new, old, loginuid,
+ sessionid, sid, allow_changes);
+ if (rc)
+ allow_changes = 0;
+ }
+
+ /* If we are allowed, make the change */
+ if (allow_changes == 1)
+ *to_change = new;
+ /* Not allowed, update reason */
+ else if (rc == 0)
+ rc = -EPERM;
+ return rc;
+}
+
+static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+ u32 sid)
+{
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
+ limit, loginuid, sessionid, sid);
+}
+
+static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+ u32 sid)
+{
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
+ limit, loginuid, sessionid, sid);
+}
+
+static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+{
+ int rc;
+ if (state < AUDIT_OFF || state > AUDIT_LOCKED)
+ return -EINVAL;
+
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
+ loginuid, sessionid, sid);
+
+ if (!rc)
+ audit_ever_enabled |= !!state;
+
+ return rc;
+}
+
+static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+{
+ if (state != AUDIT_FAIL_SILENT
+ && state != AUDIT_FAIL_PRINTK
+ && state != AUDIT_FAIL_PANIC)
+ return -EINVAL;
+
+ return audit_do_config_change("audit_failure", &audit_failure, state,
+ loginuid, sessionid, sid);
+}
+
+/*
+ * Queue skbs to be sent to auditd when/if it comes back. These skbs should
+ * already have been sent via printk/syslog and so if these messages are dropped
+ * it is not a huge concern since we already passed the audit_log_lost()
+ * notification and stuff. This is just nice to get audit messages during
+ * boot before auditd is running or messages generated while auditd is stopped.
+ * This only holds messages if audit_default is set, aka booting with audit=1
+ * or building your kernel that way.
+ */
+static void audit_hold_skb(struct sk_buff *skb)
+{
+ if (audit_default &&
+ skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+ skb_queue_tail(&audit_skb_hold_queue, skb);
+ else
+ kfree_skb(skb);
+}
+
+static void kauditd_send_skb(struct sk_buff *skb)
+{
+ int err;
+ /* take a reference in case we can't send it and we want to hold it */
+ skb_get(skb);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+ if (err < 0) {
+ BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd disappeared\n");
+ audit_pid = 0;
+ /* we might get lucky and get this in the next auditd */
+ audit_hold_skb(skb);
+ } else
+ /* drop the extra reference if sent ok */
+ kfree_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
+ struct sk_buff *skb;
+
+ set_freezable();
+ while (!kthread_should_stop()) {
+ /*
+ * if auditd just started, drain the queue of messages already
+ * sent to syslog/printk. remember loss here is ok. we already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * if you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what I'm thinking today.
+ */
+ if (audit_default && audit_pid) {
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (unlikely(skb)) {
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+ }
+ }
+
+ skb = skb_dequeue(&audit_skb_queue);
+ wake_up(&audit_backlog_wait);
+ if (skb) {
+ if (audit_pid)
+ kauditd_send_skb(skb);
+ else {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
+ else
+ audit_log_lost("printk limit exceeded\n");
+
+ audit_hold_skb(skb);
+ }
+ } else {
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
+
+ if (!skb_queue_len(&audit_skb_queue)) {
+ try_to_freeze();
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
+ }
+ }
+ return 0;
+}
+
+static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
+{
+ struct task_struct *tsk;
+ int err;
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ err = -ESRCH;
+ if (!tsk)
+ goto out;
+ err = 0;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!tsk->signal->audit_tty)
+ err = -EPERM;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (err)
+ goto out;
+
+ tty_audit_push_task(tsk, loginuid, sessionid);
+out:
+ read_unlock(&tasklist_lock);
+ return err;
+}
+
+int audit_send_list(void *_dest)
+{
+ struct audit_netlink_list *dest = _dest;
+ int pid = dest->pid;
+ struct sk_buff *skb;
+
+ /* wait for parent to finish and send an ACK */
+ mutex_lock(&audit_cmd_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+
+ while ((skb = __skb_dequeue(&dest->q)) != NULL)
+ netlink_unicast(audit_sock, skb, pid, 0);
+
+ kfree(dest);
+
+ return 0;
+}
+
+#ifdef CONFIG_AUDIT_TREE
+static int prune_tree_thread(void *unused)
+{
+ mutex_lock(&audit_cmd_mutex);
+ audit_prune_trees();
+ mutex_unlock(&audit_cmd_mutex);
+ return 0;
+}
+
+void audit_schedule_prune(void)
+{
+ kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+}
+#endif
+
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+ int multi, void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int len = NLMSG_SPACE(size);
+ void *data;
+ int flags = multi ? NLM_F_MULTI : 0;
+ int t = done ? NLMSG_DONE : type;
+
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ nlh = NLMSG_PUT(skb, pid, seq, t, size);
+ nlh->nlmsg_flags = flags;
+ data = NLMSG_DATA(nlh);
+ memcpy(data, payload, size);
+ return skb;
+
+nlmsg_failure: /* Used by NLMSG_PUT */
+ if (skb)
+ kfree_skb(skb);
+ return NULL;
+}
+
+static int audit_send_reply_thread(void *arg)
+{
+ struct audit_reply *reply = (struct audit_reply *)arg;
+
+ mutex_lock(&audit_cmd_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+
+ /* Ignore failure. It'll only happen if the sender goes away,
+ because our timeout is set to infinite. */
+ netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+ kfree(reply);
+ return 0;
+}
+/**
+ * audit_send_reply - send an audit reply message via netlink
+ * @pid: process id to send reply to
+ * @seq: sequence number
+ * @type: audit message type
+ * @done: done (last) flag
+ * @multi: multi-part message flag
+ * @payload: payload data
+ * @size: payload size
+ *
+ * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * No failure notifications.
+ */
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+ void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct task_struct *tsk;
+ struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
+ GFP_KERNEL);
+
+ if (!reply)
+ return;
+
+ skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
+ if (!skb)
+ goto out;
+
+ reply->pid = pid;
+ reply->skb = skb;
+
+ tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
+ if (!IS_ERR(tsk))
+ return;
+ kfree_skb(skb);
+out:
+ kfree(reply);
+}
+
+/*
+ * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
+ * control messages.
+ */
+static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
+{
+ int err = 0;
+
+ switch (msg_type) {
+ case AUDIT_GET:
+ case AUDIT_LIST:
+ case AUDIT_LIST_RULES:
+ case AUDIT_SET:
+ case AUDIT_ADD:
+ case AUDIT_ADD_RULE:
+ case AUDIT_DEL:
+ case AUDIT_DEL_RULE:
+ case AUDIT_SIGNAL_INFO:
+ case AUDIT_TTY_GET:
+ case AUDIT_TTY_SET:
+ case AUDIT_TRIM:
+ case AUDIT_MAKE_EQUIV:
+ if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+ err = -EPERM;
+ break;
+ case AUDIT_USER:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
+ if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+ err = -EPERM;
+ break;
+ default: /* bad msg */
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
+ u32 pid, u32 uid, uid_t auid, u32 ses,
+ u32 sid)
+{
+ int rc = 0;
+ char *ctx = NULL;
+ u32 len;
+
+ if (!audit_enabled) {
+ *ab = NULL;
+ return rc;
+ }
+
+ *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+ pid, uid, auid, ses);
+ if (sid) {
+ rc = security_secid_to_secctx(sid, &ctx, &len);
+ if (rc)
+ audit_log_format(*ab, " ssid=%u", sid);
+ else {
+ audit_log_format(*ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ return rc;
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ u32 uid, pid, seq, sid;
+ void *data;
+ struct audit_status *status_get, status_set;
+ int err;
+ struct audit_buffer *ab;
+ u16 msg_type = nlh->nlmsg_type;
+ uid_t loginuid; /* loginuid of sender */
+ u32 sessionid;
+ struct audit_sig_info *sig_data;
+ char *ctx = NULL;
+ u32 len;
+
+ err = audit_netlink_ok(skb, msg_type);
+ if (err)
+ return err;
+
+ /* As soon as there's any sign of userspace auditd,
+ * start kauditd to talk to it */
+ if (!kauditd_task)
+ kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
+
+ pid = NETLINK_CREDS(skb)->pid;
+ uid = NETLINK_CREDS(skb)->uid;
+ loginuid = NETLINK_CB(skb).loginuid;
+ sessionid = NETLINK_CB(skb).sessionid;
+ sid = NETLINK_CB(skb).sid;
+ seq = nlh->nlmsg_seq;
+ data = NLMSG_DATA(nlh);
+
+ switch (msg_type) {
+ case AUDIT_GET:
+ status_set.enabled = audit_enabled;
+ status_set.failure = audit_failure;
+ status_set.pid = audit_pid;
+ status_set.rate_limit = audit_rate_limit;
+ status_set.backlog_limit = audit_backlog_limit;
+ status_set.lost = atomic_read(&audit_lost);
+ status_set.backlog = skb_queue_len(&audit_skb_queue);
+ audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+ &status_set, sizeof(status_set));
+ break;
+ case AUDIT_SET:
+ if (nlh->nlmsg_len < sizeof(struct audit_status))
+ return -EINVAL;
+ status_get = (struct audit_status *)data;
+ if (status_get->mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(status_get->enabled,
+ loginuid, sessionid, sid);
+ if (err < 0)
+ return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(status_get->failure,
+ loginuid, sessionid, sid);
+ if (err < 0)
+ return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_PID) {
+ int new_pid = status_get->pid;
+
+ if (audit_enabled != AUDIT_OFF)
+ audit_log_config_change("audit_pid", new_pid,
+ audit_pid, loginuid,
+ sessionid, sid, 1);
+
+ audit_pid = new_pid;
+ audit_nlk_pid = NETLINK_CB(skb).pid;
+ }
+ if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
+ err = audit_set_rate_limit(status_get->rate_limit,
+ loginuid, sessionid, sid);
+ if (err < 0)
+ return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+ err = audit_set_backlog_limit(status_get->backlog_limit,
+ loginuid, sessionid, sid);
+ break;
+ case AUDIT_USER:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+ return 0;
+
+ err = audit_filter_user(&NETLINK_CB(skb));
+ if (err == 1) {
+ err = 0;
+ if (msg_type == AUDIT_USER_TTY) {
+ err = audit_prepare_user_tty(pid, loginuid,
+ sessionid);
+ if (err)
+ break;
+ }
+ audit_log_common_recv_msg(&ab, msg_type, pid, uid,
+ loginuid, sessionid, sid);
+
+ if (msg_type != AUDIT_USER_TTY)
+ audit_log_format(ab, " msg='%.1024s'",
+ (char *)data);
+ else {
+ int size;
+
+ audit_log_format(ab, " msg=");
+ size = nlmsg_len(nlh);
+ audit_log_n_untrustedstring(ab, data, size);
+ }
+ audit_set_pid(ab, pid);
+ audit_log_end(ab);
+ }
+ break;
+ case AUDIT_ADD:
+ case AUDIT_DEL:
+ if (nlmsg_len(nlh) < sizeof(struct audit_rule))
+ return -EINVAL;
+ if (audit_enabled == AUDIT_LOCKED) {
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+ uid, loginuid, sessionid, sid);
+
+ audit_log_format(ab, " audit_enabled=%d res=0",
+ audit_enabled);
+ audit_log_end(ab);
+ return -EPERM;
+ }
+ /* fallthrough */
+ case AUDIT_LIST:
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
+ uid, seq, data, nlmsg_len(nlh),
+ loginuid, sessionid, sid);
+ break;
+ case AUDIT_ADD_RULE:
+ case AUDIT_DEL_RULE:
+ if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ return -EINVAL;
+ if (audit_enabled == AUDIT_LOCKED) {
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+ uid, loginuid, sessionid, sid);
+
+ audit_log_format(ab, " audit_enabled=%d res=0",
+ audit_enabled);
+ audit_log_end(ab);
+ return -EPERM;
+ }
+ /* fallthrough */
+ case AUDIT_LIST_RULES:
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
+ uid, seq, data, nlmsg_len(nlh),
+ loginuid, sessionid, sid);
+ break;
+ case AUDIT_TRIM:
+ audit_trim_trees();
+
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+ uid, loginuid, sessionid, sid);
+
+ audit_log_format(ab, " op=trim res=1");
+ audit_log_end(ab);
+ break;
+ case AUDIT_MAKE_EQUIV: {
+ void *bufp = data;
+ u32 sizes[2];
+ size_t msglen = nlmsg_len(nlh);
+ char *old, *new;
+
+ err = -EINVAL;
+ if (msglen < 2 * sizeof(u32))
+ break;
+ memcpy(sizes, bufp, 2 * sizeof(u32));
+ bufp += 2 * sizeof(u32);
+ msglen -= 2 * sizeof(u32);
+ old = audit_unpack_string(&bufp, &msglen, sizes[0]);
+ if (IS_ERR(old)) {
+ err = PTR_ERR(old);
+ break;
+ }
+ new = audit_unpack_string(&bufp, &msglen, sizes[1]);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ kfree(old);
+ break;
+ }
+ /* OK, here comes... */
+ err = audit_tag_tree(old, new);
+
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+ uid, loginuid, sessionid, sid);
+
+ audit_log_format(ab, " op=make_equiv old=");
+ audit_log_untrustedstring(ab, old);
+ audit_log_format(ab, " new=");
+ audit_log_untrustedstring(ab, new);
+ audit_log_format(ab, " res=%d", !err);
+ audit_log_end(ab);
+ kfree(old);
+ kfree(new);
+ break;
+ }
+ case AUDIT_SIGNAL_INFO:
+ err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+ if (err)
+ return err;
+ sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ if (!sig_data) {
+ security_release_secctx(ctx, len);
+ return -ENOMEM;
+ }
+ sig_data->uid = audit_sig_uid;
+ sig_data->pid = audit_sig_pid;
+ memcpy(sig_data->ctx, ctx, len);
+ security_release_secctx(ctx, len);
+ audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
+ 0, 0, sig_data, sizeof(*sig_data) + len);
+ kfree(sig_data);
+ break;
+ case AUDIT_TTY_GET: {
+ struct audit_tty_status s;
+ struct task_struct *tsk;
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (!tsk)
+ err = -ESRCH;
+ else {
+ spin_lock_irq(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty != 0;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+ audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
+ &s, sizeof(s));
+ break;
+ }
+ case AUDIT_TTY_SET: {
+ struct audit_tty_status *s;
+ struct task_struct *tsk;
+
+ if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
+ return -EINVAL;
+ s = data;
+ if (s->enabled != 0 && s->enabled != 1)
+ return -EINVAL;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (!tsk)
+ err = -ESRCH;
+ else {
+ spin_lock_irq(&tsk->sighand->siglock);
+ tsk->signal->audit_tty = s->enabled != 0;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+ break;
+ }
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err < 0 ? err : 0;
+}
+
+/*
+ * Get message from skb (based on rtnetlink_rcv_skb). Each message is
+ * processed by audit_receive_msg. Malformed skbs with wrong length are
+ * discarded silently.
+ */
+static void audit_receive_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr *nlh;
+ u32 rlen;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ nlh = nlmsg_hdr(skb);
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ if ((err = audit_receive_msg(skb, nlh))) {
+ netlink_ack(skb, nlh, err);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sk_buff *skb)
+{
+ mutex_lock(&audit_cmd_mutex);
+ audit_receive_skb(skb);
+ mutex_unlock(&audit_cmd_mutex);
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+ .handle_event = audit_handle_ievent,
+ .destroy_watch = audit_free_parent,
+};
+#endif
+
+/* Initialize audit support at boot time. */
+static int __init audit_init(void)
+{
+ int i;
+
+ if (audit_initialized == AUDIT_DISABLED)
+ return 0;
+
+ printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
+ audit_receive, NULL, THIS_MODULE);
+ if (!audit_sock)
+ audit_panic("cannot initialize netlink socket");
+ else
+ audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ skb_queue_head_init(&audit_skb_queue);
+ skb_queue_head_init(&audit_skb_hold_queue);
+ audit_initialized = AUDIT_INITIALIZED;
+ audit_enabled = audit_default;
+ audit_ever_enabled |= !!audit_default;
+
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+ audit_ih = inotify_init(&audit_inotify_ops);
+ if (IS_ERR(audit_ih))
+ audit_panic("cannot initialize inotify handle");
+#endif
+
+ for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+ INIT_LIST_HEAD(&audit_inode_hash[i]);
+
+ return 0;
+}
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+ audit_default = !!simple_strtol(str, NULL, 0);
+ if (!audit_default)
+ audit_initialized = AUDIT_DISABLED;
+
+ printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+
+ if (audit_initialized == AUDIT_INITIALIZED) {
+ audit_enabled = audit_default;
+ audit_ever_enabled |= !!audit_default;
+ } else if (audit_initialized == AUDIT_UNINITIALIZED) {
+ printk(" (after initialization)");
+ } else {
+ printk(" (until reboot)");
+ }
+ printk("\n");
+
+ return 1;
+}
+
+__setup("audit=", audit_enable);
+
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ if (!ab)
+ return;
+
+ if (ab->skb)
+ kfree_skb(ab->skb);
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (audit_freelist_count > AUDIT_MAXFREE)
+ kfree(ab);
+ else {
+ audit_freelist_count++;
+ list_add(&ab->list, &audit_freelist);
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+ gfp_t gfp_mask, int type)
+{
+ unsigned long flags;
+ struct audit_buffer *ab = NULL;
+ struct nlmsghdr *nlh;
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (!list_empty(&audit_freelist)) {
+ ab = list_entry(audit_freelist.next,
+ struct audit_buffer, list);
+ list_del(&ab->list);
+ --audit_freelist_count;
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+ if (!ab) {
+ ab = kmalloc(sizeof(*ab), gfp_mask);
+ if (!ab)
+ goto err;
+ }
+
+ ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
+ if (!ab->skb)
+ goto err;
+
+ ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
+ nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+ nlh->nlmsg_type = type;
+ nlh->nlmsg_flags = 0;
+ nlh->nlmsg_pid = 0;
+ nlh->nlmsg_seq = 0;
+ return ab;
+err:
+ audit_buffer_free(ab);
+ return NULL;
+}
+
+/**
+ * audit_serial - compute a serial number for the audit record
+ *
+ * Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts).
+ */
+unsigned int audit_serial(void)
+{
+ static DEFINE_SPINLOCK(serial_lock);
+ static unsigned int serial = 0;
+
+ unsigned long flags;
+ unsigned int ret;
+
+ spin_lock_irqsave(&serial_lock, flags);
+ do {
+ ret = ++serial;
+ } while (unlikely(!ret));
+ spin_unlock_irqrestore(&serial_lock, flags);
+
+ return ret;
+}
+
+static inline void audit_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
+{
+ if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
+ *t = CURRENT_TIME;
+ *serial = audit_serial();
+ }
+}
+
+/* Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, tsk
+ * should be NULL. */
+
+/**
+ * audit_log_start - obtain an audit buffer
+ * @ctx: audit_context (may be NULL)
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ *
+ * Returns audit_buffer pointer on success or NULL on error.
+ *
+ * Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the task (ctx) is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, then
+ * task context (ctx) should be NULL.
+ */
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
+ int type)
+{
+ struct audit_buffer *ab = NULL;
+ struct timespec t;
+ unsigned int uninitialized_var(serial);
+ int reserve;
+ unsigned long timeout_start = jiffies;
+
+ if (audit_initialized != AUDIT_INITIALIZED)
+ return NULL;
+
+ if (unlikely(audit_filter_type(type)))
+ return NULL;
+
+ if (gfp_mask & __GFP_WAIT)
+ reserve = 0;
+ else
+ reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
+
+ while (audit_backlog_limit
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+ && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+ /* Wait for auditd to drain the queue a little */
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ continue;
+ }
+ if (audit_rate_check() && printk_ratelimit())
+ printk(KERN_WARNING
+ "audit: audit_backlog=%d > "
+ "audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_skb_queue),
+ audit_backlog_limit);
+ audit_log_lost("backlog limit exceeded");
+ audit_backlog_wait_time = audit_backlog_wait_overflow;
+ wake_up(&audit_backlog_wait);
+ return NULL;
+ }
+
+ ab = audit_buffer_alloc(ctx, gfp_mask, type);
+ if (!ab) {
+ audit_log_lost("out of memory in audit_log_start");
+ return NULL;
+ }
+
+ audit_get_stamp(ab->ctx, &t, &serial);
+
+ audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+ t.tv_sec, t.tv_nsec/1000000, serial);
+ return ab;
+}
+
+/**
+ * audit_expand - expand skb in the audit buffer
+ * @ab: audit_buffer
+ * @extra: space to add at tail of the skb
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab, int extra)
+{
+ struct sk_buff *skb = ab->skb;
+ int oldtail = skb_tailroom(skb);
+ int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
+ int newtail = skb_tailroom(skb);
+
+ if (ret < 0) {
+ audit_log_lost("out of memory in audit_expand");
+ return 0;
+ }
+
+ skb->truesize += newtail - oldtail;
+ return newtail;
+}
+
+/*
+ * Format an audit message into the audit buffer. If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprintf
+ * will be called a second time. Currently, we assume that a printk
+ * can't format a message larger than 1024 bytes, so we don't either.
+ */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+ va_list args)
+{
+ int len, avail;
+ struct sk_buff *skb;
+ va_list args2;
+
+ if (!ab)
+ return;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ if (avail == 0) {
+ avail = audit_expand(ab, AUDIT_BUFSIZ);
+ if (!avail)
+ goto out;
+ }
+ va_copy(args2, args);
+ len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
+ if (len >= avail) {
+ /* The printk buffer is 1024 bytes long, so if we get
+ * here and AUDIT_BUFSIZ is at least 1024, then we can
+ * log everything that printk could have logged. */
+ avail = audit_expand(ab,
+ max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+ if (!avail)
+ goto out;
+ len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
+ }
+ va_end(args2);
+ if (len > 0)
+ skb_put(skb, len);
+out:
+ return;
+}
+
+/**
+ * audit_log_format - format a message into the audit buffer.
+ * @ab: audit_buffer
+ * @fmt: format string
+ * @...: optional parameters matching @fmt string
+ *
+ * All the work is done in audit_log_vformat.
+ */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!ab)
+ return;
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+}
+
+/**
+ * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
+ * @ab: the audit_buffer
+ * @buf: buffer to convert to hex
+ * @len: length of @buf to be converted
+ *
+ * No return value; failure to expand is silently ignored.
+ *
+ * This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb.
+ */
+void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
+ size_t len)
+{
+ int i, avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+ static const unsigned char *hex = "0123456789ABCDEF";
+
+ if (!ab)
+ return;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = len<<1;
+ if (new_len >= avail) {
+ /* Round the buffer request up to the next multiple */
+ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
+
+ ptr = skb_tail_pointer(skb);
+ for (i=0; i<len; i++) {
+ *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
+ *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
+ }
+ *ptr = 0;
+ skb_put(skb, len << 1); /* new string is twice the old string */
+}
+
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+void audit_log_n_string(struct audit_buffer *ab, const char *string,
+ size_t slen)
+{
+ int avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+
+ if (!ab)
+ return;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = slen + 3; /* enclosing quotes + null terminator */
+ if (new_len > avail) {
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
+ ptr = skb_tail_pointer(skb);
+ *ptr++ = '"';
+ memcpy(ptr, string, slen);
+ ptr += slen;
+ *ptr++ = '"';
+ *ptr = 0;
+ skb_put(skb, slen + 2); /* don't include null terminator */
+}
+
+/**
+ * audit_string_contains_control - does a string need to be logged in hex
+ * @string: string to be checked
+ * @len: max length of the string to check
+ */
+int audit_string_contains_control(const char *string, size_t len)
+{
+ const unsigned char *p;
+ for (p = string; p < (const unsigned char *)string + len && *p; p++) {
+ if (*p == '"' || *p < 0x21 || *p > 0x7e)
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * audit_log_n_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ * @len: length of string (not including trailing null)
+ *
+ * This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
+ */
+void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
+ size_t len)
+{
+ if (audit_string_contains_control(string, len))
+ audit_log_n_hex(ab, string, len);
+ else
+ audit_log_n_string(ab, string, len);
+}
+
+/**
+ * audit_log_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_untrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+ audit_log_n_untrustedstring(ab, string, strlen(string));
+}
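/*
 * For example, audit_log_untrustedstring(ab, "/etc/passwd") is emitted as
 * "/etc/passwd" (quoted), while "/tmp/my file" contains a space
 * (0x20 < 0x21) and is therefore logged as uppercase hex:
 * 2F746D702F6D792066696C65.
 */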
+
+/* This is a helper function to print the escaped d_path */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+ struct path *path)
+{
+ char *p, *pathname;
+
+ if (prefix)
+ audit_log_format(ab, " %s", prefix);
+
+ /* We will allow 11 spaces for ' (deleted)' to be appended */
+ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+ if (!pathname) {
+ audit_log_format(ab, "<no memory>");
+ return;
+ }
+ p = d_path(path, pathname, PATH_MAX+11);
+ if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+ /* FIXME: can we save some information here? */
+ audit_log_format(ab, "<too long>");
+ } else
+ audit_log_untrustedstring(ab, p);
+ kfree(pathname);
+}
+
+/**
+ * audit_log_end - end one audit record
+ * @ab: the audit_buffer
+ *
+ * The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and the kauditd thread is woken
+ * to remove them from the queue outside the irq context. May be called in
+ * any context.
+ */
+void audit_log_end(struct audit_buffer *ab)
+{
+ if (!ab)
+ return;
+ if (!audit_rate_check()) {
+ audit_log_lost("rate limit exceeded");
+ } else {
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+ nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
+ if (audit_pid) {
+ skb_queue_tail(&audit_skb_queue, ab->skb);
+ wake_up_interruptible(&kauditd_wait);
+ } else {
+ if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit()) {
+ printk(KERN_NOTICE "type=%d %s\n",
+ nlh->nlmsg_type,
+ ab->skb->data + NLMSG_SPACE(0));
+ } else
+ audit_log_lost("printk limit exceeded\n");
+ }
+ audit_hold_skb(ab->skb);
+ }
+ ab->skb = NULL;
+ }
+ audit_buffer_free(ab);
+}
+
+/**
+ * audit_log - Log an audit record
+ * @ctx: audit context
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ * @fmt: format string to use
+ * @...: variable parameters matching the format string
+ *
+ * This is a convenience function that calls audit_log_start,
+ * audit_log_vformat, and audit_log_end. It may be called
+ * in any context.
+ */
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
+ const char *fmt, ...)
+{
+ struct audit_buffer *ab;
+ va_list args;
+
+ ab = audit_log_start(ctx, gfp_mask, type);
+ if (ab) {
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+ audit_log_end(ab);
+ }
+}
+
+EXPORT_SYMBOL(audit_log_start);
+EXPORT_SYMBOL(audit_log_end);
+EXPORT_SYMBOL(audit_log_format);
+EXPORT_SYMBOL(audit_log);
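The exported calls above are meant to be used as a set; the following is a minimal usage sketch with a hypothetical caller name, modeled on the AUDIT_CONFIG_CHANGE records built elsewhere in this patch (e.g. the AUDIT_TRIM handler and kill_rules()).

/* Illustrative sketch only: emit one multi-part record via the API above. */
static void example_log_config_change(struct audit_context *ctx, const char *key)
{
        struct audit_buffer *ab;

        ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (!ab)        /* uninitialized, filtered, backlog full or OOM */
                return;
        audit_log_format(ab, "op=example key=");
        audit_log_untrustedstring(ab, key);     /* hex-escaped when needed */
        audit_log_format(ab, " res=1");
        audit_log_end(ab);      /* queued to kauditd, or printk fallback */
}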
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 0000000..9d67174
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,169 @@
+/* audit -- definition of audit_context structure and supporting types
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/audit.h>
+#include <linux/skbuff.h>
+
+/* 0 = no checking
+ 1 = put_count checking
+ 2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* At task start time, the audit_state is set in the audit_context using
+ a per-task filter. At syscall entry, the audit_state is augmented by
+ the syscall filter. */
+enum audit_state {
+ AUDIT_DISABLED, /* Do not create per-task audit_context.
+ * No syscall-specific audit records can
+ * be generated. */
+ AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
+ * but don't necessarily fill it in at
+ * syscall entry time (i.e., filter
+ * instead). */
+ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ * and always fill it in at syscall
+ * entry time. This makes a full
+ * syscall record available if some
+ * other part of the kernel decides it
+ * should be recorded. */
+ AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ * always fill it in at syscall entry
+ * time, and always write out the audit
+ * record at syscall exit time. */
+};
+
+/* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+ atomic_t count; /* reference count */
+ char *path; /* insertion path */
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ struct audit_parent *parent; /* associated parent */
+ struct list_head wlist; /* entry in parent->watches list */
+ struct list_head rules; /* associated rules */
+};
+
+struct audit_tree;
+struct audit_chunk;
+
+struct audit_entry {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct audit_krule rule;
+};
+
+#ifdef CONFIG_AUDIT
+extern int audit_enabled;
+extern int audit_ever_enabled;
+#endif
+
+extern int audit_pid;
+
+#define AUDIT_INODE_BUCKETS 32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+ return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_match_class(int class, unsigned syscall);
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+ int *dirlen);
+extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
+ int done, int multi,
+ void *payload, int size);
+extern void audit_send_reply(int pid, int seq, int type,
+ int done, int multi,
+ void *payload, int size);
+extern void audit_panic(const char *message);
+
+struct audit_netlink_list {
+ int pid;
+ struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+ const char *, struct inode *);
+extern int selinux_audit_rule_update(void);
+
+extern struct mutex audit_filter_mutex;
+extern void audit_free_rule_rcu(struct rcu_head *);
+extern struct list_head audit_filter_list[];
+
+#ifdef CONFIG_AUDIT_TREE
+extern struct audit_chunk *audit_tree_lookup(const struct inode *);
+extern void audit_put_chunk(struct audit_chunk *);
+extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
+extern int audit_make_tree(struct audit_krule *, char *, u32);
+extern int audit_add_tree_rule(struct audit_krule *);
+extern int audit_remove_tree_rule(struct audit_krule *);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern void audit_schedule_prune(void);
+extern void audit_prune_trees(void);
+extern const char *audit_tree_path(struct audit_tree *);
+extern void audit_put_tree(struct audit_tree *);
+#else
+#define audit_remove_tree_rule(rule) BUG()
+#define audit_add_tree_rule(rule) -EINVAL
+#define audit_make_tree(rule, str, op) -EINVAL
+#define audit_trim_trees() (void)0
+#define audit_put_tree(tree) (void)0
+#define audit_tag_tree(old, new) -EINVAL
+#define audit_tree_path(rule) "" /* never called */
+#endif
+
+extern char *audit_unpack_string(void **, size_t *, size_t);
+
+extern pid_t audit_sig_pid;
+extern uid_t audit_sig_uid;
+extern u32 audit_sig_sid;
+
+#ifdef CONFIG_AUDITSYSCALL
+extern int __audit_signal_info(int sig, struct task_struct *t);
+static inline int audit_signal_info(int sig, struct task_struct *t)
+{
+ if (unlikely((audit_pid && t->tgid == audit_pid) ||
+ (audit_signals && !audit_dummy_context())))
+ return __audit_signal_info(sig, t);
+ return 0;
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+ struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t) AUDIT_DISABLED
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
+#endif
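A minimal sketch, with a hypothetical helper name, of how the audit_inode_hash buckets declared above are typically consulted; the RCU traversal over struct audit_entry via its list member mirrors how the syscall-exit filtering code walks these buckets, and the actual rule matching is elided.

/* Illustrative sketch only: visit every rule hashed to an inode number. */
static inline void example_walk_inode_bucket(u32 ino)
{
        struct list_head *bucket = &audit_inode_hash[audit_hash_ino(ino)];
        struct audit_entry *e;

        rcu_read_lock();
        list_for_each_entry_rcu(e, bucket, list) {
                /* compare e->rule against the inode being audited */
        }
        rcu_read_unlock();
}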
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
new file mode 100644
index 0000000..8b50944
--- /dev/null
+++ b/kernel/audit_tree.c
@@ -0,0 +1,919 @@
+#include "audit.h"
+#include <linux/inotify.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+
+struct audit_tree;
+struct audit_chunk;
+
+struct audit_tree {
+ atomic_t count;
+ int goner;
+ struct audit_chunk *root;
+ struct list_head chunks;
+ struct list_head rules;
+ struct list_head list;
+ struct list_head same_root;
+ struct rcu_head head;
+ char pathname[];
+};
+
+struct audit_chunk {
+ struct list_head hash;
+ struct inotify_watch watch;
+ struct list_head trees; /* with root here */
+ int dead;
+ int count;
+ atomic_long_t refs;
+ struct rcu_head head;
+ struct node {
+ struct list_head list;
+ struct audit_tree *owner;
+ unsigned index; /* index; upper bit indicates 'will prune' */
+ } owners[];
+};
+
+static LIST_HEAD(tree_list);
+static LIST_HEAD(prune_list);
+
+/*
+ * One struct chunk is attached to each inode of interest.
+ * We replace struct chunk on tagging/untagging.
+ * Rules have pointer to struct audit_tree.
+ * Rules have struct list_head rlist forming a list of rules over
+ * the same tree.
+ * References to struct chunk are collected at audit_inode{,_child}()
+ * time and used in AUDIT_TREE rule matching.
+ * These references are dropped at the same time we are calling
+ * audit_free_names(), etc.
+ *
+ * Cyclic lists galore:
+ * tree.chunks anchors chunk.owners[].list hash_lock
+ * tree.rules anchors rule.rlist audit_filter_mutex
+ * chunk.trees anchors tree.same_root hash_lock
+ * chunk.hash is a hash with middle bits of watch.inode as
+ * a hash function. RCU, hash_lock
+ *
+ * tree is refcounted; one reference for "some rules on rules_list refer to
+ * it", one for each chunk with pointer to it.
+ *
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
+ *
+ * node.index allows to get from node.list to containing chunk.
+ * MSB of that sucker is stolen to mark taggings that we might have to
+ * revert - several operations have very unpleasant cleanup logics and
+ * that makes a difference. Some.
+ */
+
+static struct inotify_handle *rtree_ih;
+
+static struct audit_tree *alloc_tree(const char *s)
+{
+ struct audit_tree *tree;
+
+ tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ if (tree) {
+ atomic_set(&tree->count, 1);
+ tree->goner = 0;
+ INIT_LIST_HEAD(&tree->chunks);
+ INIT_LIST_HEAD(&tree->rules);
+ INIT_LIST_HEAD(&tree->list);
+ INIT_LIST_HEAD(&tree->same_root);
+ tree->root = NULL;
+ strcpy(tree->pathname, s);
+ }
+ return tree;
+}
+
+static inline void get_tree(struct audit_tree *tree)
+{
+ atomic_inc(&tree->count);
+}
+
+static void __put_tree(struct rcu_head *rcu)
+{
+ struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
+ kfree(tree);
+}
+
+static inline void put_tree(struct audit_tree *tree)
+{
+ if (atomic_dec_and_test(&tree->count))
+ call_rcu(&tree->head, __put_tree);
+}
+
+/* to avoid bringing the entire thing in audit.h */
+const char *audit_tree_path(struct audit_tree *tree)
+{
+ return tree->pathname;
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+ struct audit_chunk *chunk;
+ size_t size;
+ int i;
+
+ size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+ chunk = kzalloc(size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+ }
+ inotify_init_watch(&chunk->watch);
+ return chunk;
+}
+
+static void free_chunk(struct audit_chunk *chunk)
+{
+ int i;
+
+ for (i = 0; i < chunk->count; i++) {
+ if (chunk->owners[i].owner)
+ put_tree(chunk->owners[i].owner);
+ }
+ kfree(chunk);
+}
+
+void audit_put_chunk(struct audit_chunk *chunk)
+{
+ if (atomic_long_dec_and_test(&chunk->refs))
+ free_chunk(chunk);
+}
+
+static void __put_chunk(struct rcu_head *rcu)
+{
+ struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+ audit_put_chunk(chunk);
+}
+
+enum {HASH_SIZE = 128};
+static struct list_head chunk_hash_heads[HASH_SIZE];
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
+
+static inline struct list_head *chunk_hash(const struct inode *inode)
+{
+ unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+ return chunk_hash_heads + n % HASH_SIZE;
+}
+
+/* hash_lock is held by caller */
+static void insert_hash(struct audit_chunk *chunk)
+{
+ struct list_head *list = chunk_hash(chunk->watch.inode);
+ list_add_rcu(&chunk->hash, list);
+}
+
+/* called under rcu_read_lock */
+struct audit_chunk *audit_tree_lookup(const struct inode *inode)
+{
+ struct list_head *list = chunk_hash(inode);
+ struct audit_chunk *p;
+
+ list_for_each_entry_rcu(p, list, hash) {
+ if (p->watch.inode == inode) {
+ atomic_long_inc(&p->refs);
+ return p;
+ }
+ }
+ return NULL;
+}
+
+int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
+{
+ int n;
+ for (n = 0; n < chunk->count; n++)
+ if (chunk->owners[n].owner == tree)
+ return 1;
+ return 0;
+}
+
+/* tagging and untagging inodes with trees */
+
+static struct audit_chunk *find_chunk(struct node *p)
+{
+ int index = p->index & ~(1U<<31);
+ p -= index;
+ return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
+{
+ struct audit_chunk *chunk = find_chunk(p);
+ struct audit_chunk *new;
+ struct audit_tree *owner;
+ int size = chunk->count - 1;
+ int i, j;
+
+ if (!pin_inotify_watch(&chunk->watch)) {
+ /*
+ * Filesystem is shutting down; all watches are getting
+ * evicted, just take it off the node list for this
+ * tree and let the eviction logics take care of the
+ * rest.
+ */
+ owner = p->owner;
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+ return;
+ }
+
+ spin_unlock(&hash_lock);
+
+ /*
+ * pin_inotify_watch() succeeded, so the watch won't go away
+ * from under us.
+ */
+ mutex_lock(&chunk->watch.inode->inotify_mutex);
+ if (chunk->dead) {
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ goto out;
+ }
+
+ owner = p->owner;
+
+ if (!size) {
+ chunk->dead = 1;
+ spin_lock(&hash_lock);
+ list_del_init(&chunk->trees);
+ if (owner->root == chunk)
+ owner->root = NULL;
+ list_del_init(&p->list);
+ list_del_rcu(&chunk->hash);
+ spin_unlock(&hash_lock);
+ inotify_evict_watch(&chunk->watch);
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ put_inotify_watch(&chunk->watch);
+ goto out;
+ }
+
+ new = alloc_chunk(size);
+ if (!new)
+ goto Fallback;
+ if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
+ free_chunk(new);
+ goto Fallback;
+ }
+
+ chunk->dead = 1;
+ spin_lock(&hash_lock);
+ list_replace_init(&chunk->trees, &new->trees);
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+
+ for (i = j = 0; i < size; i++, j++) {
+ struct audit_tree *s;
+ if (&chunk->owners[j] == p) {
+ list_del_init(&p->list);
+ i--;
+ continue;
+ }
+ s = chunk->owners[j].owner;
+ new->owners[i].owner = s;
+ new->owners[i].index = chunk->owners[j].index - j + i;
+ if (!s) /* result of earlier fallback */
+ continue;
+ get_tree(s);
+ list_replace_init(&chunk->owners[i].list, &new->owners[j].list);
+ }
+
+ list_replace_rcu(&chunk->hash, &new->hash);
+ list_for_each_entry(owner, &new->trees, same_root)
+ owner->root = new;
+ spin_unlock(&hash_lock);
+ inotify_evict_watch(&chunk->watch);
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ put_inotify_watch(&chunk->watch);
+ goto out;
+
+Fallback:
+ // do the best we can
+ spin_lock(&hash_lock);
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+ unpin_inotify_watch(&chunk->watch);
+ spin_lock(&hash_lock);
+}
+
+static int create_chunk(struct inode *inode, struct audit_tree *tree)
+{
+ struct audit_chunk *chunk = alloc_chunk(1);
+ if (!chunk)
+ return -ENOMEM;
+
+ if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
+ free_chunk(chunk);
+ return -ENOSPC;
+ }
+
+ mutex_lock(&inode->inotify_mutex);
+ spin_lock(&hash_lock);
+ if (tree->goner) {
+ spin_unlock(&hash_lock);
+ chunk->dead = 1;
+ inotify_evict_watch(&chunk->watch);
+ mutex_unlock(&inode->inotify_mutex);
+ put_inotify_watch(&chunk->watch);
+ return 0;
+ }
+ chunk->owners[0].index = (1U << 31);
+ chunk->owners[0].owner = tree;
+ get_tree(tree);
+ list_add(&chunk->owners[0].list, &tree->chunks);
+ if (!tree->root) {
+ tree->root = chunk;
+ list_add(&tree->same_root, &chunk->trees);
+ }
+ insert_hash(chunk);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&inode->inotify_mutex);
+ return 0;
+}
+
+/* the first tagged inode becomes root of tree */
+static int tag_chunk(struct inode *inode, struct audit_tree *tree)
+{
+ struct inotify_watch *watch;
+ struct audit_tree *owner;
+ struct audit_chunk *chunk, *old;
+ struct node *p;
+ int n;
+
+ if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+ return create_chunk(inode, tree);
+
+ old = container_of(watch, struct audit_chunk, watch);
+
+ /* are we already there? */
+ spin_lock(&hash_lock);
+ for (n = 0; n < old->count; n++) {
+ if (old->owners[n].owner == tree) {
+ spin_unlock(&hash_lock);
+ put_inotify_watch(watch);
+ return 0;
+ }
+ }
+ spin_unlock(&hash_lock);
+
+ chunk = alloc_chunk(old->count + 1);
+ if (!chunk)
+ return -ENOMEM;
+
+ mutex_lock(&inode->inotify_mutex);
+ if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
+ mutex_unlock(&inode->inotify_mutex);
+ free_chunk(chunk);
+ return -ENOSPC;
+ }
+ spin_lock(&hash_lock);
+ if (tree->goner) {
+ spin_unlock(&hash_lock);
+ chunk->dead = 1;
+ inotify_evict_watch(&chunk->watch);
+ mutex_unlock(&inode->inotify_mutex);
+ put_inotify_watch(&chunk->watch);
+ return 0;
+ }
+ list_replace_init(&old->trees, &chunk->trees);
+ for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
+ struct audit_tree *s = old->owners[n].owner;
+ p->owner = s;
+ p->index = old->owners[n].index;
+ if (!s) /* result of fallback in untag */
+ continue;
+ get_tree(s);
+ list_replace_init(&old->owners[n].list, &p->list);
+ }
+ p->index = (chunk->count - 1) | (1U<<31);
+ p->owner = tree;
+ get_tree(tree);
+ list_add(&p->list, &tree->chunks);
+ list_replace_rcu(&old->hash, &chunk->hash);
+ list_for_each_entry(owner, &chunk->trees, same_root)
+ owner->root = chunk;
+ old->dead = 1;
+ if (!tree->root) {
+ tree->root = chunk;
+ list_add(&tree->same_root, &chunk->trees);
+ }
+ spin_unlock(&hash_lock);
+ inotify_evict_watch(&old->watch);
+ mutex_unlock(&inode->inotify_mutex);
+ put_inotify_watch(&old->watch);
+ return 0;
+}
+
+static void kill_rules(struct audit_tree *tree)
+{
+ struct audit_krule *rule, *next;
+ struct audit_entry *entry;
+ struct audit_buffer *ab;
+
+ list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
+ entry = container_of(rule, struct audit_entry, rule);
+
+ list_del_init(&rule->rlist);
+ if (rule->tree) {
+ /* not a half-baked one */
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "op=remove rule dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ if (rule->filterkey) {
+ audit_log_format(ab, " key=");
+ audit_log_untrustedstring(ab, rule->filterkey);
+ } else
+ audit_log_format(ab, " key=(null)");
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+ rule->tree = NULL;
+ list_del_rcu(&entry->list);
+ call_rcu(&entry->rcu, audit_free_rule_rcu);
+ }
+ }
+}
+
+/*
+ * finish killing struct audit_tree
+ */
+static void prune_one(struct audit_tree *victim)
+{
+ spin_lock(&hash_lock);
+ while (!list_empty(&victim->chunks)) {
+ struct node *p;
+
+ p = list_entry(victim->chunks.next, struct node, list);
+
+ untag_chunk(p);
+ }
+ spin_unlock(&hash_lock);
+ put_tree(victim);
+}
+
+/* trim the uncommitted chunks from tree */
+
+static void trim_marked(struct audit_tree *tree)
+{
+ struct list_head *p, *q;
+ spin_lock(&hash_lock);
+ if (tree->goner) {
+ spin_unlock(&hash_lock);
+ return;
+ }
+ /* reorder */
+ for (p = tree->chunks.next; p != &tree->chunks; p = q) {
+ struct node *node = list_entry(p, struct node, list);
+ q = p->next;
+ if (node->index & (1U<<31)) {
+ list_del_init(p);
+ list_add(p, &tree->chunks);
+ }
+ }
+
+ while (!list_empty(&tree->chunks)) {
+ struct node *node;
+
+ node = list_entry(tree->chunks.next, struct node, list);
+
+ /* have we run out of marked? */
+ if (!(node->index & (1U<<31)))
+ break;
+
+ untag_chunk(node);
+ }
+ if (!tree->root && !tree->goner) {
+ tree->goner = 1;
+ spin_unlock(&hash_lock);
+ mutex_lock(&audit_filter_mutex);
+ kill_rules(tree);
+ list_del_init(&tree->list);
+ mutex_unlock(&audit_filter_mutex);
+ prune_one(tree);
+ } else {
+ spin_unlock(&hash_lock);
+ }
+}
+
+/* called with audit_filter_mutex */
+int audit_remove_tree_rule(struct audit_krule *rule)
+{
+ struct audit_tree *tree;
+ tree = rule->tree;
+ if (tree) {
+ spin_lock(&hash_lock);
+ list_del_init(&rule->rlist);
+ if (list_empty(&tree->rules) && !tree->goner) {
+ tree->root = NULL;
+ list_del_init(&tree->same_root);
+ tree->goner = 1;
+ list_move(&tree->list, &prune_list);
+ rule->tree = NULL;
+ spin_unlock(&hash_lock);
+ audit_schedule_prune();
+ return 1;
+ }
+ rule->tree = NULL;
+ spin_unlock(&hash_lock);
+ return 1;
+ }
+ return 0;
+}
+
+void audit_trim_trees(void)
+{
+ struct list_head cursor;
+
+ mutex_lock(&audit_filter_mutex);
+ list_add(&cursor, &tree_list);
+ while (cursor.next != &tree_list) {
+ struct audit_tree *tree;
+ struct path path;
+ struct vfsmount *root_mnt;
+ struct node *node;
+ struct list_head list;
+ int err;
+
+ tree = container_of(cursor.next, struct audit_tree, list);
+ get_tree(tree);
+ list_del(&cursor);
+ list_add(&cursor, &tree->list);
+ mutex_unlock(&audit_filter_mutex);
+
+ err = kern_path(tree->pathname, 0, &path);
+ if (err)
+ goto skip_it;
+
+ root_mnt = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
+ if (!root_mnt)
+ goto skip_it;
+
+ list_add_tail(&list, &root_mnt->mnt_list);
+ spin_lock(&hash_lock);
+ list_for_each_entry(node, &tree->chunks, list) {
+ struct audit_chunk *chunk = find_chunk(node);
+ struct inode *inode = chunk->watch.inode;
+ struct vfsmount *mnt;
+ node->index |= 1U<<31;
+ list_for_each_entry(mnt, &list, mnt_list) {
+ if (mnt->mnt_root->d_inode == inode) {
+ node->index &= ~(1U<<31);
+ break;
+ }
+ }
+ }
+ spin_unlock(&hash_lock);
+ trim_marked(tree);
+ put_tree(tree);
+ list_del_init(&list);
+ drop_collected_mounts(root_mnt);
+skip_it:
+ mutex_lock(&audit_filter_mutex);
+ }
+ list_del(&cursor);
+ mutex_unlock(&audit_filter_mutex);
+}
+
+static int is_under(struct vfsmount *mnt, struct dentry *dentry,
+ struct path *path)
+{
+ if (mnt != path->mnt) {
+ for (;;) {
+ if (mnt->mnt_parent == mnt)
+ return 0;
+ if (mnt->mnt_parent == path->mnt)
+ break;
+ mnt = mnt->mnt_parent;
+ }
+ dentry = mnt->mnt_mountpoint;
+ }
+ return is_subdir(dentry, path->dentry);
+}
+
+int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
+{
+
+ if (pathname[0] != '/' ||
+ rule->listnr != AUDIT_FILTER_EXIT ||
+ op & ~AUDIT_EQUAL ||
+ rule->inode_f || rule->watch || rule->tree)
+ return -EINVAL;
+ rule->tree = alloc_tree(pathname);
+ if (!rule->tree)
+ return -ENOMEM;
+ return 0;
+}
+
+void audit_put_tree(struct audit_tree *tree)
+{
+ put_tree(tree);
+}
+
+/* called with audit_filter_mutex */
+int audit_add_tree_rule(struct audit_krule *rule)
+{
+ struct audit_tree *seed = rule->tree, *tree;
+ struct path path;
+ struct vfsmount *mnt, *p;
+ struct list_head list;
+ int err;
+
+ list_for_each_entry(tree, &tree_list, list) {
+ if (!strcmp(seed->pathname, tree->pathname)) {
+ put_tree(seed);
+ rule->tree = tree;
+ list_add(&rule->rlist, &tree->rules);
+ return 0;
+ }
+ }
+ tree = seed;
+ list_add(&tree->list, &tree_list);
+ list_add(&rule->rlist, &tree->rules);
+ /* do not set rule->tree yet */
+ mutex_unlock(&audit_filter_mutex);
+
+ err = kern_path(tree->pathname, 0, &path);
+ if (err)
+ goto Err;
+ mnt = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
+ if (!mnt) {
+ err = -ENOMEM;
+ goto Err;
+ }
+ list_add_tail(&list, &mnt->mnt_list);
+
+ get_tree(tree);
+ list_for_each_entry(p, &list, mnt_list) {
+ err = tag_chunk(p->mnt_root->d_inode, tree);
+ if (err)
+ break;
+ }
+
+ list_del(&list);
+ drop_collected_mounts(mnt);
+
+ if (!err) {
+ struct node *node;
+ spin_lock(&hash_lock);
+ list_for_each_entry(node, &tree->chunks, list)
+ node->index &= ~(1U<<31);
+ spin_unlock(&hash_lock);
+ } else {
+ trim_marked(tree);
+ goto Err;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ if (list_empty(&rule->rlist)) {
+ put_tree(tree);
+ return -ENOENT;
+ }
+ rule->tree = tree;
+ put_tree(tree);
+
+ return 0;
+Err:
+ mutex_lock(&audit_filter_mutex);
+ list_del_init(&tree->list);
+ list_del_init(&tree->rules);
+ put_tree(tree);
+ return err;
+}
+
+int audit_tag_tree(char *old, char *new)
+{
+ struct list_head cursor, barrier;
+ int failed = 0;
+ struct path path;
+ struct vfsmount *tagged;
+ struct list_head list;
+ struct vfsmount *mnt;
+ struct dentry *dentry;
+ int err;
+
+ err = kern_path(new, 0, &path);
+ if (err)
+ return err;
+ tagged = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
+ if (!tagged)
+ return -ENOMEM;
+
+ err = kern_path(old, 0, &path);
+ if (err) {
+ drop_collected_mounts(tagged);
+ return err;
+ }
+ mnt = mntget(path.mnt);
+ dentry = dget(path.dentry);
+ path_put(&path);
+
+ if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
+ follow_up(&mnt, &dentry);
+
+ list_add_tail(&list, &tagged->mnt_list);
+
+ mutex_lock(&audit_filter_mutex);
+ list_add(&barrier, &tree_list);
+ list_add(&cursor, &barrier);
+
+ while (cursor.next != &tree_list) {
+ struct audit_tree *tree;
+ struct vfsmount *p;
+
+ tree = container_of(cursor.next, struct audit_tree, list);
+ get_tree(tree);
+ list_del(&cursor);
+ list_add(&cursor, &tree->list);
+ mutex_unlock(&audit_filter_mutex);
+
+ err = kern_path(tree->pathname, 0, &path);
+ if (err) {
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ continue;
+ }
+
+ spin_lock(&vfsmount_lock);
+ if (!is_under(mnt, dentry, &path)) {
+ spin_unlock(&vfsmount_lock);
+ path_put(&path);
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ continue;
+ }
+ spin_unlock(&vfsmount_lock);
+ path_put(&path);
+
+ list_for_each_entry(p, &list, mnt_list) {
+ failed = tag_chunk(p->mnt_root->d_inode, tree);
+ if (failed)
+ break;
+ }
+
+ if (failed) {
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ break;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ spin_lock(&hash_lock);
+ if (!tree->goner) {
+ list_del(&tree->list);
+ list_add(&tree->list, &tree_list);
+ }
+ spin_unlock(&hash_lock);
+ put_tree(tree);
+ }
+
+ while (barrier.prev != &tree_list) {
+ struct audit_tree *tree;
+
+ tree = container_of(barrier.prev, struct audit_tree, list);
+ get_tree(tree);
+ list_del(&tree->list);
+ list_add(&tree->list, &barrier);
+ mutex_unlock(&audit_filter_mutex);
+
+ if (!failed) {
+ struct node *node;
+ spin_lock(&hash_lock);
+ list_for_each_entry(node, &tree->chunks, list)
+ node->index &= ~(1U<<31);
+ spin_unlock(&hash_lock);
+ } else {
+ trim_marked(tree);
+ }
+
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ }
+ list_del(&barrier);
+ list_del(&cursor);
+ list_del(&list);
+ mutex_unlock(&audit_filter_mutex);
+ dput(dentry);
+ mntput(mnt);
+ drop_collected_mounts(tagged);
+ return failed;
+}
+
+/*
+ * That gets run when evict_chunk() ends up needing to kill audit_tree.
+ * Runs from a separate thread, with audit_cmd_mutex held.
+ */
+void audit_prune_trees(void)
+{
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(&prune_list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(prune_list.next, struct audit_tree, list);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+}
+
+/*
+ * Here comes the stuff asynchronous to auditctl operations
+ */
+
+/* inode->inotify_mutex is locked */
+static void evict_chunk(struct audit_chunk *chunk)
+{
+ struct audit_tree *owner;
+ int n;
+
+ if (chunk->dead)
+ return;
+
+ chunk->dead = 1;
+ mutex_lock(&audit_filter_mutex);
+ spin_lock(&hash_lock);
+ while (!list_empty(&chunk->trees)) {
+ owner = list_entry(chunk->trees.next,
+ struct audit_tree, same_root);
+ owner->goner = 1;
+ owner->root = NULL;
+ list_del_init(&owner->same_root);
+ spin_unlock(&hash_lock);
+ kill_rules(owner);
+ list_move(&owner->list, &prune_list);
+ audit_schedule_prune();
+ spin_lock(&hash_lock);
+ }
+ list_del_rcu(&chunk->hash);
+ for (n = 0; n < chunk->count; n++)
+ list_del_init(&chunk->owners[n].list);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&audit_filter_mutex);
+}
+
+static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
+ u32 cookie, const char *dname, struct inode *inode)
+{
+ struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+
+ if (mask & IN_IGNORED) {
+ evict_chunk(chunk);
+ put_inotify_watch(watch);
+ }
+}
+
+static void destroy_watch(struct inotify_watch *watch)
+{
+ struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+ call_rcu(&chunk->head, __put_chunk);
+}
+
+static const struct inotify_operations rtree_inotify_ops = {
+ .handle_event = handle_event,
+ .destroy_watch = destroy_watch,
+};
+
+static int __init audit_tree_init(void)
+{
+ int i;
+
+ rtree_ih = inotify_init(&rtree_inotify_ops);
+ if (IS_ERR(rtree_ih))
+ audit_panic("cannot initialize inotify handle for rectree watches");
+
+ for (i = 0; i < HASH_SIZE; i++)
+ INIT_LIST_HEAD(&chunk_hash_heads[i]);
+
+ return 0;
+}
+__initcall(audit_tree_init);
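A brief worked example of the node.index convention that find_chunk() and untag_chunk() rely on; the three-owner chunk is illustrative.

/*
 * Worked example (three-owner chunk):
 *
 *      chunk->owners[0].index == 0     (or 0 | 1U<<31 while marked)
 *      chunk->owners[1].index == 1
 *      chunk->owners[2].index == 2
 *
 * Given p == &chunk->owners[2], find_chunk() recovers the chunk:
 *
 *      index = p->index & ~(1U<<31);   -> 2, mark bit stripped
 *      p -= index;                     -> &chunk->owners[0]
 *      container_of(p, struct audit_chunk, owners[0]);
 *
 * untag_chunk() keeps this invariant when copying owner j of the old chunk
 * into slot i of its replacement via "index - j + i".
 */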
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 0000000..9fd85a4
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,1854 @@
+/* auditfilter.c -- filtering of audit events
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * Synchronizes writes and blocking reads of audit's filterlist
+ * data. Rcu is used to traverse the filterlist and access
+ * contents of structs audit_entry, audit_watch and opaque
+ * LSM rules during filtering. If modified, these structures
+ * must be copied and replace their counterparts in the filterlist.
+ * An audit_parent struct is not accessed during filtering, so may
+ * be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * audit_remove_watch(). Additionally, an audit_watch may exist
+ * temporarily to assist in searching existing filter data. Each
+ * audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+ struct list_head ilist; /* entry in inotify registration list */
+ struct list_head watches; /* associated watches */
+ struct inotify_watch wdata; /* inotify watch data */
+ unsigned flags; /* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID 0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
+struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_filter_list[0]),
+ LIST_HEAD_INIT(audit_filter_list[1]),
+ LIST_HEAD_INIT(audit_filter_list[2]),
+ LIST_HEAD_INIT(audit_filter_list[3]),
+ LIST_HEAD_INIT(audit_filter_list[4]),
+ LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
+#error Fix audit_filter_list initialiser
+#endif
+};
+
+DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+ atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+ if (atomic_dec_and_test(&watch->count)) {
+ WARN_ON(watch->parent);
+ WARN_ON(!list_empty(&watch->rules));
+ kfree(watch->path);
+ kfree(watch);
+ }
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+ list_del(&watch->wlist);
+ put_inotify_watch(&watch->parent->wdata);
+ watch->parent = NULL;
+ audit_put_watch(watch); /* match initial get */
+}
+
+static inline void audit_free_rule(struct audit_entry *e)
+{
+ int i;
+
+ /* some rules don't have associated watches */
+ if (e->rule.watch)
+ audit_put_watch(e->rule.watch);
+ if (e->rule.fields)
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+ kfree(f->lsm_str);
+ security_audit_rule_free(f->lsm_rule);
+ }
+ kfree(e->rule.fields);
+ kfree(e->rule.filterkey);
+ kfree(e);
+}
+
+void audit_free_rule_rcu(struct rcu_head *head)
+{
+ struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+ audit_free_rule(e);
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+ struct audit_parent *parent;
+ s32 wd;
+
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (unlikely(!parent))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&parent->watches);
+ parent->flags = 0;
+
+ inotify_init_watch(&parent->wdata);
+ /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+ get_inotify_watch(&parent->wdata);
+ wd = inotify_add_watch(audit_ih, &parent->wdata,
+ ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
+ if (wd < 0) {
+ audit_free_parent(&parent->wdata);
+ return ERR_PTR(wd);
+ }
+
+ return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+ struct audit_watch *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (unlikely(!watch))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&watch->rules);
+ atomic_set(&watch->count, 1);
+ watch->path = path;
+ watch->dev = (dev_t)-1;
+ watch->ino = (unsigned long)-1;
+
+ return watch;
+}
+
+/* Initialize an audit filterlist entry. */
+static inline struct audit_entry *audit_init_entry(u32 field_count)
+{
+ struct audit_entry *entry;
+ struct audit_field *fields;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (unlikely(!entry))
+ return NULL;
+
+ fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+ if (unlikely(!fields)) {
+ kfree(entry);
+ return NULL;
+ }
+ entry->rule.fields = fields;
+
+ return entry;
+}
+
+/* Unpack a filter field's string representation from user-space
+ * buffer. */
+char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
+{
+ char *str;
+
+ if (!*bufp || (len == 0) || (len > *remain))
+ return ERR_PTR(-EINVAL);
+
+ /* Of the currently implemented string fields, PATH_MAX
+ * defines the longest valid length.
+ */
+ if (len > PATH_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ str = kmalloc(len + 1, GFP_KERNEL);
+ if (unlikely(!str))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(str, *bufp, len);
+ str[len] = 0;
+ *bufp += len;
+ *remain -= len;
+
+ return str;
+}
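/*
 * For example, the AUDIT_MAKE_EQUIV handler in audit.c consumes a payload
 * laid out as
 *
 *      u32 sizes[2];           lengths of the two path strings
 *      char old[sizes[0]];     not NUL-terminated on the wire
 *      char new[sizes[1]];
 *
 * and calls this helper twice; each call advances *bufp by len, shrinks
 * *remain, and returns a kmalloc'ed, NUL-terminated copy of the string.
 */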
+
+/* Translate an inode field to kernel representation. */
+static inline int audit_to_inode(struct audit_krule *krule,
+ struct audit_field *f)
+{
+ if (krule->listnr != AUDIT_FILTER_EXIT ||
+ krule->watch || krule->inode_f || krule->tree)
+ return -EINVAL;
+
+ krule->inode_f = f;
+ return 0;
+}
+
+/* Translate a watch string to kernel representation. */
+static int audit_to_watch(struct audit_krule *krule, char *path, int len,
+ u32 op)
+{
+ struct audit_watch *watch;
+
+ if (!audit_ih)
+ return -EOPNOTSUPP;
+
+ if (path[0] != '/' || path[len-1] == '/' ||
+ krule->listnr != AUDIT_FILTER_EXIT ||
+ op & ~AUDIT_EQUAL ||
+ krule->inode_f || krule->watch || krule->tree)
+ return -EINVAL;
+
+ watch = audit_init_watch(path);
+ if (IS_ERR(watch))
+ return PTR_ERR(watch);
+
+ audit_get_watch(watch);
+ krule->watch = watch;
+
+ return 0;
+}
+
+static __u32 *classes[AUDIT_SYSCALL_CLASSES];
+
+int __init audit_register_class(int class, unsigned *list)
+{
+ __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ while (*list != ~0U) {
+ unsigned n = *list++;
+ if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
+ kfree(p);
+ return -EINVAL;
+ }
+ p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
+ }
+ if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
+ kfree(p);
+ return -EINVAL;
+ }
+ classes[class] = p;
+ return 0;
+}
+
+int audit_match_class(int class, unsigned syscall)
+{
+ if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
+ return 0;
+ if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
+ return 0;
+ return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+static inline int audit_match_class_bits(int class, u32 *mask)
+{
+ int i;
+
+ if (classes[class]) {
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (mask[i] & classes[class][i])
+ return 0;
+ }
+ return 1;
+}
+
+static int audit_match_signal(struct audit_entry *entry)
+{
+ struct audit_field *arch = entry->rule.arch_f;
+
+ if (!arch) {
+ /* When arch is unspecified, we must check both masks on biarch
+ * as syscall number alone is ambiguous. */
+ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
+ entry->rule.mask) &&
+ audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
+ entry->rule.mask));
+ }
+
+ switch(audit_classify_arch(arch->val)) {
+ case 0: /* native */
+ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
+ entry->rule.mask));
+ case 1: /* 32bit on biarch */
+ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
+ entry->rule.mask));
+ default:
+ return 1;
+ }
+}
+#endif
+
+/* Common user-space to kernel rule translation. */
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+{
+ unsigned listnr;
+ struct audit_entry *entry;
+ int i, err;
+
+ err = -EINVAL;
+ listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
+ switch(listnr) {
+ default:
+ goto exit_err;
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+#ifdef CONFIG_AUDITSYSCALL
+ case AUDIT_FILTER_ENTRY:
+ case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_TASK:
+#endif
+ ;
+ }
+ if (unlikely(rule->action == AUDIT_POSSIBLE)) {
+ printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ goto exit_err;
+ }
+ if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
+ goto exit_err;
+ if (rule->field_count > AUDIT_MAX_FIELDS)
+ goto exit_err;
+
+ err = -ENOMEM;
+ entry = audit_init_entry(rule->field_count);
+ if (!entry)
+ goto exit_err;
+
+ entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
+ entry->rule.listnr = listnr;
+ entry->rule.action = rule->action;
+ entry->rule.field_count = rule->field_count;
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ entry->rule.mask[i] = rule->mask[i];
+
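+ /* The top AUDIT_SYSCALL_CLASSES bits of the mask select syscall
+ * classes; expand each selected class into the individual syscall
+ * bits registered via audit_register_class(). */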
+ for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
+ int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
+ __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
+ __u32 *class;
+
+ if (!(*p & AUDIT_BIT(bit)))
+ continue;
+ *p &= ~AUDIT_BIT(bit);
+ class = classes[i];
+ if (class) {
+ int j;
+ for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
+ entry->rule.mask[j] |= class[j];
+ }
+ }
+
+ return entry;
+
+exit_err:
+ return ERR_PTR(err);
+}
+
+/* Translate struct audit_rule to kernel's rule representation.
+ * Exists for backward compatibility with userspace. */
+static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+{
+ struct audit_entry *entry;
+ struct audit_field *ino_f;
+ int err = 0;
+ int i;
+
+ entry = audit_to_entry_common(rule);
+ if (IS_ERR(entry))
+ goto exit_nofree;
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &entry->rule.fields[i];
+
+ f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
+ f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
+ f->val = rule->values[i];
+
+ err = -EINVAL;
+ switch(f->type) {
+ default:
+ goto exit_free;
+ case AUDIT_PID:
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_LOGINUID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ /* bit ops are only useful on syscall args */
+ if (f->op == AUDIT_BIT_MASK ||
+ f->op == AUDIT_BIT_TEST) {
+ err = -EINVAL;
+ goto exit_free;
+ }
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ break;
+ /* arch is only allowed to be = or != */
+ case AUDIT_ARCH:
+ if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL)
+ && (f->op != AUDIT_NEGATE) && (f->op)) {
+ err = -EINVAL;
+ goto exit_free;
+ }
+ entry->rule.arch_f = f;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ goto exit_free;
+ break;
+ case AUDIT_FILETYPE:
+ if ((f->val & ~S_IFMT) > S_IFMT)
+ goto exit_free;
+ break;
+ case AUDIT_INODE:
+ err = audit_to_inode(&entry->rule, f);
+ if (err)
+ goto exit_free;
+ break;
+ }
+
+ entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
+
+ /* Support for legacy operators where
+ * AUDIT_NEGATE bit signifies != and otherwise assumes == */
+ if (f->op & AUDIT_NEGATE)
+ f->op = AUDIT_NOT_EQUAL;
+ else if (!f->op)
+ f->op = AUDIT_EQUAL;
+ else if (f->op == AUDIT_OPERATORS) {
+ err = -EINVAL;
+ goto exit_free;
+ }
+ }
+
+ ino_f = entry->rule.inode_f;
+ if (ino_f) {
+ switch(ino_f->op) {
+ case AUDIT_NOT_EQUAL:
+ entry->rule.inode_f = NULL;
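+ /* fall through */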
+ case AUDIT_EQUAL:
+ break;
+ default:
+ err = -EINVAL;
+ goto exit_free;
+ }
+ }
+
+exit_nofree:
+ return entry;
+
+exit_free:
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+}
+
+/* Translate struct audit_rule_data to kernel's rule representation. */
+static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
+ size_t datasz)
+{
+ int err = 0;
+ struct audit_entry *entry;
+ struct audit_field *ino_f;
+ void *bufp;
+ size_t remain = datasz - sizeof(struct audit_rule_data);
+ int i;
+ char *str;
+
+ entry = audit_to_entry_common((struct audit_rule *)data);
+ if (IS_ERR(entry))
+ goto exit_nofree;
+
+ bufp = data->buf;
+ entry->rule.vers_ops = 2;
+ for (i = 0; i < data->field_count; i++) {
+ struct audit_field *f = &entry->rule.fields[i];
+
+ err = -EINVAL;
+ if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
+ data->fieldflags[i] & ~AUDIT_OPERATORS)
+ goto exit_free;
+
+ f->op = data->fieldflags[i] & AUDIT_OPERATORS;
+ f->type = data->fields[i];
+ f->val = data->values[i];
+ f->lsm_str = NULL;
+ f->lsm_rule = NULL;
+ switch(f->type) {
+ case AUDIT_PID:
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_LOGINUID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ break;
+ case AUDIT_ARCH:
+ entry->rule.arch_f = f;
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = security_audit_rule_init(f->type, f->op, str,
+ (void **)&f->lsm_rule);
+ /* Keep currently invalid fields around in case they
+ * become valid after a policy reload. */
+ if (err == -EINVAL) {
+ printk(KERN_WARNING "audit rule for LSM "
+ "\'%s\' is invalid\n", str);
+ err = 0;
+ }
+ if (err) {
+ kfree(str);
+ goto exit_free;
+ } else
+ f->lsm_str = str;
+ break;
+ case AUDIT_WATCH:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ if (err) {
+ kfree(str);
+ goto exit_free;
+ }
+ break;
+ case AUDIT_DIR:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = audit_make_tree(&entry->rule, str, f->op);
+ kfree(str);
+ if (err)
+ goto exit_free;
+ break;
+ case AUDIT_INODE:
+ err = audit_to_inode(&entry->rule, f);
+ if (err)
+ goto exit_free;
+ break;
+ case AUDIT_FILTERKEY:
+ err = -EINVAL;
+ if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+ entry->rule.filterkey = str;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ goto exit_free;
+ break;
+ case AUDIT_FILETYPE:
+ if ((f->val & ~S_IFMT) > S_IFMT)
+ goto exit_free;
+ break;
+ default:
+ goto exit_free;
+ }
+ }
+
+ ino_f = entry->rule.inode_f;
+ if (ino_f) {
+ switch(ino_f->op) {
+ case AUDIT_NOT_EQUAL:
+ entry->rule.inode_f = NULL;
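+ /* fall through */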
+ case AUDIT_EQUAL:
+ break;
+ default:
+ err = -EINVAL;
+ goto exit_free;
+ }
+ }
+
+exit_nofree:
+ return entry;
+
+exit_free:
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+}
+
+/* Pack a filter field's string representation into data block. */
+static inline size_t audit_pack_string(void **bufp, const char *str)
+{
+ size_t len = strlen(str);
+
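+ /* No bounds check is needed here: the caller sized the destination
+ * buffer using krule->buflen, which already accounts for every string
+ * field in the rule. */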
+ memcpy(*bufp, str, len);
+ *bufp += len;
+
+ return len;
+}
+
+/* Translate kernel rule representation to struct audit_rule.
+ * Exists for backward compatibility with userspace. */
+static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
+{
+ struct audit_rule *rule;
+ int i;
+
+ rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+ if (unlikely(!rule))
+ return NULL;
+
+ rule->flags = krule->flags | krule->listnr;
+ rule->action = krule->action;
+ rule->field_count = krule->field_count;
+ for (i = 0; i < rule->field_count; i++) {
+ rule->values[i] = krule->fields[i].val;
+ rule->fields[i] = krule->fields[i].type;
+
+ if (krule->vers_ops == 1) {
+ if (krule->fields[i].op & AUDIT_NOT_EQUAL)
+ rule->fields[i] |= AUDIT_NEGATE;
+ } else {
+ rule->fields[i] |= krule->fields[i].op;
+ }
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
+
+ return rule;
+}
+
+/* Translate kernel rule representation to struct audit_rule_data. */
+static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
+{
+ struct audit_rule_data *data;
+ void *bufp;
+ int i;
+
+ data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ if (unlikely(!data))
+ return NULL;
+ memset(data, 0, sizeof(*data));
+
+ data->flags = krule->flags | krule->listnr;
+ data->action = krule->action;
+ data->field_count = krule->field_count;
+ bufp = data->buf;
+ for (i = 0; i < data->field_count; i++) {
+ struct audit_field *f = &krule->fields[i];
+
+ data->fields[i] = f->type;
+ data->fieldflags[i] = f->op;
+ switch(f->type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, f->lsm_str);
+ break;
+ case AUDIT_WATCH:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, krule->watch->path);
+ break;
+ case AUDIT_DIR:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp,
+ audit_tree_path(krule->tree));
+ break;
+ case AUDIT_FILTERKEY:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, krule->filterkey);
+ break;
+ default:
+ data->values[i] = f->val;
+ }
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+
+ return data;
+}
+
+/* Compare two rules in kernel format. Returns 0 when the rules match
+ * and non-zero when they differ. */
+static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
+{
+ int i;
+
+ if (a->flags != b->flags ||
+ a->listnr != b->listnr ||
+ a->action != b->action ||
+ a->field_count != b->field_count)
+ return 1;
+
+ for (i = 0; i < a->field_count; i++) {
+ if (a->fields[i].type != b->fields[i].type ||
+ a->fields[i].op != b->fields[i].op)
+ return 1;
+
+ switch(a->fields[i].type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
+ return 1;
+ break;
+ case AUDIT_WATCH:
+ if (strcmp(a->watch->path, b->watch->path))
+ return 1;
+ break;
+ case AUDIT_DIR:
+ if (strcmp(audit_tree_path(a->tree),
+ audit_tree_path(b->tree)))
+ return 1;
+ break;
+ case AUDIT_FILTERKEY:
+ /* both filterkeys exist based on above type compare */
+ if (strcmp(a->filterkey, b->filterkey))
+ return 1;
+ break;
+ default:
+ if (a->fields[i].val != b->fields[i].val)
+ return 1;
+ }
+ }
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (a->mask[i] != b->mask[i])
+ return 1;
+
+ return 0;
+}
+
+/* Duplicate the given audit watch. The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+ char *path;
+ struct audit_watch *new;
+
+ path = kstrdup(old->path, GFP_KERNEL);
+ if (unlikely(!path))
+ return ERR_PTR(-ENOMEM);
+
+ new = audit_init_watch(path);
+ if (IS_ERR(new)) {
+ kfree(path);
+ goto out;
+ }
+
+ new->dev = old->dev;
+ new->ino = old->ino;
+ get_inotify_watch(&old->parent->wdata);
+ new->parent = old->parent;
+
+out:
+ return new;
+}
+
+/* Duplicate LSM field information. The lsm_rule is opaque, so must be
+ * re-initialized. */
+static inline int audit_dupe_lsm_field(struct audit_field *df,
+ struct audit_field *sf)
+{
+ int ret = 0;
+ char *lsm_str;
+
+ /* our own copy of lsm_str */
+ lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
+ if (unlikely(!lsm_str))
+ return -ENOMEM;
+ df->lsm_str = lsm_str;
+
+ /* our own (refreshed) copy of lsm_rule */
+ ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
+ (void **)&df->lsm_rule);
+ /* Keep currently invalid fields around in case they
+ * become valid after a policy reload. */
+ if (ret == -EINVAL) {
+ printk(KERN_WARNING "audit rule for LSM \'%s\' is "
+ "invalid\n", df->lsm_str);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/* Duplicate an audit rule. This will be a deep copy with the exception
+ * of the watch - that pointer is carried over. The LSM specific fields
+ * will be updated in the copy. The point is to be able to replace the old
+ * rule with the new rule in the filterlist, then free the old rule.
+ * The rlist element is undefined; list manipulations are handled apart from
+ * the initial copy. */
+static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch *watch)
+{
+ u32 fcount = old->field_count;
+ struct audit_entry *entry;
+ struct audit_krule *new;
+ char *fk;
+ int i, err = 0;
+
+ entry = audit_init_entry(fcount);
+ if (unlikely(!entry))
+ return ERR_PTR(-ENOMEM);
+
+ new = &entry->rule;
+ new->vers_ops = old->vers_ops;
+ new->flags = old->flags;
+ new->listnr = old->listnr;
+ new->action = old->action;
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ new->mask[i] = old->mask[i];
+ new->buflen = old->buflen;
+ new->inode_f = old->inode_f;
+ new->watch = NULL;
+ new->field_count = old->field_count;
+ /*
+ * note that we are OK with not refcounting here; audit_match_tree()
+ * never dereferences tree and we can't get false positives there
+ * since we'd have to have rule gone from the list *and* removed
+ * before the chunks found by lookup had been allocated, i.e. before
+ * the beginning of list scan.
+ */
+ new->tree = old->tree;
+ memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
+
+ /* deep copy this information, updating the lsm_rule fields, because
+ * the originals will all be freed when the old rule is freed. */
+ for (i = 0; i < fcount; i++) {
+ switch (new->fields[i].type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ err = audit_dupe_lsm_field(&new->fields[i],
+ &old->fields[i]);
+ break;
+ case AUDIT_FILTERKEY:
+ fk = kstrdup(old->filterkey, GFP_KERNEL);
+ if (unlikely(!fk))
+ err = -ENOMEM;
+ else
+ new->filterkey = fk;
+ }
+ if (err) {
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+ }
+ }
+
+ if (watch) {
+ audit_get_watch(watch);
+ new->watch = watch;
+ }
+
+ return entry;
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+ const char *dname, dev_t dev,
+ unsigned long ino, unsigned invalidating)
+{
+ struct audit_watch *owatch, *nwatch, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *oentry, *nentry;
+
+ mutex_lock(&audit_filter_mutex);
+ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+ if (audit_compare_dname_path(dname, owatch->path, NULL))
+ continue;
+
+ /* If the update involves invalidating rules, do the inode-based
+ * filtering now, so we don't omit records. */
+ if (invalidating && current->audit_context &&
+ audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
+ audit_set_auditable(current->audit_context);
+
+ nwatch = audit_dupe_watch(owatch);
+ if (IS_ERR(nwatch)) {
+ mutex_unlock(&audit_filter_mutex);
+ audit_panic("error updating watch, skipping");
+ return;
+ }
+ nwatch->dev = dev;
+ nwatch->ino = ino;
+
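+ /* Re-create each rule that referenced the old watch so that it points
+ * at the new one; the old entries are unlinked and freed via RCU. */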
+ list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+ oentry = container_of(r, struct audit_entry, rule);
+ list_del(&oentry->rule.rlist);
+ list_del_rcu(&oentry->list);
+
+ nentry = audit_dupe_rule(&oentry->rule, nwatch);
+ if (IS_ERR(nentry))
+ audit_panic("error updating watch, removing");
+ else {
+ int h = audit_hash_ino((u32)ino);
+ list_add(&nentry->rule.rlist, &nwatch->rules);
+ list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+ }
+
+ call_rcu(&oentry->rcu, audit_free_rule_rcu);
+ }
+
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_KERNEL,
+ AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "auid=%u ses=%u",
+ audit_get_loginuid(current),
+ audit_get_sessionid(current));
+ audit_log_format(ab,
+ " op=updated rules specifying path=");
+ audit_log_untrustedstring(ab, owatch->path);
+ audit_log_format(ab, " with dev=%u ino=%lu\n",
+ dev, ino);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
+ }
+ audit_remove_watch(owatch);
+ goto add_watch_to_parent; /* event applies to a single watch */
+ }
+ mutex_unlock(&audit_filter_mutex);
+ return;
+
+add_watch_to_parent:
+ list_add(&nwatch->wlist, &parent->watches);
+ mutex_unlock(&audit_filter_mutex);
+ return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+ struct audit_watch *w, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *e;
+
+ mutex_lock(&audit_filter_mutex);
+ parent->flags |= AUDIT_PARENT_INVALID;
+ list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+ list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+ e = container_of(r, struct audit_entry, rule);
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_KERNEL,
+ AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "auid=%u ses=%u",
+ audit_get_loginuid(current),
+ audit_get_sessionid(current));
+ audit_log_format(ab, " op=remove rule path=");
+ audit_log_untrustedstring(ab, w->path);
+ if (r->filterkey) {
+ audit_log_format(ab, " key=");
+ audit_log_untrustedstring(ab,
+ r->filterkey);
+ } else
+ audit_log_format(ab, " key=(null)");
+ audit_log_format(ab, " list=%d res=1",
+ r->listnr);
+ audit_log_end(ab);
+ }
+ list_del(&r->rlist);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+ }
+ audit_remove_watch(w);
+ }
+ mutex_unlock(&audit_filter_mutex);
+}
+
+/* Unregister inotify watches for parents on in_list.
+ * Generates an IN_IGNORED event. */
+static void audit_inotify_unregister(struct list_head *in_list)
+{
+ struct audit_parent *p, *n;
+
+ list_for_each_entry_safe(p, n, in_list, ilist) {
+ list_del(&p->ilist);
+ inotify_rm_watch(audit_ih, &p->wdata);
+ /* the unpin matching the pin in audit_do_del_rule() */
+ unpin_inotify_watch(&p->wdata);
+ }
+}
+
+/* Find an existing audit rule.
+ * Caller must hold audit_filter_mutex to prevent stale rule data. */
+static struct audit_entry *audit_find_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ struct audit_entry *e, *found = NULL;
+ int h;
+
+ if (entry->rule.watch) {
+ /* we don't know the inode number, so must walk entire hash */
+ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
+ list = &audit_inode_hash[h];
+ list_for_each_entry(e, list, list)
+ if (!audit_compare_rule(&entry->rule, &e->rule)) {
+ found = e;
+ goto out;
+ }
+ }
+ goto out;
+ }
+
+ list_for_each_entry(e, list, list)
+ if (!audit_compare_rule(&entry->rule, &e->rule)) {
+ found = e;
+ goto out;
+ }
+
+out:
+ return found;
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(char *path, struct nameidata **ndp,
+ struct nameidata **ndw)
+{
+ struct nameidata *ndparent, *ndwatch;
+ int err;
+
+ ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
+ if (unlikely(!ndparent))
+ return -ENOMEM;
+
+ ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
+ if (unlikely(!ndwatch)) {
+ kfree(ndparent);
+ return -ENOMEM;
+ }
+
+ err = path_lookup(path, LOOKUP_PARENT, ndparent);
+ if (err) {
+ kfree(ndparent);
+ kfree(ndwatch);
+ return err;
+ }
+
+ err = path_lookup(path, 0, ndwatch);
+ if (err) {
+ kfree(ndwatch);
+ ndwatch = NULL;
+ }
+
+ *ndp = ndparent;
+ *ndw = ndwatch;
+
+ return 0;
+}
+
+/* Release resources used for watch path information. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+ if (ndp) {
+ path_put(&ndp->path);
+ kfree(ndp);
+ }
+ if (ndw) {
+ path_put(&ndw->path);
+ kfree(ndw);
+ }
+}
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+ struct audit_parent *parent)
+{
+ struct audit_watch *w, *watch = krule->watch;
+ int watch_found = 0;
+
+ list_for_each_entry(w, &parent->watches, wlist) {
+ if (strcmp(watch->path, w->path))
+ continue;
+
+ watch_found = 1;
+
+ /* put krule's and initial refs to temporary watch */
+ audit_put_watch(watch);
+ audit_put_watch(watch);
+
+ audit_get_watch(w);
+ krule->watch = watch = w;
+ break;
+ }
+
+ if (!watch_found) {
+ get_inotify_watch(&parent->wdata);
+ watch->parent = parent;
+
+ list_add(&watch->wlist, &parent->watches);
+ }
+ list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
+ struct nameidata *ndw)
+{
+ struct audit_watch *watch = krule->watch;
+ struct inotify_watch *i_watch;
+ struct audit_parent *parent;
+ int ret = 0;
+
+ /* update watch filter fields */
+ if (ndw) {
+ watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+ watch->ino = ndw->path.dentry->d_inode->i_ino;
+ }
+
+ /* The audit_filter_mutex must not be held during inotify calls because
+ * we hold it during inotify event callback processing. If an existing
+ * inotify watch is found, inotify_find_watch() grabs a reference before
+ * returning.
+ */
+ mutex_unlock(&audit_filter_mutex);
+
+ if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+ &i_watch) < 0) {
+ parent = audit_init_parent(ndp);
+ if (IS_ERR(parent)) {
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+ return PTR_ERR(parent);
+ }
+ } else
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ mutex_lock(&audit_filter_mutex);
+
+ /* parent was moved before we took audit_filter_mutex */
+ if (parent->flags & AUDIT_PARENT_INVALID)
+ ret = -ENOENT;
+ else
+ audit_add_to_parent(krule, parent);
+
+ /* match get in audit_init_parent or inotify_find_watch */
+ put_inotify_watch(&parent->wdata);
+ return ret;
+}
+
+/* Add rule to given filterlist if not a duplicate. */
+static inline int audit_add_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+ struct audit_field *inode_f = entry->rule.inode_f;
+ struct audit_watch *watch = entry->rule.watch;
+ struct audit_tree *tree = entry->rule.tree;
+ struct nameidata *ndp = NULL, *ndw = NULL;
+ int h, err;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
+
+ if (inode_f) {
+ h = audit_hash_ino(inode_f->val);
+ list = &audit_inode_hash[h];
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ e = audit_find_rule(entry, list);
+ mutex_unlock(&audit_filter_mutex);
+ if (e) {
+ err = -EEXIST;
+ /* normally audit_add_tree_rule() will free it on failure */
+ if (tree)
+ audit_put_tree(tree);
+ goto error;
+ }
+
+ /* Avoid calling path_lookup under audit_filter_mutex. */
+ if (watch) {
+ err = audit_get_nd(watch->path, &ndp, &ndw);
+ if (err)
+ goto error;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ if (watch) {
+ /* audit_filter_mutex is dropped and re-taken during this call */
+ err = audit_add_watch(&entry->rule, ndp, ndw);
+ if (err) {
+ mutex_unlock(&audit_filter_mutex);
+ goto error;
+ }
+ h = audit_hash_ino((u32)watch->ino);
+ list = &audit_inode_hash[h];
+ }
+ if (tree) {
+ err = audit_add_tree_rule(&entry->rule);
+ if (err) {
+ mutex_unlock(&audit_filter_mutex);
+ goto error;
+ }
+ }
+
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+ list_add_rcu(&entry->list, list);
+ entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
+ } else {
+ list_add_tail_rcu(&entry->list, list);
+ }
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules++;
+
+ if (!audit_match_signal(entry))
+ audit_signals++;
+#endif
+ mutex_unlock(&audit_filter_mutex);
+
+ audit_put_nd(ndp, ndw); /* NULL args OK */
+ return 0;
+
+error:
+ audit_put_nd(ndp, ndw); /* NULL args OK */
+ if (watch)
+ audit_put_watch(watch); /* tmp watch, matches initial get */
+ return err;
+}
+
+/* Remove an existing rule from filterlist. */
+static inline int audit_del_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+ struct audit_field *inode_f = entry->rule.inode_f;
+ struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+ struct audit_tree *tree = entry->rule.tree;
+ LIST_HEAD(inotify_list);
+ int h, ret = 0;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
+
+ if (inode_f) {
+ h = audit_hash_ino(inode_f->val);
+ list = &audit_inode_hash[h];
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ e = audit_find_rule(entry, list);
+ if (!e) {
+ mutex_unlock(&audit_filter_mutex);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ watch = e->rule.watch;
+ if (watch) {
+ struct audit_parent *parent = watch->parent;
+
+ list_del(&e->rule.rlist);
+
+ if (list_empty(&watch->rules)) {
+ audit_remove_watch(watch);
+
+ if (list_empty(&parent->watches)) {
+ /* Put parent on the inotify un-registration
+ * list. Grab a reference before releasing
+ * audit_filter_mutex, to be released in
+ * audit_inotify_unregister().
+ * If filesystem is going away, just leave
+ * the sucker alone, eviction will take
+ * care of it.
+ */
+ if (pin_inotify_watch(&parent->wdata))
+ list_add(&parent->ilist, &inotify_list);
+ }
+ }
+ }
+
+ if (e->rule.tree)
+ audit_remove_tree_rule(&e->rule);
+
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules--;
+
+ if (!audit_match_signal(entry))
+ audit_signals--;
+#endif
+ mutex_unlock(&audit_filter_mutex);
+
+ if (!list_empty(&inotify_list))
+ audit_inotify_unregister(&inotify_list);
+
+out:
+ if (tmp_watch)
+ audit_put_watch(tmp_watch); /* match initial get */
+ if (tree)
+ audit_put_tree(tree); /* that's the temporary one */
+
+ return ret;
+}
+
+/* List rules using struct audit_rule. Exists for backward
+ * compatibility with userspace. */
+static void audit_list(int pid, int seq, struct sk_buff_head *q)
+{
+ struct sk_buff *skb;
+ struct audit_entry *entry;
+ int i;
+
+ /* This is a blocking read, so use audit_filter_mutex instead of rcu
+ * iterator to sync with list writers. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(entry, &audit_filter_list[i], list) {
+ struct audit_rule *rule;
+
+ rule = audit_krule_to_rule(&entry->rule);
+ if (unlikely(!rule))
+ break;
+ skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
+ rule, sizeof(*rule));
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(rule);
+ }
+ }
+ for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
+ list_for_each_entry(entry, &audit_inode_hash[i], list) {
+ struct audit_rule *rule;
+
+ rule = audit_krule_to_rule(&entry->rule);
+ if (unlikely(!rule))
+ break;
+ skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
+ rule, sizeof(*rule));
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(rule);
+ }
+ }
+ skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ if (skb)
+ skb_queue_tail(q, skb);
+}
+
+/* List rules using struct audit_rule_data. */
+static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
+{
+ struct sk_buff *skb;
+ struct audit_entry *e;
+ int i;
+
+ /* This is a blocking read, so use audit_filter_mutex instead of rcu
+ * iterator to sync with list writers. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(e, &audit_filter_list[i], list) {
+ struct audit_rule_data *data;
+
+ data = audit_krule_to_data(&e->rule);
+ if (unlikely(!data))
+ break;
+ skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+ data, sizeof(*data) + data->buflen);
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(data);
+ }
+ }
+ for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
+ list_for_each_entry(e, &audit_inode_hash[i], list) {
+ struct audit_rule_data *data;
+
+ data = audit_krule_to_data(&e->rule);
+ if (unlikely(!data))
+ break;
+ skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+ data, sizeof(*data) + data->buflen);
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(data);
+ }
+ }
+ skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ if (skb)
+ skb_queue_tail(q, skb);
+}
+
+/* Log rule additions and removals */
+static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
+ char *action, struct audit_krule *rule,
+ int res)
+{
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (!ab)
+ return;
+ audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
+ if (sid) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(sid, &ctx, &len))
+ audit_log_format(ab, " ssid=%u", sid);
+ else {
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+ audit_log_format(ab, " op=%s rule key=", action);
+ if (rule->filterkey)
+ audit_log_untrustedstring(ab, rule->filterkey);
+ else
+ audit_log_format(ab, "(null)");
+ audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_receive_filter - apply all rules to the specified message type
+ * @type: audit message type
+ * @pid: target pid for netlink audit messages
+ * @uid: target uid for netlink audit messages
+ * @seq: netlink audit message sequence (serial) number
+ * @data: payload data
+ * @datasz: size of payload data
+ * @loginuid: loginuid of sender
+ * @sessionid: sessionid for netlink audit message
+ * @sid: SE Linux Security ID of sender
+ */
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
+ size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+{
+ struct task_struct *tsk;
+ struct audit_netlink_list *dest;
+ int err = 0;
+ struct audit_entry *entry;
+
+ switch (type) {
+ case AUDIT_LIST:
+ case AUDIT_LIST_RULES:
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest->pid = pid;
+ skb_queue_head_init(&dest->q);
+
+ mutex_lock(&audit_filter_mutex);
+ if (type == AUDIT_LIST)
+ audit_list(pid, seq, &dest->q);
+ else
+ audit_list_rules(pid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+ break;
+ case AUDIT_ADD:
+ case AUDIT_ADD_RULE:
+ if (type == AUDIT_ADD)
+ entry = audit_rule_to_entry(data);
+ else
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ err = audit_add_rule(entry,
+ &audit_filter_list[entry->rule.listnr]);
+ audit_log_rule_change(loginuid, sessionid, sid, "add",
+ &entry->rule, !err);
+
+ if (err)
+ audit_free_rule(entry);
+ break;
+ case AUDIT_DEL:
+ case AUDIT_DEL_RULE:
+ if (type == AUDIT_DEL)
+ entry = audit_rule_to_entry(data);
+ else
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ err = audit_del_rule(entry,
+ &audit_filter_list[entry->rule.listnr]);
+ audit_log_rule_change(loginuid, sessionid, sid, "remove",
+ &entry->rule, !err);
+
+ audit_free_rule(entry);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return err;
+}
+
+int audit_comparator(const u32 left, const u32 op, const u32 right)
+{
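+ /* AUDIT_BIT_MASK matches if any of the requested bits are set,
+ * AUDIT_BIT_TEST only if all of them are. */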
+ switch (op) {
+ case AUDIT_EQUAL:
+ return (left == right);
+ case AUDIT_NOT_EQUAL:
+ return (left != right);
+ case AUDIT_LESS_THAN:
+ return (left < right);
+ case AUDIT_LESS_THAN_OR_EQUAL:
+ return (left <= right);
+ case AUDIT_GREATER_THAN:
+ return (left > right);
+ case AUDIT_GREATER_THAN_OR_EQUAL:
+ return (left >= right);
+ case AUDIT_BIT_MASK:
+ return (left & right);
+ case AUDIT_BIT_TEST:
+ return ((left & right) == right);
+ }
+ BUG();
+ return 0;
+}
+
+/* Compare the given dentry name with the last component of the given path;
+ * a return value of 0 indicates a match. */
+int audit_compare_dname_path(const char *dname, const char *path,
+ int *dirlen)
+{
+ int dlen, plen;
+ const char *p;
+
+ if (!dname || !path)
+ return 1;
+
+ dlen = strlen(dname);
+ plen = strlen(path);
+ if (plen < dlen)
+ return 1;
+
+ /* disregard trailing slashes */
+ p = path + plen - 1;
+ while ((*p == '/') && (p > path))
+ p--;
+
+ /* find last path component */
+ p = p - dlen + 1;
+ if (p < path)
+ return 1;
+ else if (p > path) {
+ if (*--p != '/')
+ return 1;
+ else
+ p++;
+ }
+
+ /* return length of path's directory component */
+ if (dirlen)
+ *dirlen = p - path;
+ return strncmp(p, dname, dlen);
+}
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+ struct audit_krule *rule,
+ enum audit_state *state)
+{
+ int i;
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &rule->fields[i];
+ int result = 0;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ result = audit_comparator(cb->creds.pid, f->op, f->val);
+ break;
+ case AUDIT_UID:
+ result = audit_comparator(cb->creds.uid, f->op, f->val);
+ break;
+ case AUDIT_GID:
+ result = audit_comparator(cb->creds.gid, f->op, f->val);
+ break;
+ case AUDIT_LOGINUID:
+ result = audit_comparator(cb->loginuid, f->op, f->val);
+ break;
+ }
+
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb)
+{
+ enum audit_state state = AUDIT_DISABLED;
+ struct audit_entry *e;
+ int ret = 1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+ if (audit_filter_user_rules(cb, &e->rule, &state)) {
+ if (state == AUDIT_DISABLED)
+ ret = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret; /* Audit by default */
+}
+
+int audit_filter_type(int type)
+{
+ struct audit_entry *e;
+ int result = 0;
+
+ rcu_read_lock();
+ if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+ goto unlock_and_return;
+
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+ list) {
+ int i;
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+ if (f->type == AUDIT_MSGTYPE) {
+ result = audit_comparator(type, f->op, f->val);
+ if (!result)
+ break;
+ }
+ }
+ if (result)
+ goto unlock_and_return;
+ }
+unlock_and_return:
+ rcu_read_unlock();
+ return result;
+}
+
+/* This function will re-initialize the lsm_rule field of all applicable rules.
+ * It will traverse the filter lists searching for rules that contain LSM
+ * specific filter fields. When such a rule is found, it is copied, the
+ * LSM field is re-initialized, and the old rule is replaced with the
+ * updated rule. */
+int audit_update_lsm_rules(void)
+{
+ struct audit_entry *entry, *n, *nentry;
+ struct audit_watch *watch;
+ struct audit_tree *tree;
+ int i, err = 0;
+
+ /* audit_filter_mutex synchronizes the writers */
+ mutex_lock(&audit_filter_mutex);
+
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
+ if (!security_audit_rule_known(&entry->rule))
+ continue;
+
+ watch = entry->rule.watch;
+ tree = entry->rule.tree;
+ nentry = audit_dupe_rule(&entry->rule, watch);
+ if (IS_ERR(nentry)) {
+ /* save the first error encountered for the
+ * return value */
+ if (!err)
+ err = PTR_ERR(nentry);
+ audit_panic("error updating LSM filters");
+ if (watch)
+ list_del(&entry->rule.rlist);
+ list_del_rcu(&entry->list);
+ } else {
+ if (watch) {
+ list_add(&nentry->rule.rlist,
+ &watch->rules);
+ list_del(&entry->rule.rlist);
+ } else if (tree)
+ list_replace_init(&entry->rule.rlist,
+ &nentry->rule.rlist);
+ list_replace_rcu(&entry->list, &nentry->list);
+ }
+ call_rcu(&entry->rcu, audit_free_rule_rcu);
+ }
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+
+ return err;
+}
+
+/* Update watch data in audit rules based on inotify events. */
+void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+ u32 cookie, const char *dname, struct inode *inode)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev,
+ inode->i_ino, 0);
+ else if (mask & (IN_DELETE|IN_MOVED_FROM))
+ audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ /* inotify automatically removes the watch and sends IN_IGNORED */
+ else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+ audit_remove_parent_watches(parent);
+ /* inotify does not remove the watch, so remove it manually */
+ else if (mask & IN_MOVE_SELF) {
+ audit_remove_parent_watches(parent);
+ inotify_remove_watch_locked(audit_ih, i_watch);
+ } else if (mask & IN_IGNORED)
+ put_inotify_watch(i_watch);
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 0000000..2a3f0af
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,2489 @@
+/* auditsc.c -- System-call auditing support
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2005, 2006 IBM Corporation
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
+ * 2006.
+ *
+ * The support of additional filter rules compares (>, <, >=, <=) was
+ * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
+ *
+ * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
+ * filesystem information.
+ *
+ * Subject and object context labeling support added by <danjones@us.ibm.com>
+ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
+ */
+
+#include <linux/init.h>
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/socket.h>
+#include <linux/mqueue.h>
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <linux/netlink.h>
+#include <linux/compiler.h>
+#include <asm/unistd.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/tty.h>
+#include <linux/binfmts.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/inotify.h>
+
+#include "audit.h"
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES 20
+
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
+/* no execve audit message should be longer than this (userspace limits) */
+#define MAX_EXECVE_AUDIT_LEN 7500
+
+/* number of audit rules */
+int audit_n_rules;
+
+/* determines whether we collect data for signals sent */
+int audit_signals;
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+ const char *name;
+ int name_len; /* number of name's characters to log */
+ unsigned name_put; /* call __putname() for this name */
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ uid_t uid;
+ gid_t gid;
+ dev_t rdev;
+ u32 osid;
+};
+
+struct audit_aux_data {
+ struct audit_aux_data *next;
+ int type;
+};
+
+#define AUDIT_AUX_IPCPERM 0
+
+/* Number of target pids per aux struct. */
+#define AUDIT_AUX_PIDS 16
+
+struct audit_aux_data_mq_open {
+ struct audit_aux_data d;
+ int oflag;
+ mode_t mode;
+ struct mq_attr attr;
+};
+
+struct audit_aux_data_mq_sendrecv {
+ struct audit_aux_data d;
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+};
+
+struct audit_aux_data_mq_notify {
+ struct audit_aux_data d;
+ mqd_t mqdes;
+ struct sigevent notification;
+};
+
+struct audit_aux_data_mq_getsetattr {
+ struct audit_aux_data d;
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+};
+
+struct audit_aux_data_ipcctl {
+ struct audit_aux_data d;
+ struct ipc_perm p;
+ unsigned long qbytes;
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+ u32 osid;
+};
+
+struct audit_aux_data_execve {
+ struct audit_aux_data d;
+ int argc;
+ int envc;
+ struct mm_struct *mm;
+};
+
+struct audit_aux_data_socketcall {
+ struct audit_aux_data d;
+ int nargs;
+ unsigned long args[0];
+};
+
+struct audit_aux_data_sockaddr {
+ struct audit_aux_data d;
+ int len;
+ char a[0];
+};
+
+struct audit_aux_data_fd_pair {
+ struct audit_aux_data d;
+ int fd[2];
+};
+
+struct audit_aux_data_pids {
+ struct audit_aux_data d;
+ pid_t target_pid[AUDIT_AUX_PIDS];
+ uid_t target_auid[AUDIT_AUX_PIDS];
+ uid_t target_uid[AUDIT_AUX_PIDS];
+ unsigned int target_sessionid[AUDIT_AUX_PIDS];
+ u32 target_sid[AUDIT_AUX_PIDS];
+ char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
+ int pid_count;
+};
+
+struct audit_tree_refs {
+ struct audit_tree_refs *next;
+ struct audit_chunk *c[31];
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state;
+ unsigned int serial; /* serial number for record */
+ struct timespec ctime; /* time of syscall entry */
+ int major; /* syscall number */
+ unsigned long argv[4]; /* syscall arguments */
+ int return_valid; /* return code is valid */
+ long return_code;/* syscall return code */
+ int auditable; /* 1 if record should be written */
+ int name_count;
+ struct audit_names names[AUDIT_NAMES];
+ char * filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_context *previous; /* For nested syscalls */
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ uid_t uid, euid, suid, fsuid;
+ gid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ uid_t target_auid;
+ uid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ int tree_count;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
+#endif
+};
+
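+/* Map an open() access mode to permission bits: the octal string yields
+ * 4 (read), 2 (write) or 6 (read|write) when indexed by flags & O_ACCMODE. */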
+#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
+static inline int open_arg(int flags, int mask)
+{
+ int n = ACC_MODE(flags);
+ if (flags & (O_TRUNC | O_CREAT))
+ n |= AUDIT_PERM_WRITE;
+ return n & mask;
+}
+
+static int audit_match_perm(struct audit_context *ctx, int mask)
+{
+ unsigned n;
+ if (unlikely(!ctx))
+ return 0;
+ n = ctx->major;
+
+ switch (audit_classify_syscall(ctx->arch, n)) {
+ case 0: /* native */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR, n))
+ return 1;
+ return 0;
+ case 1: /* 32bit on biarch */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR_32, n))
+ return 1;
+ return 0;
+ case 2: /* open */
+ return mask & ACC_MODE(ctx->argv[1]);
+ case 3: /* openat */
+ return mask & ACC_MODE(ctx->argv[2]);
+ case 4: /* socketcall */
+ return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
+ case 5: /* execve */
+ return mask & AUDIT_PERM_EXEC;
+ default:
+ return 0;
+ }
+}
+
+static int audit_match_filetype(struct audit_context *ctx, int which)
+{
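+ /* The rule value combines an audit_names[] index with the expected
+ * file type carried in the S_IFMT bits. */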
+ unsigned index = which & ~S_IFMT;
+ mode_t mode = which & S_IFMT;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (index >= ctx->name_count)
+ return 0;
+ if (ctx->names[index].ino == -1)
+ return 0;
+ if ((ctx->names[index].mode ^ mode) & S_IFMT)
+ return 0;
+ return 1;
+}
+
+/*
+ * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
+ * ->first_trees points to its beginning, ->trees - to the current end of data.
+ * ->tree_count is the number of free entries in array pointed to by ->trees.
+ * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL,
+ * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously,
+ * it's going to remain 1-element for almost any setup) until we free context itself.
+ * References in it _are_ dropped - at the same time we free/drop aux stuff.
+ */
+
+#ifdef CONFIG_AUDIT_TREE
+static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
+{
+ struct audit_tree_refs *p = ctx->trees;
+ int left = ctx->tree_count;
+ if (likely(left)) {
+ p->c[--left] = chunk;
+ ctx->tree_count = left;
+ return 1;
+ }
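+ /* No free slot in the current array (or no array allocated yet);
+ * try the next preallocated array, if any. */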
+ if (!p)
+ return 0;
+ p = p->next;
+ if (p) {
+ p->c[30] = chunk;
+ ctx->trees = p;
+ ctx->tree_count = 30;
+ return 1;
+ }
+ return 0;
+}
+
+static int grow_tree_refs(struct audit_context *ctx)
+{
+ struct audit_tree_refs *p = ctx->trees;
+ ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
+ if (!ctx->trees) {
+ ctx->trees = p;
+ return 0;
+ }
+ if (p)
+ p->next = ctx->trees;
+ else
+ ctx->first_trees = ctx->trees;
+ ctx->tree_count = 31;
+ return 1;
+}
+#endif
+
+static void unroll_tree_refs(struct audit_context *ctx,
+ struct audit_tree_refs *p, int count)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_tree_refs *q;
+ int n;
+ if (!p) {
+ /* we started with empty chain */
+ p = ctx->first_trees;
+ count = 31;
+ /* if the very first allocation has failed, nothing to do */
+ if (!p)
+ return;
+ }
+ n = count;
+ for (q = p; q != ctx->trees; q = q->next, n = 31) {
+ while (n--) {
+ audit_put_chunk(q->c[n]);
+ q->c[n] = NULL;
+ }
+ }
+ while (n-- > ctx->tree_count) {
+ audit_put_chunk(q->c[n]);
+ q->c[n] = NULL;
+ }
+ ctx->trees = p;
+ ctx->tree_count = count;
+#endif
+}
+
+static void free_tree_refs(struct audit_context *ctx)
+{
+ struct audit_tree_refs *p, *q;
+ for (p = ctx->first_trees; p; p = q) {
+ q = p->next;
+ kfree(p);
+ }
+}
+
+static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_tree_refs *p;
+ int n;
+ if (!tree)
+ return 0;
+ /* full ones */
+ for (p = ctx->first_trees; p != ctx->trees; p = p->next) {
+ for (n = 0; n < 31; n++)
+ if (audit_tree_match(p->c[n], tree))
+ return 1;
+ }
+ /* partial */
+ if (p) {
+ for (n = ctx->tree_count; n < 31; n++)
+ if (audit_tree_match(p->c[n], tree))
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/* Determine if any context name data matches a rule's watch data */
+/* Compare a task_struct with an audit_rule. Return 1 on match, 0
+ * otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+ struct audit_krule *rule,
+ struct audit_context *ctx,
+ struct audit_names *name,
+ enum audit_state *state)
+{
+ int i, j, need_sid = 1;
+ u32 sid;
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &rule->fields[i];
+ int result = 0;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ result = audit_comparator(tsk->pid, f->op, f->val);
+ break;
+ case AUDIT_PPID:
+ if (ctx) {
+ if (!ctx->ppid)
+ ctx->ppid = sys_getppid();
+ result = audit_comparator(ctx->ppid, f->op, f->val);
+ }
+ break;
+ case AUDIT_UID:
+ result = audit_comparator(tsk->uid, f->op, f->val);
+ break;
+ case AUDIT_EUID:
+ result = audit_comparator(tsk->euid, f->op, f->val);
+ break;
+ case AUDIT_SUID:
+ result = audit_comparator(tsk->suid, f->op, f->val);
+ break;
+ case AUDIT_FSUID:
+ result = audit_comparator(tsk->fsuid, f->op, f->val);
+ break;
+ case AUDIT_GID:
+ result = audit_comparator(tsk->gid, f->op, f->val);
+ break;
+ case AUDIT_EGID:
+ result = audit_comparator(tsk->egid, f->op, f->val);
+ break;
+ case AUDIT_SGID:
+ result = audit_comparator(tsk->sgid, f->op, f->val);
+ break;
+ case AUDIT_FSGID:
+ result = audit_comparator(tsk->fsgid, f->op, f->val);
+ break;
+ case AUDIT_PERS:
+ result = audit_comparator(tsk->personality, f->op, f->val);
+ break;
+ case AUDIT_ARCH:
+ if (ctx)
+ result = audit_comparator(ctx->arch, f->op, f->val);
+ break;
+
+ case AUDIT_EXIT:
+ if (ctx && ctx->return_valid)
+ result = audit_comparator(ctx->return_code, f->op, f->val);
+ break;
+ case AUDIT_SUCCESS:
+ if (ctx && ctx->return_valid) {
+ if (f->val)
+ result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
+ else
+ result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
+ }
+ break;
+ case AUDIT_DEVMAJOR:
+ if (name)
+ result = audit_comparator(MAJOR(name->dev),
+ f->op, f->val);
+ else if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_DEVMINOR:
+ if (name)
+ result = audit_comparator(MINOR(name->dev),
+ f->op, f->val);
+ else if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_INODE:
+ if (name)
+ result = (name->ino == f->val);
+ else if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_WATCH:
+ if (name && rule->watch->ino != (unsigned long)-1)
+ result = (name->dev == rule->watch->dev &&
+ name->ino == rule->watch->ino);
+ break;
+ case AUDIT_DIR:
+ if (ctx)
+ result = match_tree_refs(ctx, rule->tree);
+ break;
+ case AUDIT_LOGINUID:
+ result = 0;
+ if (ctx)
+ result = audit_comparator(tsk->loginuid, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ /* NOTE: this may return negative values indicating
+ a temporary error. We simply treat this as a
+ match for now to avoid losing information that
+ may be wanted. An error message will also be
+ logged upon error */
+ if (f->lsm_rule) {
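+ /* Fetch the task's secid only once and reuse it for any
+ * further LSM subject fields in this rule. */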
+ if (need_sid) {
+ security_task_getsecid(tsk, &sid);
+ need_sid = 0;
+ }
+ result = security_audit_rule_match(sid, f->type,
+ f->op,
+ f->lsm_rule,
+ ctx);
+ }
+ break;
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
+ also applies here */
+ if (f->lsm_rule) {
+ /* Find files that match */
+ if (name) {
+ result = security_audit_rule_match(
+ name->osid, f->type, f->op,
+ f->lsm_rule, ctx);
+ } else if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (security_audit_rule_match(
+ ctx->names[j].osid,
+ f->type, f->op,
+ f->lsm_rule, ctx)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ /* Find ipc objects that match */
+ if (ctx) {
+ struct audit_aux_data *aux;
+ for (aux = ctx->aux; aux;
+ aux = aux->next) {
+ if (aux->type == AUDIT_IPC) {
+ struct audit_aux_data_ipcctl *axi = (void *)aux;
+ if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ }
+ }
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ if (ctx)
+ result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
+ break;
+ case AUDIT_FILTERKEY:
+ /* ignore this field for filtering */
+ result = 1;
+ break;
+ case AUDIT_PERM:
+ result = audit_match_perm(ctx, f->val);
+ break;
+ case AUDIT_FILETYPE:
+ result = audit_match_filetype(ctx, f->val);
+ break;
+ }
+
+ if (!result)
+ return 0;
+ }
+ if (rule->filterkey && ctx)
+ ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task. Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write an audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall exit time, this filter is called if any audit_names[] have been
+ * collected during syscall processing. We only check rules in sublists at hash
+ * buckets applicable to the inode numbers in audit_names[].
+ * Regarding audit_state, same rules apply as for audit_filter_syscall().
+ */
+enum audit_state audit_filter_inodes(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ int i;
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ for (i = 0; i < ctx->name_count; i++) {
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+ struct audit_names *n = &ctx->names[i];
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+
+ if (list_empty(list))
+ continue;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+void audit_set_auditable(struct audit_context *ctx)
+{
+ ctx->auditable = 1;
+}
+
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+ int return_valid,
+ int return_code)
+{
+ struct audit_context *context = tsk->audit_context;
+
+ if (likely(!context))
+ return NULL;
+ context->return_valid = return_valid;
+
+ /*
+ * we need to fix up the return code in the audit logs if the actual
+ * return codes are later going to be fixed up by the arch specific
+ * signal handlers
+ *
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(return_code <= -ERESTARTSYS) &&
+ (return_code >= -ERESTART_RESTARTBLOCK) &&
+ (return_code != -ENOIOCTLCMD))
+ context->return_code = -EINTR;
+ else
+ context->return_code = return_code;
+
+ if (context->in_syscall && !context->dummy && !context->auditable) {
+ enum audit_state state;
+
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+ if (state == AUDIT_RECORD_CONTEXT) {
+ context->auditable = 1;
+ goto get_context;
+ }
+
+ state = audit_filter_inodes(tsk, context);
+ if (state == AUDIT_RECORD_CONTEXT)
+ context->auditable = 1;
+
+ }
+
+get_context:
+
+ tsk->audit_context = NULL;
+ return context;
+}
+
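+/* Release every name collected via getname() for this context and drop
+ * the reference to the saved working directory. */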
+static inline void audit_free_names(struct audit_context *context)
+{
+ int i;
+
+#if AUDIT_DEBUG == 2
+ if (context->auditable
+ || context->put_count + context->ino_count != context->name_count) {
+ printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d"
+ " ino_count=%d [NOT freeing]\n",
+ __FILE__, __LINE__,
+ context->serial, context->major, context->in_syscall,
+ context->name_count, context->put_count,
+ context->ino_count);
+ for (i = 0; i < context->name_count; i++) {
+ printk(KERN_ERR "names[%d] = %p = %s\n", i,
+ context->names[i].name,
+ context->names[i].name ?: "(null)");
+ }
+ dump_stack();
+ return;
+ }
+#endif
+#if AUDIT_DEBUG
+ context->put_count = 0;
+ context->ino_count = 0;
+#endif
+
+ for (i = 0; i < context->name_count; i++) {
+ if (context->names[i].name && context->names[i].name_put)
+ __putname(context->names[i].name);
+ }
+ context->name_count = 0;
+ path_put(&context->pwd);
+ context->pwd.dentry = NULL;
+ context->pwd.mnt = NULL;
+}
+
+static inline void audit_free_aux(struct audit_context *context)
+{
+ struct audit_aux_data *aux;
+
+ while ((aux = context->aux)) {
+ context->aux = aux->next;
+ kfree(aux);
+ }
+ while ((aux = context->aux_pids)) {
+ context->aux_pids = aux->next;
+ kfree(aux);
+ }
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+ enum audit_state state)
+{
+ memset(context, 0, sizeof(*context));
+ context->state = state;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+ struct audit_context *context;
+
+ if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ return NULL;
+ audit_zero_context(context, state);
+ return context;
+}
+
+/**
+ * audit_alloc - allocate an audit context block for a task
+ * @tsk: task
+ *
+ * Filter on the task information and allocate a per-task audit context
+ * if necessary. Doing so turns on system call auditing for the
+ * specified task. This is called from copy_process, so no lock is
+ * needed.
+ */
+int audit_alloc(struct task_struct *tsk)
+{
+ struct audit_context *context;
+ enum audit_state state;
+
+ if (likely(!audit_ever_enabled))
+ return 0; /* Return if not auditing. */
+
+ state = audit_filter_task(tsk);
+ if (likely(state == AUDIT_DISABLED))
+ return 0;
+
+ if (!(context = audit_alloc_context(state))) {
+ audit_log_lost("out of memory in audit_alloc");
+ return -ENOMEM;
+ }
+
+ tsk->audit_context = context;
+ set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ return 0;
+}
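+
+/*
+ * Illustrative sketch, not part of this file: copy_process() is expected
+ * to call audit_alloc() for the new task roughly like this (the error
+ * label name is an assumption, for illustration only):
+ *
+ * retval = audit_alloc(p);
+ * if (retval)
+ * goto bad_fork_cleanup;
+ */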
+
+static inline void audit_free_context(struct audit_context *context)
+{
+ struct audit_context *previous;
+ int count = 0;
+
+ do {
+ previous = context->previous;
+ if (previous || (count && count < 10)) {
+ ++count;
+ printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+ " freeing multiple contexts (%d)\n",
+ context->serial, context->major,
+ context->name_count, count);
+ }
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ free_tree_refs(context);
+ audit_free_aux(context);
+ kfree(context->filterkey);
+ kfree(context);
+ context = previous;
+ } while (context);
+ if (count >= 10)
+ printk(KERN_ERR "audit: freed %d contexts\n", count);
+}
+
+void audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return;
+}
+
+EXPORT_SYMBOL(audit_log_task_context);
+
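+/* Log the task's command name, the path of its executable (found via the
+ * VM_EXECUTABLE mapping), and its security context. */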
+static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ char name[sizeof(tsk->comm)];
+ struct mm_struct *mm = tsk->mm;
+ struct vm_area_struct *vma;
+
+ /* tsk == current */
+
+ get_task_comm(name, tsk);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ vma = mm->mmap;
+ while (vma) {
+ if ((vma->vm_flags & VM_EXECUTABLE) &&
+ vma->vm_file) {
+ audit_log_d_path(ab, "exe=",
+ &vma->vm_file->f_path);
+ break;
+ }
+ vma = vma->vm_next;
+ }
+ up_read(&mm->mmap_sem);
+ }
+ audit_log_task_context(ab);
+}
+
+static int audit_log_pid_context(struct audit_context *context, pid_t pid,
+ uid_t auid, uid_t uid, unsigned int sessionid,
+ u32 sid, char *comm)
+{
+ struct audit_buffer *ab;
+ char *ctx = NULL;
+ u32 len;
+ int rc = 0;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
+ if (!ab)
+ return rc;
+
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
+ uid, sessionid);
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ audit_log_format(ab, " ocomm=");
+ audit_log_untrustedstring(ab, comm);
+ audit_log_end(ab);
+
+ return rc;
+}
+
+/*
+ * to_send and len_sent accounting are very loose estimates. We aren't
+ * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
+ * within about 500 bytes (next page boundary).
+ *
+ * Why snprintf? An int is up to 12 digits long. If we just assumed when
+ * logging that a[%d]= was going to be 16 characters long we would be wasting
+ * space in every audit message. In one 7500-byte message we can log up to
+ * about 1000 minimum-size arguments. That comes down to about 50% wasted
+ * space if we didn't use snprintf to find out how long arg_num_len really is.
+ */
+static int audit_log_single_execve_arg(struct audit_context *context,
+ struct audit_buffer **ab,
+ int arg_num,
+ size_t *len_sent,
+ const char __user *p,
+ char *buf)
+{
+ char arg_num_len_buf[12];
+ const char __user *tmp_p = p;
+ /* how many digits are in arg_num? 3 is the length of a=\n */
+ size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
+ size_t len, len_left, to_send;
+ size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
+ unsigned int i, has_cntl = 0, too_long = 0;
+ int ret;
+
+ /* strnlen_user includes the null we don't want to send */
+ len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /*
+ * We just created this mm, if we can't find the strings
+ * we just copied into it something is _very_ wrong. Similar
+ * for strings that are too long, we should not have created
+ * any.
+ */
+ if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
+ WARN_ON(1);
+ send_sig(SIGKILL, current, 0);
+ return -1;
+ }
+
+ /* walk the whole argument looking for non-ascii chars */
+ do {
+ if (len_left > MAX_EXECVE_AUDIT_LEN)
+ to_send = MAX_EXECVE_AUDIT_LEN;
+ else
+ to_send = len_left;
+ ret = copy_from_user(buf, tmp_p, to_send);
+ /*
+ * There is no reason for this copy to be short. We just
+ * copied them here, and the mm hasn't been exposed to user-
+ * space yet.
+ */
+ if (ret) {
+ WARN_ON(1);
+ send_sig(SIGKILL, current, 0);
+ return -1;
+ }
+ buf[to_send] = '\0';
+ has_cntl = audit_string_contains_control(buf, to_send);
+ if (has_cntl) {
+ /*
+ * hex messages get logged as 2 bytes, so we can only
+ * send half as much in each message
+ */
+ max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
+ break;
+ }
+ len_left -= to_send;
+ tmp_p += to_send;
+ } while (len_left > 0);
+
+ len_left = len;
+
+ if (len > max_execve_audit_len)
+ too_long = 1;
+
+ /* rewalk the argument actually logging the message */
+ for (i = 0; len_left > 0; i++) {
+ int room_left;
+
+ if (len_left > max_execve_audit_len)
+ to_send = max_execve_audit_len;
+ else
+ to_send = len_left;
+
+ /* do we have space left to send this argument in this ab? */
+ room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
+ if (has_cntl)
+ room_left -= (to_send * 2);
+ else
+ room_left -= to_send;
+ if (room_left < 0) {
+ *len_sent = 0;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ return 0;
+ }
+
+ /*
+ * first record needs to say how long the original string was
+ * so we can be sure nothing was lost.
+ */
+ if ((i == 0) && (too_long))
+ audit_log_format(*ab, "a%d_len=%zu ", arg_num,
+ has_cntl ? 2*len : len);
+
+ /*
+ * normally arguments are small enough to fit and we already
+ * filled buf above when we checked for control characters
+ * so don't bother with another copy_from_user
+ */
+ if (len >= max_execve_audit_len)
+ ret = copy_from_user(buf, p, to_send);
+ else
+ ret = 0;
+ if (ret) {
+ WARN_ON(1);
+ send_sig(SIGKILL, current, 0);
+ return -1;
+ }
+ buf[to_send] = '\0';
+
+ /* actually log it */
+ audit_log_format(*ab, "a%d", arg_num);
+ if (too_long)
+ audit_log_format(*ab, "[%d]", i);
+ audit_log_format(*ab, "=");
+ if (has_cntl)
+ audit_log_n_hex(*ab, buf, to_send);
+ else
+ audit_log_format(*ab, "\"%s\"", buf);
+ audit_log_format(*ab, "\n");
+
+ p += to_send;
+ len_left -= to_send;
+ *len_sent += arg_num_len;
+ if (has_cntl)
+ *len_sent += to_send * 2;
+ else
+ *len_sent += to_send;
+ }
+ /* include the null we didn't log */
+ return len + 1;
+}
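+
+/*
+ * Worked example (assuming MAX_EXECVE_AUDIT_LEN is the 7500 bytes the
+ * comment above refers to): a clean 20000-byte argument a0 is emitted in
+ * three records as a0[0]=, a0[1]= and a0[2]=, each carrying at most 7500
+ * bytes, with a0_len=20000 logged before the first piece. If the argument
+ * contains control characters it is hex-encoded, every byte costs two
+ * characters, and the per-piece limit drops to 3750 bytes.
+ */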
+
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab,
+ struct audit_aux_data_execve *axi)
+{
+ int i;
+ size_t len, len_sent = 0;
+ const char __user *p;
+ char *buf;
+
+ if (axi->mm != current->mm)
+ return; /* execve failed, no additional info */
+
+ p = (const char __user *)axi->mm->arg_start;
+
+ audit_log_format(*ab, "argc=%d ", axi->argc);
+
+ /*
+ * We need some kernel buffer to hold the userspace args. Just
+ * allocate one big one rather than allocating one of the right size
+ * for every single argument inside audit_log_single_execve_arg().
+ * It should be a <8k allocation, so it should be pretty safe.
+ */
+ buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf) {
+ audit_panic("out of memory for argv string\n");
+ return;
+ }
+
+ for (i = 0; i < axi->argc; i++) {
+ len = audit_log_single_execve_arg(context, ab, i,
+ &len_sent, p, buf);
+ if (len <= 0)
+ break;
+ p += len;
+ }
+ kfree(buf);
+}
+
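+/* Write the AUDIT_SYSCALL record for this context, followed by any
+ * auxiliary records (IPC, POSIX MQ, execve, socketcall, sockaddr, fd
+ * pair), OBJ_PID records for signal/ptrace targets, the CWD and PATH
+ * records, and finally an AUDIT_EOE record to mark the end of the event. */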
+static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
+{
+ int i, call_panic = 0;
+ struct audit_buffer *ab;
+ struct audit_aux_data *aux;
+ const char *tty;
+
+ /* tsk == current */
+ context->pid = tsk->pid;
+ if (!context->ppid)
+ context->ppid = sys_getppid();
+ context->uid = tsk->uid;
+ context->gid = tsk->gid;
+ context->euid = tsk->euid;
+ context->suid = tsk->suid;
+ context->fsuid = tsk->fsuid;
+ context->egid = tsk->egid;
+ context->sgid = tsk->sgid;
+ context->fsgid = tsk->fsgid;
+ context->personality = tsk->personality;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return; /* audit_panic has been called */
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
+ context->return_code);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count,
+ context->ppid,
+ context->pid,
+ tsk->loginuid,
+ context->uid,
+ context->gid,
+ context->euid, context->suid, context->fsuid,
+ context->egid, context->sgid, context->fsgid, tty,
+ tsk->sessionid);
+
+
+ audit_log_task_info(ab, tsk);
+ if (context->filterkey) {
+ audit_log_format(ab, " key=");
+ audit_log_untrustedstring(ab, context->filterkey);
+ } else
+ audit_log_format(ab, " key=(null)");
+ audit_log_end(ab);
+
+ for (aux = context->aux; aux; aux = aux->next) {
+
+ ab = audit_log_start(context, GFP_KERNEL, aux->type);
+ if (!ab)
+ continue; /* audit_panic has been called */
+
+ switch (aux->type) {
+ case AUDIT_MQ_OPEN: {
+ struct audit_aux_data_mq_open *axi = (void *)aux;
+ audit_log_format(ab,
+ "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+ "mq_msgsize=%ld mq_curmsgs=%ld",
+ axi->oflag, axi->mode, axi->attr.mq_flags,
+ axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
+ axi->attr.mq_curmsgs);
+ break; }
+
+ case AUDIT_MQ_SENDRECV: {
+ struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
+ audit_log_format(ab,
+ "mqdes=%d msg_len=%zd msg_prio=%u "
+ "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ axi->mqdes, axi->msg_len, axi->msg_prio,
+ axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
+ break; }
+
+ case AUDIT_MQ_NOTIFY: {
+ struct audit_aux_data_mq_notify *axi = (void *)aux;
+ audit_log_format(ab,
+ "mqdes=%d sigev_signo=%d",
+ axi->mqdes,
+ axi->notification.sigev_signo);
+ break; }
+
+ case AUDIT_MQ_GETSETATTR: {
+ struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
+ audit_log_format(ab,
+ "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
+ "mq_curmsgs=%ld ",
+ axi->mqdes,
+ axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
+ axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
+ break; }
+
+ case AUDIT_IPC: {
+ struct audit_aux_data_ipcctl *axi = (void *)aux;
+ audit_log_format(ab,
+ "ouid=%u ogid=%u mode=%#o",
+ axi->uid, axi->gid, axi->mode);
+ if (axi->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ axi->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u",
+ axi->osid);
+ call_panic = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+ break; }
+
+ case AUDIT_IPC_SET_PERM: {
+ struct audit_aux_data_ipcctl *axi = (void *)aux;
+ audit_log_format(ab,
+ "qbytes=%lx ouid=%u ogid=%u mode=%#o",
+ axi->qbytes, axi->uid, axi->gid, axi->mode);
+ break; }
+
+ case AUDIT_EXECVE: {
+ struct audit_aux_data_execve *axi = (void *)aux;
+ audit_log_execve_info(context, &ab, axi);
+ break; }
+
+ case AUDIT_SOCKETCALL: {
+ struct audit_aux_data_socketcall *axs = (void *)aux;
+ audit_log_format(ab, "nargs=%d", axs->nargs);
+ for (i=0; i<axs->nargs; i++)
+ audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
+ break; }
+
+ case AUDIT_SOCKADDR: {
+ struct audit_aux_data_sockaddr *axs = (void *)aux;
+
+ audit_log_format(ab, "saddr=");
+ audit_log_n_hex(ab, axs->a, axs->len);
+ break; }
+
+ case AUDIT_FD_PAIR: {
+ struct audit_aux_data_fd_pair *axs = (void *)aux;
+ audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
+ break; }
+
+ }
+ audit_log_end(ab);
+ }
+
+ for (aux = context->aux_pids; aux; aux = aux->next) {
+ struct audit_aux_data_pids *axs = (void *)aux;
+
+ for (i = 0; i < axs->pid_count; i++)
+ if (audit_log_pid_context(context, axs->target_pid[i],
+ axs->target_auid[i],
+ axs->target_uid[i],
+ axs->target_sessionid[i],
+ axs->target_sid[i],
+ axs->target_comm[i]))
+ call_panic = 1;
+ }
+
+ if (context->target_pid &&
+ audit_log_pid_context(context, context->target_pid,
+ context->target_auid, context->target_uid,
+ context->target_sessionid,
+ context->target_sid, context->target_comm))
+ call_panic = 1;
+
+ if (context->pwd.dentry && context->pwd.mnt) {
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
+ if (ab) {
+ audit_log_d_path(ab, "cwd=", &context->pwd);
+ audit_log_end(ab);
+ }
+ }
+ for (i = 0; i < context->name_count; i++) {
+ struct audit_names *n = &context->names[i];
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ continue; /* audit_panic has been called */
+
+ audit_log_format(ab, "item=%d", i);
+
+ if (n->name) {
+ switch(n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#o"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ n->uid,
+ n->gid,
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ audit_log_end(ab);
+ }
+
+ /* Send an end-of-event record so user space knows we are finished */
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
+ if (ab)
+ audit_log_end(ab);
+ if (call_panic)
+ audit_panic("error converting sid to string");
+}
+
+/**
+ * audit_free - free a per-task audit context
+ * @tsk: task whose audit context block to free
+ *
+ * Called from copy_process and do_exit
+ */
+void audit_free(struct task_struct *tsk)
+{
+ struct audit_context *context;
+
+ context = audit_get_context(tsk, 0, 0);
+ if (likely(!context))
+ return;
+
+ /* Check for system calls that do not go through the exit
+ * function (e.g., exit_group), then free the context block.
+ * We use GFP_ATOMIC here because we might be doing this in
+ * the context of the idle thread; that can happen only if
+ * we are called from do_exit(). */
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, tsk);
+
+ audit_free_context(context);
+}
+
+/**
+ * audit_syscall_entry - fill in an audit record at syscall entry
+ * @arch: architecture type
+ * @major: major syscall type (function)
+ * @a1: additional syscall register 1
+ * @a2: additional syscall register 2
+ * @a3: additional syscall register 3
+ * @a4: additional syscall register 4
+ *
+ * Fill in audit context at syscall entry. This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built. If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written).
+ */
+void audit_syscall_entry(int arch, int major,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4)
+{
+ struct task_struct *tsk = current;
+ struct audit_context *context = tsk->audit_context;
+ enum audit_state state;
+
+ if (unlikely(!context))
+ return;
+
+ /*
+ * This happens only on certain architectures that make system
+ * calls in kernel_thread via the entry.S interface, instead of
+ * with direct calls. (If you are porting to a new
+ * architecture, hitting this condition can indicate that you
+ * got the _exit/_leave calls backward in entry.S.)
+ *
+ * i386 no
+ * x86_64 no
+ * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
+ *
+ * This also happens with vm86 emulation in a non-nested manner
+ * (entries without exits), so this case must be caught.
+ */
+ if (context->in_syscall) {
+ struct audit_context *newctx;
+
+#if AUDIT_DEBUG
+ printk(KERN_ERR
+ "audit(:%d) pid=%d in syscall=%d;"
+ " entering syscall=%d\n",
+ context->serial, tsk->pid, context->major, major);
+#endif
+ newctx = audit_alloc_context(context->state);
+ if (newctx) {
+ newctx->previous = context;
+ context = newctx;
+ tsk->audit_context = newctx;
+ } else {
+ /* If we can't alloc a new context, the best we
+ * can do is to leak memory (any pending putname
+ * will be lost). The only other alternative is
+ * to abandon auditing. */
+ audit_zero_context(context, context->state);
+ }
+ }
+ BUG_ON(context->in_syscall || context->name_count);
+
+ if (!audit_enabled)
+ return;
+
+ context->arch = arch;
+ context->major = major;
+ context->argv[0] = a1;
+ context->argv[1] = a2;
+ context->argv[2] = a3;
+ context->argv[3] = a4;
+
+ state = context->state;
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
+ if (likely(state == AUDIT_DISABLED))
+ return;
+
+ context->serial = 0;
+ context->ctime = CURRENT_TIME;
+ context->in_syscall = 1;
+ context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
+ context->ppid = 0;
+}
+
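+/* Copy the parent's in-progress syscall state into the child's audit
+ * context so that a fork issued in the middle of an audited syscall is
+ * attributed correctly. */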
+void audit_finish_fork(struct task_struct *child)
+{
+ struct audit_context *ctx = current->audit_context;
+ struct audit_context *p = child->audit_context;
+ if (!p || !ctx || !ctx->auditable)
+ return;
+ p->arch = ctx->arch;
+ p->major = ctx->major;
+ memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
+ p->ctime = ctx->ctime;
+ p->dummy = ctx->dummy;
+ p->auditable = ctx->auditable;
+ p->in_syscall = ctx->in_syscall;
+ p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
+ p->ppid = current->pid;
+}
+
+/**
+ * audit_syscall_exit - deallocate audit context after a system call
+ * @valid: success/failure flag
+ * @return_code: syscall return value
+ *
+ * Tear down after system call. If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel wrote an audit
+ * message), then write out the syscall information. In all cases,
+ * free the names stored from getname().
+ */
+void audit_syscall_exit(int valid, long return_code)
+{
+ struct task_struct *tsk = current;
+ struct audit_context *context;
+
+ context = audit_get_context(tsk, valid, return_code);
+
+ if (likely(!context))
+ return;
+
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, tsk);
+
+ context->in_syscall = 0;
+ context->auditable = 0;
+
+ if (context->previous) {
+ struct audit_context *new_context = context->previous;
+ context->previous = NULL;
+ audit_free_context(context);
+ tsk->audit_context = new_context;
+ } else {
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ audit_free_aux(context);
+ context->aux = NULL;
+ context->aux_pids = NULL;
+ context->target_pid = 0;
+ context->target_sid = 0;
+ kfree(context->filterkey);
+ context->filterkey = NULL;
+ tsk->audit_context = context;
+ }
+}
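+
+/*
+ * Illustrative sketch, not part of this file: architecture syscall entry
+ * and exit paths are expected to bracket each syscall roughly as below.
+ * The register names and the AUDIT_ARCH_I386 constant are illustrative
+ * assumptions; see the arch ptrace/entry code in this tree for the real
+ * hook-up.
+ *
+ * if (unlikely(test_thread_flag(TIF_SYSCALL_AUDIT)))
+ * audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
+ * regs->bx, regs->cx, regs->dx, regs->si);
+ * ...
+ * if (unlikely(test_thread_flag(TIF_SYSCALL_AUDIT)))
+ * audit_syscall_exit((long)regs->ax < 0 ?
+ * AUDITSC_FAILURE : AUDITSC_SUCCESS, regs->ax);
+ */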
+
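+/* Grab a reference to the audit tree chunk watching @inode, if any, so
+ * that a matching tree rule can be evaluated at syscall exit. If memory
+ * for the reference cannot be obtained, the context is simply forced
+ * auditable instead. */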
+static inline void handle_one(const struct inode *inode)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_context *context;
+ struct audit_tree_refs *p;
+ struct audit_chunk *chunk;
+ int count;
+ if (likely(list_empty(&inode->inotify_watches)))
+ return;
+ context = current->audit_context;
+ p = context->trees;
+ count = context->tree_count;
+ rcu_read_lock();
+ chunk = audit_tree_lookup(inode);
+ rcu_read_unlock();
+ if (!chunk)
+ return;
+ if (likely(put_tree_ref(context, chunk)))
+ return;
+ if (unlikely(!grow_tree_refs(context))) {
+ printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ audit_set_auditable(context);
+ audit_put_chunk(chunk);
+ unroll_tree_refs(context, p, count);
+ return;
+ }
+ put_tree_ref(context, chunk);
+#endif
+}
+
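+/* Walk from @dentry up to the root, collecting references to any audit
+ * tree chunks attached along the way. The walk is retried if it races
+ * with a rename (detected via rename_lock) and gives up, forcing the
+ * context auditable, only when memory for the references runs out. */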
+static void handle_path(const struct dentry *dentry)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_context *context;
+ struct audit_tree_refs *p;
+ const struct dentry *d, *parent;
+ struct audit_chunk *drop;
+ unsigned long seq;
+ int count;
+
+ context = current->audit_context;
+ p = context->trees;
+ count = context->tree_count;
+retry:
+ drop = NULL;
+ d = dentry;
+ rcu_read_lock();
+ seq = read_seqbegin(&rename_lock);
+ for(;;) {
+ struct inode *inode = d->d_inode;
+ if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+ struct audit_chunk *chunk;
+ chunk = audit_tree_lookup(inode);
+ if (chunk) {
+ if (unlikely(!put_tree_ref(context, chunk))) {
+ drop = chunk;
+ break;
+ }
+ }
+ }
+ parent = d->d_parent;
+ if (parent == d)
+ break;
+ d = parent;
+ }
+ if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */
+ rcu_read_unlock();
+ if (!drop) {
+ /* just a race with rename */
+ unroll_tree_refs(context, p, count);
+ goto retry;
+ }
+ audit_put_chunk(drop);
+ if (grow_tree_refs(context)) {
+ /* OK, got more space */
+ unroll_tree_refs(context, p, count);
+ goto retry;
+ }
+ /* too bad */
+ printk(KERN_WARNING
+ "out of memory, audit has lost a tree reference\n");
+ unroll_tree_refs(context, p, count);
+ audit_set_auditable(context);
+ return;
+ }
+ rcu_read_unlock();
+#endif
+}
+
+/**
+ * audit_getname - add a name to the list
+ * @name: name to add
+ *
+ * Add a name to the list of audit names for this context.
+ * Called from fs/namei.c:getname().
+ */
+void __audit_getname(const char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ if (IS_ERR(name) || !name)
+ return;
+
+ if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+ printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ __FILE__, __LINE__, context->serial, name);
+ dump_stack();
+#endif
+ return;
+ }
+ BUG_ON(context->name_count >= AUDIT_NAMES);
+ context->names[context->name_count].name = name;
+ context->names[context->name_count].name_len = AUDIT_NAME_FULL;
+ context->names[context->name_count].name_put = 1;
+ context->names[context->name_count].ino = (unsigned long)-1;
+ context->names[context->name_count].osid = 0;
+ ++context->name_count;
+ if (!context->pwd.dentry) {
+ read_lock(&current->fs->lock);
+ context->pwd = current->fs->pwd;
+ path_get(&current->fs->pwd);
+ read_unlock(&current->fs->lock);
+ }
+
+}
+
+/* audit_putname - intercept a putname request
+ * @name: name to intercept and delay for putname
+ *
+ * If we have stored the name from getname in the audit context,
+ * then we delay the putname until syscall exit.
+ * Called from include/linux/fs.h:putname().
+ */
+void audit_putname(const char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ BUG_ON(!context);
+ if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+ printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ __FILE__, __LINE__, context->serial, name);
+ if (context->name_count) {
+ int i;
+ for (i = 0; i < context->name_count; i++)
+ printk(KERN_ERR "name[%d] = %p = %s\n", i,
+ context->names[i].name,
+ context->names[i].name ?: "(null)");
+ }
+#endif
+ __putname(name);
+ }
+#if AUDIT_DEBUG
+ else {
+ ++context->put_count;
+ if (context->put_count > context->name_count) {
+ printk(KERN_ERR "%s:%d(:%d): major=%d"
+ " in_syscall=%d putname(%p) name_count=%d"
+ " put_count=%d\n",
+ __FILE__, __LINE__,
+ context->serial, context->major,
+ context->in_syscall, name, context->name_count,
+ context->put_count);
+ dump_stack();
+ }
+ }
+#endif
+}
+
+static int audit_inc_name_count(struct audit_context *context,
+ const struct inode *inode)
+{
+ if (context->name_count >= AUDIT_NAMES) {
+ if (inode)
+ printk(KERN_DEBUG "name_count maxed, losing inode data: "
+ "dev=%02x:%02x, inode=%lu\n",
+ MAJOR(inode->i_sb->s_dev),
+ MINOR(inode->i_sb->s_dev),
+ inode->i_ino);
+
+ else
+ printk(KERN_DEBUG "name_count maxed, losing inode data\n");
+ return 1;
+ }
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+}
+
+/**
+ * audit_inode - store the inode and device from a lookup
+ * @name: name being audited
+ * @dentry: dentry being audited
+ *
+ * Called from fs/namei.c:path_lookup().
+ */
+void __audit_inode(const char *name, const struct dentry *dentry)
+{
+ int idx;
+ struct audit_context *context = current->audit_context;
+ const struct inode *inode = dentry->d_inode;
+
+ if (!context->in_syscall)
+ return;
+ if (context->name_count
+ && context->names[context->name_count-1].name
+ && context->names[context->name_count-1].name == name)
+ idx = context->name_count - 1;
+ else if (context->name_count > 1
+ && context->names[context->name_count-2].name
+ && context->names[context->name_count-2].name == name)
+ idx = context->name_count - 2;
+ else {
+ /* FIXME: how much do we care about inodes that have no
+ * associated name? */
+ if (audit_inc_name_count(context, inode))
+ return;
+ idx = context->name_count - 1;
+ context->names[idx].name = NULL;
+ }
+ handle_path(dentry);
+ audit_copy_inode(&context->names[idx], inode);
+}
+
+/**
+ * audit_inode_child - collect inode info for created/removed objects
+ * @dname: inode's dentry name
+ * @dentry: dentry being audited
+ * @parent: inode of dentry parent
+ *
+ * For syscalls that create or remove filesystem objects, audit_inode
+ * can only collect information for the filesystem object's parent.
+ * This call updates the audit context with the child's information.
+ * Syscalls that create a new filesystem object must be hooked after
+ * the object is created. Syscalls that remove a filesystem object
+ * must be hooked prior, in order to capture the target inode during
+ * unsuccessful attempts.
+ */
+void __audit_inode_child(const char *dname, const struct dentry *dentry,
+ const struct inode *parent)
+{
+ int idx;
+ struct audit_context *context = current->audit_context;
+ const char *found_parent = NULL, *found_child = NULL;
+ const struct inode *inode = dentry->d_inode;
+ int dirlen = 0;
+
+ if (!context->in_syscall)
+ return;
+
+ if (inode)
+ handle_one(inode);
+ /* determine matching parent */
+ if (!dname)
+ goto add_names;
+
+ /* parent is more likely, look for it first */
+ for (idx = 0; idx < context->name_count; idx++) {
+ struct audit_names *n = &context->names[idx];
+
+ if (!n->name)
+ continue;
+
+ if (n->ino == parent->i_ino &&
+ !audit_compare_dname_path(dname, n->name, &dirlen)) {
+ n->name_len = dirlen; /* update parent data in place */
+ found_parent = n->name;
+ goto add_names;
+ }
+ }
+
+ /* no matching parent, look for matching child */
+ for (idx = 0; idx < context->name_count; idx++) {
+ struct audit_names *n = &context->names[idx];
+
+ if (!n->name)
+ continue;
+
+ /* strcmp() is the more likely scenario */
+ if (!strcmp(dname, n->name) ||
+ !audit_compare_dname_path(dname, n->name, &dirlen)) {
+ if (inode)
+ audit_copy_inode(n, inode);
+ else
+ n->ino = (unsigned long)-1;
+ found_child = n->name;
+ goto add_names;
+ }
+ }
+
+add_names:
+ if (!found_parent) {
+ if (audit_inc_name_count(context, parent))
+ return;
+ idx = context->name_count - 1;
+ context->names[idx].name = NULL;
+ audit_copy_inode(&context->names[idx], parent);
+ }
+
+ if (!found_child) {
+ if (audit_inc_name_count(context, inode))
+ return;
+ idx = context->name_count - 1;
+
+ /* Re-use the name belonging to the slot for a matching parent
+ * directory. All names for this context are relinquished in
+ * audit_free_names() */
+ if (found_parent) {
+ context->names[idx].name = found_parent;
+ context->names[idx].name_len = AUDIT_NAME_FULL;
+ /* don't call __putname() */
+ context->names[idx].name_put = 0;
+ } else {
+ context->names[idx].name = NULL;
+ }
+
+ if (inode)
+ audit_copy_inode(&context->names[idx], inode);
+ else
+ context->names[idx].ino = (unsigned long)-1;
+ }
+}
+EXPORT_SYMBOL_GPL(__audit_inode_child);
+
+/**
+ * auditsc_get_stamp - get local copies of audit_context values
+ * @ctx: audit_context for the task
+ * @t: timespec to store time recorded in the audit_context
+ * @serial: serial value that is recorded in the audit_context
+ *
+ * Also sets the context as auditable.
+ */
+int auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
+{
+ if (!ctx->in_syscall)
+ return 0;
+ if (!ctx->serial)
+ ctx->serial = audit_serial();
+ t->tv_sec = ctx->ctime.tv_sec;
+ t->tv_nsec = ctx->ctime.tv_nsec;
+ *serial = ctx->serial;
+ ctx->auditable = 1;
+ return 1;
+}
+
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+/**
+ * audit_set_loginuid - set a task's audit_context loginuid
+ * @task: task whose audit context is being modified
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+{
+ unsigned int sessionid = atomic_inc_return(&session_id);
+ struct audit_context *context = task->audit_context;
+
+ if (context && context->in_syscall) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (ab) {
+ audit_log_format(ab, "login pid=%d uid=%u "
+ "old auid=%u new auid=%u"
+ " old ses=%u new ses=%u",
+ task->pid, task->uid,
+ task->loginuid, loginuid,
+ task->sessionid, sessionid);
+ audit_log_end(ab);
+ }
+ }
+ task->sessionid = sessionid;
+ task->loginuid = loginuid;
+ return 0;
+}
+
+/**
+ * __audit_mq_open - record audit data for a POSIX MQ open
+ * @oflag: open flag
+ * @mode: mode bits
+ * @u_attr: queue attributes
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
+{
+ struct audit_aux_data_mq_open *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_attr != NULL) {
+ if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->attr, 0, sizeof(ax->attr));
+
+ ax->oflag = oflag;
+ ax->mode = mode;
+
+ ax->d.type = AUDIT_MQ_OPEN;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
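+
+/*
+ * Illustrative sketch, not part of this file: the audit_mq_*() wrappers
+ * in include/linux/audit.h are expected to guard these slow paths with
+ * audit_dummy_context(), roughly like this (treat the exact wrapper
+ * signature as an assumption):
+ *
+ * static inline int audit_mq_open(int oflag, mode_t mode,
+ * struct mq_attr __user *u_attr)
+ * {
+ * if (unlikely(!audit_dummy_context()))
+ * return __audit_mq_open(oflag, mode, u_attr);
+ * return 0;
+ * }
+ */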
+
+/**
+ * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+ const struct timespec __user *u_abs_timeout)
+{
+ struct audit_aux_data_mq_sendrecv *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_abs_timeout != NULL) {
+ if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+
+ ax->mqdes = mqdes;
+ ax->msg_len = msg_len;
+ ax->msg_prio = msg_prio;
+
+ ax->d.type = AUDIT_MQ_SENDRECV;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @u_msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
+ unsigned int __user *u_msg_prio,
+ const struct timespec __user *u_abs_timeout)
+{
+ struct audit_aux_data_mq_sendrecv *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_msg_prio != NULL) {
+ if (get_user(ax->msg_prio, u_msg_prio)) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ ax->msg_prio = 0;
+
+ if (u_abs_timeout != NULL) {
+ if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+
+ ax->mqdes = mqdes;
+ ax->msg_len = msg_len;
+
+ ax->d.type = AUDIT_MQ_SENDRECV;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_notify - record audit data for a POSIX MQ notify
+ * @mqdes: MQ descriptor
+ * @u_notification: Notification event
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+
+int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+{
+ struct audit_aux_data_mq_notify *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_notification != NULL) {
+ if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->notification, 0, sizeof(ax->notification));
+
+ ax->mqdes = mqdes;
+
+ ax->d.type = AUDIT_MQ_NOTIFY;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
+ * @mqdes: MQ descriptor
+ * @mqstat: MQ flags
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+{
+ struct audit_aux_data_mq_getsetattr *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->mqdes = mqdes;
+ ax->mqstat = *mqstat;
+
+ ax->d.type = AUDIT_MQ_GETSETATTR;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * audit_ipc_obj - record audit data for ipc object
+ * @ipcp: ipc permissions
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
+{
+ struct audit_aux_data_ipcctl *ax;
+ struct audit_context *context = current->audit_context;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->uid = ipcp->uid;
+ ax->gid = ipcp->gid;
+ ax->mode = ipcp->mode;
+ security_ipc_getsecid(ipcp, &ax->osid);
+ ax->d.type = AUDIT_IPC;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * audit_ipc_set_perm - record audit data for new ipc permissions
+ * @qbytes: msgq bytes
+ * @uid: msgq user id
+ * @gid: msgq group id
+ * @mode: msgq mode (permissions)
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct audit_aux_data_ipcctl *ax;
+ struct audit_context *context = current->audit_context;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->qbytes = qbytes;
+ ax->uid = uid;
+ ax->gid = gid;
+ ax->mode = mode;
+
+ ax->d.type = AUDIT_IPC_SET_PERM;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
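+/* Record argc, envc and the new mm for an execve so the argument list can
+ * be logged by audit_log_execve_info() at syscall exit. */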
+int audit_bprm(struct linux_binprm *bprm)
+{
+ struct audit_aux_data_execve *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!audit_enabled || !context || context->dummy))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->argc = bprm->argc;
+ ax->envc = bprm->envc;
+ ax->mm = bprm->mm;
+ ax->d.type = AUDIT_EXECVE;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+
+/**
+ * audit_socketcall - record audit data for sys_socketcall
+ * @nargs: number of args
+ * @args: args array
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int audit_socketcall(int nargs, unsigned long *args)
+{
+ struct audit_aux_data_socketcall *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context || context->dummy))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->nargs = nargs;
+ memcpy(ax->args, args, nargs * sizeof(unsigned long));
+
+ ax->d.type = AUDIT_SOCKETCALL;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_fd_pair - record audit data for pipe and socketpair
+ * @fd1: the first file descriptor
+ * @fd2: the second file descriptor
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_fd_pair(int fd1, int fd2)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_aux_data_fd_pair *ax;
+
+ if (likely(!context)) {
+ return 0;
+ }
+
+ ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+ if (!ax) {
+ return -ENOMEM;
+ }
+
+ ax->fd[0] = fd1;
+ ax->fd[1] = fd2;
+
+ ax->d.type = AUDIT_FD_PAIR;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * @len: data length in user space
+ * @a: data address in kernel space
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int audit_sockaddr(int len, void *a)
+{
+ struct audit_aux_data_sockaddr *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context || context->dummy))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->len = len;
+ memcpy(ax->a, a, len);
+
+ ax->d.type = AUDIT_SOCKADDR;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
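+/* Record the identity of a ptrace target in the audit context so it is
+ * reported as an AUDIT_OBJ_PID record at syscall exit. */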
+void __audit_ptrace(struct task_struct *t)
+{
+ struct audit_context *context = current->audit_context;
+
+ context->target_pid = t->pid;
+ context->target_auid = audit_get_loginuid(t);
+ context->target_uid = t->uid;
+ context->target_sessionid = audit_get_sessionid(t);
+ security_task_getsecid(t, &context->target_sid);
+ memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
+}
+
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int __audit_signal_info(int sig, struct task_struct *t)
+{
+ struct audit_aux_data_pids *axp;
+ struct task_struct *tsk = current;
+ struct audit_context *ctx = tsk->audit_context;
+
+ if (audit_pid && t->tgid == audit_pid) {
+ if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
+ audit_sig_pid = tsk->pid;
+ if (tsk->loginuid != -1)
+ audit_sig_uid = tsk->loginuid;
+ else
+ audit_sig_uid = tsk->uid;
+ security_task_getsecid(tsk, &audit_sig_sid);
+ }
+ if (!audit_signals || audit_dummy_context())
+ return 0;
+ }
+
+ /* optimize the common case by putting the first signal recipient directly
+ * in audit_context */
+ if (!ctx->target_pid) {
+ ctx->target_pid = t->tgid;
+ ctx->target_auid = audit_get_loginuid(t);
+ ctx->target_uid = t->uid;
+ ctx->target_sessionid = audit_get_sessionid(t);
+ security_task_getsecid(t, &ctx->target_sid);
+ memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
+ return 0;
+ }
+
+ axp = (void *)ctx->aux_pids;
+ if (!axp || axp->pid_count == AUDIT_AUX_PIDS) {
+ axp = kzalloc(sizeof(*axp), GFP_ATOMIC);
+ if (!axp)
+ return -ENOMEM;
+
+ axp->d.type = AUDIT_OBJ_PID;
+ axp->d.next = ctx->aux_pids;
+ ctx->aux_pids = (void *)axp;
+ }
+ BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
+
+ axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
+ axp->target_uid[axp->pid_count] = t->uid;
+ axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
+ security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
+ axp->pid_count++;
+
+ return 0;
+}
+
+/**
+ * audit_core_dumps - record information about processes that end abnormally
+ * @signr: signal value
+ *
+ * If a process ends with a core dump, something fishy is going on and we
+ * should record the event for investigation.
+ */
+void audit_core_dumps(long signr)
+{
+ struct audit_buffer *ab;
+ u32 sid;
+ uid_t auid = audit_get_loginuid(current);
+ unsigned int sessionid = audit_get_sessionid(current);
+
+ if (!audit_enabled)
+ return;
+
+ if (signr == SIGQUIT) /* don't care for those */
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+ auid, current->uid, current->gid, sessionid);
+ security_task_getsecid(current, &sid);
+ if (sid) {
+ char *ctx = NULL;
+ u32 len;
+
+ if (security_secid_to_secctx(sid, &ctx, &len))
+ audit_log_format(ab, " ssid=%u", sid);
+ else {
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+ audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_untrustedstring(ab, current->comm);
+ audit_log_format(ab, " sig=%ld", signr);
+ audit_log_end(ab);
+}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 0000000..a5e026b
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,91 @@
+/*
+ * Simple stack backtrace regression test module
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+
+static void backtrace_test_normal(void)
+{
+ printk("Testing a backtrace from process context.\n");
+ printk("The following trace is a kernel self test and not a bug!\n");
+
+ dump_stack();
+}
+
+static DECLARE_COMPLETION(backtrace_work);
+
+static void backtrace_test_irq_callback(unsigned long data)
+{
+ dump_stack();
+ complete(&backtrace_work);
+}
+
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+
+static void backtrace_test_irq(void)
+{
+ printk("Testing a backtrace from irq context.\n");
+ printk("The following trace is a kernel self test and not a bug!\n");
+
+ init_completion(&backtrace_work);
+ tasklet_schedule(&backtrace_tasklet);
+ wait_for_completion(&backtrace_work);
+}
+
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+ struct stack_trace trace;
+ unsigned long entries[8];
+
+ printk("Testing a saved backtrace.\n");
+ printk("The following trace is a kernel self test and not a bug!\n");
+
+ trace.nr_entries = 0;
+ trace.max_entries = ARRAY_SIZE(entries);
+ trace.entries = entries;
+ trace.skip = 0;
+
+ save_stack_trace(&trace);
+ print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+ printk("Saved backtrace test skipped.\n");
+}
+#endif
+
+static int backtrace_regression_test(void)
+{
+ printk("====[ backtrace testing ]===========\n");
+
+ backtrace_test_normal();
+ backtrace_test_irq();
+ backtrace_test_saved();
+
+ printk("====[ end of backtrace testing ]====\n");
+ return 0;
+}
+
+static void exitf(void)
+{
+}
+
+module_init(backtrace_regression_test);
+module_exit(exitf);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
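+
+/*
+ * Usage note (an assumption, not part of the original file): build this as
+ * a module and load it (e.g. "modprobe backtracetest"); the test backtraces
+ * are printed to the kernel log, and the module can then be removed again.
+ */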
diff --git a/kernel/bounds.c b/kernel/bounds.c
new file mode 100644
index 0000000..3c53013
--- /dev/null
+++ b/kernel/bounds.c
@@ -0,0 +1,19 @@
+/*
+ * Generate definitions needed by the preprocessor.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#define __GENERATING_BOUNDS_H
+/* Include headers that define the enum constants of interest */
+#include <linux/page-flags.h>
+#include <linux/mmzone.h>
+#include <linux/kbuild.h>
+
+void foo(void)
+{
+ /* The enum constants to put into include/linux/bounds.h */
+ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
+ DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ /* End of constants */
+}
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 0000000..6ec8359
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,507 @@
+/*
+ * linux/kernel/capability.c
+ *
+ * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
+ *
+ * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
+ * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/pid_namespace.h>
+#include <asm/uaccess.h>
+
+/*
+ * This lock protects task->cap_* for all tasks including current.
+ * Locking rule: acquire this prior to tasklist_lock.
+ */
+static DEFINE_SPINLOCK(task_capability_lock);
+
+/*
+ * Leveraged for setting/resetting capabilities
+ */
+
+const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+const kernel_cap_t __cap_full_set = CAP_FULL_SET;
+const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
+
+EXPORT_SYMBOL(__cap_empty_set);
+EXPORT_SYMBOL(__cap_full_set);
+EXPORT_SYMBOL(__cap_init_eff_set);
+
+/*
+ * More recent versions of libcap are available from:
+ *
+ * http://www.kernel.org/pub/linux/libs/security/linux-privs/
+ */
+
+static void warn_legacy_capability_use(void)
+{
+ static int warned;
+ if (!warned) {
+ char name[sizeof(current->comm)];
+
+ printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
+ " (legacy support in use)\n",
+ get_task_comm(name, current));
+ warned = 1;
+ }
+}
+
+/*
+ * Version 2 capabilities worked fine, but the linux/capability.h file
+ * that accompanied their introduction encouraged their use without
+ * the necessary user-space source code changes. As such, we have
+ * created a version 3 with equivalent functionality to version 2, but
+ * with a header change to protect legacy source code from using
+ * version 2 when it wanted to use version 1. If your system has code
+ * that trips the following warning, it is using version 2 specific
+ * capabilities and may be doing so insecurely.
+ *
+ * The remedy is to either upgrade your version of libcap (to 2.10+,
+ * if the application is linked against it), or recompile your
+ * application with modern kernel headers and this warning will go
+ * away.
+ */
+
+static void warn_deprecated_v2(void)
+{
+ static int warned;
+
+ if (!warned) {
+ char name[sizeof(current->comm)];
+
+ printk(KERN_INFO "warning: `%s' uses deprecated v2"
+ " capabilities in a way that may be insecure.\n",
+ get_task_comm(name, current));
+ warned = 1;
+ }
+}
+
+/*
+ * Version check. Return the number of u32s in each capability flag
+ * array, or a negative value on error.
+ */
+static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
+{
+ __u32 version;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ switch (version) {
+ case _LINUX_CAPABILITY_VERSION_1:
+ warn_legacy_capability_use();
+ *tocopy = _LINUX_CAPABILITY_U32S_1;
+ break;
+ case _LINUX_CAPABILITY_VERSION_2:
+ warn_deprecated_v2();
+ /*
+ * fall through - v3 is otherwise equivalent to v2.
+ */
+ case _LINUX_CAPABILITY_VERSION_3:
+ *tocopy = _LINUX_CAPABILITY_U32S_3;
+ break;
+ default:
+ if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
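+
+/*
+ * Illustrative sketch, not part of this file: a raw (non-libcap) caller of
+ * capget() fills in the header that cap_validate_magic() checks roughly
+ * like this; using libcap rather than the raw syscall is the recommended
+ * interface.
+ *
+ * struct __user_cap_header_struct hdr = {
+ * .version = _LINUX_CAPABILITY_VERSION_3,
+ * .pid = 0, (0 means the calling process)
+ * };
+ * struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];
+ * capget(&hdr, data);
+ */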
+
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+
+/*
+ * Without filesystem capability support, we nominally support one process
+ * setting the capabilities of another
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+ kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+ struct task_struct *target;
+ int ret;
+
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ if (pid && pid != task_pid_vnr(current)) {
+ target = find_task_by_vpid(pid);
+ if (!target) {
+ ret = -ESRCH;
+ goto out;
+ }
+ } else
+ target = current;
+
+ ret = security_capget(target, pEp, pIp, pPp);
+
+out:
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ return ret;
+}
+
+/*
+ * cap_set_pg - set capabilities for all processes in a given process
+ * group. This function itself takes task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ struct task_struct *g, *target;
+ int ret = -EPERM;
+ int found = 0;
+ struct pid *pgrp;
+
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ pgrp = find_vpid(pgrp_nr);
+ do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
+ target = g;
+ while_each_thread(g, target) {
+ if (!security_capset_check(target, effective,
+ inheritable, permitted)) {
+ security_capset_set(target, effective,
+ inheritable, permitted);
+ ret = 0;
+ }
+ found = 1;
+ }
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
+
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ if (!found)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * cap_set_all - set capabilities for all processes other than init
+ * and self. This function itself takes task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_all(kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ struct task_struct *g, *target;
+ int ret = -EPERM;
+ int found = 0;
+
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ do_each_thread(g, target) {
+ if (target == current
+ || is_container_init(target->group_leader))
+ continue;
+ found = 1;
+ if (security_capset_check(target, effective, inheritable,
+ permitted))
+ continue;
+ ret = 0;
+ security_capset_set(target, effective, inheritable, permitted);
+ } while_each_thread(g, target);
+
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ if (!found)
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Given that the target pid does not refer to the current process, we
+ * need more elaborate support... (This support is not present when
+ * filesystem capabilities are configured.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ struct task_struct *target;
+ int ret;
+
+ if (!capable(CAP_SETPCAP))
+ return -EPERM;
+
+ if (pid == -1) /* all procs other than current and init */
+ return cap_set_all(effective, inheritable, permitted);
+
+ else if (pid < 0) /* all procs in process group */
+ return cap_set_pg(-pid, effective, inheritable, permitted);
+
+ /* target != current */
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ target = find_task_by_vpid(pid);
+ if (!target)
+ ret = -ESRCH;
+ else {
+ ret = security_capset_check(target, effective, inheritable,
+ permitted);
+
+ /* having verified that the proposed changes are legal,
+ we now put them into effect. */
+ if (!ret)
+ security_capset_set(target, effective, inheritable,
+ permitted);
+ }
+
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ return ret;
+}
+
+#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
+
+/*
+ * If we have configured with filesystem capability support, then the
+ * only thing that can change the capabilities of the current process
+ * is the current process. As such, we can't be in this code at the
+ * same time as we are in the process of setting capabilities in this
+ * process. The net result is that we can limit our use of locks to
+ * when we are reading the caps of another process.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+ kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+ int ret;
+
+ if (pid && (pid != task_pid_vnr(current))) {
+ struct task_struct *target;
+
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ target = find_task_by_vpid(pid);
+ if (!target)
+ ret = -ESRCH;
+ else
+ ret = security_capget(target, pEp, pIp, pPp);
+
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+ } else
+ ret = security_capget(current, pEp, pIp, pPp);
+
+ return ret;
+}
+
+/*
+ * With filesystem capability support configured, the kernel does not
+ * permit the changing of capabilities in one process by another
+ * process. (CAP_SETPCAP has much less broad semantics when configured
+ * this way.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid,
+ kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ return -EPERM;
+}
+
+#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+
+/*
+ * Atomically modify the effective capabilities returning the original
+ * value. No permission check is performed here - it is assumed that the
+ * caller is permitted to set the desired effective capabilities.
+ */
+kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
+{
+ kernel_cap_t pE_old;
+
+ spin_lock(&task_capability_lock);
+
+ pE_old = current->cap_effective;
+ current->cap_effective = pE_new;
+
+ spin_unlock(&task_capability_lock);
+
+ return pE_old;
+}
+
+EXPORT_SYMBOL(cap_set_effective);
+
+/**
+ * sys_capget - get the capabilities of a given process.
+ * @header: pointer to struct that contains capability version and
+ * target pid data
+ * @dataptr: pointer to struct that contains the effective, permitted,
+ * and inheritable capabilities that are returned
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
+{
+ int ret = 0;
+ pid_t pid;
+ unsigned tocopy;
+ kernel_cap_t pE, pI, pP;
+
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ ret = cap_get_target_pid(pid, &pE, &pI, &pP);
+
+ if (!ret) {
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+ unsigned i;
+
+ for (i = 0; i < tocopy; i++) {
+ kdata[i].effective = pE.cap[i];
+ kdata[i].permitted = pP.cap[i];
+ kdata[i].inheritable = pI.cap[i];
+ }
+
+ /*
+ * Note: in the case tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy
+ * sizeof(struct __user_cap_data_struct))) {
+ return -EFAULT;
+ }
+ }
+
+ return ret;
+}
+
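For reference, sys_capget() above is reachable from userspace through the raw capget(2) syscall; a minimal sketch of such a caller follows (it assumes the v3 header format, whose sets are carried in _LINUX_CAPABILITY_U32S_3 = 2 32-bit words, matching the tocopy handling above):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/capability.h>

int main(void)
{
        struct __user_cap_header_struct hdr = {
                .version = _LINUX_CAPABILITY_VERSION_3, /* 64-bit caps, two u32 words */
                .pid = 0,                               /* 0 selects the calling process */
        };
        struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

        if (syscall(SYS_capget, &hdr, data) != 0) {
                perror("capget");
                return 1;
        }
        printf("pE[0]=%#x pP[0]=%#x pI[0]=%#x\n",
               data[0].effective, data[0].permitted, data[0].inheritable);
        return 0;
}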
+/**
+ * sys_capset - set capabilities for a process or (*) a group of processes
+ * @header: pointer to struct that contains capability version and
+ * target pid data
+ * @data: pointer to struct that contains the effective, permitted,
+ * and inheritable capabilities
+ *
+ * Set capabilities for a given process, all processes, or all
+ * processes in a given process group.
+ *
+ * The restrictions on setting capabilities are specified as:
+ *
+ * [pid is for the 'target' task. 'current' is the calling task.]
+ *
+ * I: any raised capabilities must be a subset of the (old current) permitted
+ * P: any raised capabilities must be a subset of the (old current) permitted
+ * E: must be set to a subset of (new target) permitted
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
+{
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+ unsigned i, tocopy;
+ kernel_cap_t inheritable, permitted, effective;
+ int ret;
+ pid_t pid;
+
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (copy_from_user(&kdata, data, tocopy
+ * sizeof(struct __user_cap_data_struct))) {
+ return -EFAULT;
+ }
+
+ for (i = 0; i < tocopy; i++) {
+ effective.cap[i] = kdata[i].effective;
+ permitted.cap[i] = kdata[i].permitted;
+ inheritable.cap[i] = kdata[i].inheritable;
+ }
+ while (i < _KERNEL_CAPABILITY_U32S) {
+ effective.cap[i] = 0;
+ permitted.cap[i] = 0;
+ inheritable.cap[i] = 0;
+ i++;
+ }
+
+ if (pid && (pid != task_pid_vnr(current)))
+ ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
+ &permitted);
+ else {
+ /*
+ * This lock is required even when filesystem
+ * capability support is configured - it protects the
+ * sys_capget() call from returning incorrect data in
+ * the case that the targeted process is not the
+ * current one.
+ */
+ spin_lock(&task_capability_lock);
+
+ ret = security_capset_check(current, &effective, &inheritable,
+ &permitted);
+ /*
+ * Having verified that the proposed changes are
+ * legal, we now put them into effect.
+ */
+ if (!ret)
+ security_capset_set(current, &effective, &inheritable,
+ &permitted);
+ spin_unlock(&task_capability_lock);
+ }
+
+ return ret;
+}
+
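Read as bitmask checks, the three rules above amount to the following sketch for the self-targeted case, with a capability set modelled as two 32-bit words (the LSM hook behind security_capset_check() may be more permissive when another task's existing sets are involved):

#include <stdbool.h>
#include <stdint.h>

struct caps { uint32_t w[2]; };

static bool subset(struct caps a, struct caps b)        /* every bit of a is in b */
{
        return !(a.w[0] & ~b.w[0]) && !(a.w[1] & ~b.w[1]);
}

static bool capset_ok(struct caps old_pI, struct caps old_pP,
                      struct caps new_pI, struct caps new_pP, struct caps new_pE)
{
        struct caps raised_I = { { new_pI.w[0] & ~old_pI.w[0],
                                   new_pI.w[1] & ~old_pI.w[1] } };

        return subset(raised_I, old_pP) &&      /* I: raised bits come from old permitted */
               subset(new_pP, old_pP) &&        /* P: can only be reduced */
               subset(new_pE, new_pP);          /* E: subset of the new permitted set */
}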
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int capable(int cap)
+{
+ if (has_capability(current, cap)) {
+ current->flags |= PF_SUPERPRIV;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 0000000..c882385
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,3184 @@
+/*
+ * Generic process-grouping system.
+ *
+ * Based originally on the cpuset system, extracted by Paul Menage
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Copyright notices from the original cpuset code:
+ * --------------------------------------------------
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * ---------------------------------------------------
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/backing-dev.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sort.h>
+#include <linux/kmod.h>
+#include <linux/delayacct.h>
+#include <linux/cgroupstats.h>
+#include <linux/hash.h>
+#include <linux/namei.h>
+
+#include <asm/atomic.h>
+
+static DEFINE_MUTEX(cgroup_mutex);
+
+/* Generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) &_x ## _subsys,
+
+static struct cgroup_subsys *subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+
+/*
+ * A cgroupfs_root represents the root of a cgroup hierarchy,
+ * and may be associated with a superblock to form an active
+ * hierarchy
+ */
+struct cgroupfs_root {
+ struct super_block *sb;
+
+ /*
+ * The bitmask of subsystems intended to be attached to this
+ * hierarchy
+ */
+ unsigned long subsys_bits;
+
+ /* The bitmask of subsystems currently attached to this hierarchy */
+ unsigned long actual_subsys_bits;
+
+ /* A list running through the attached subsystems */
+ struct list_head subsys_list;
+
+ /* The root cgroup for this hierarchy */
+ struct cgroup top_cgroup;
+
+ /* Tracks how many cgroups are currently defined in hierarchy.*/
+ int number_of_cgroups;
+
+ /* A list running through the mounted hierarchies */
+ struct list_head root_list;
+
+ /* Hierarchy-specific flags */
+ unsigned long flags;
+
+ /* The path to use for release notifications. */
+ char release_agent_path[PATH_MAX];
+};
+
+
+/*
+ * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
+ * subsystems that are otherwise unattached - it never has more than a
+ * single cgroup, and all tasks are part of that cgroup.
+ */
+static struct cgroupfs_root rootnode;
+
+/* The list of hierarchy roots */
+
+static LIST_HEAD(roots);
+static int root_count;
+
+/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
+#define dummytop (&rootnode.top_cgroup)
+
+/* This flag indicates whether tasks in the fork and exit paths should
+ * check for fork/exit handlers to call. This avoids us having to do
+ * extra work in the fork/exit path if none of the subsystems need to
+ * be called.
+ */
+static int need_forkexit_callback __read_mostly;
+static int need_mm_owner_callback __read_mostly;
+
+/* convenient tests for these bits */
+inline int cgroup_is_removed(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_REMOVED, &cgrp->flags);
+}
+
+/* bits in struct cgroupfs_root flags field */
+enum {
+ ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
+};
+
+static int cgroup_is_releasable(const struct cgroup *cgrp)
+{
+ const int bits =
+ (1 << CGRP_RELEASABLE) |
+ (1 << CGRP_NOTIFY_ON_RELEASE);
+ return (cgrp->flags & bits) == bits;
+}
+
+static int notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+/*
+ * for_each_subsys() allows you to iterate on each subsystem attached to
+ * an active hierarchy
+ */
+#define for_each_subsys(_root, _ss) \
+list_for_each_entry(_ss, &_root->subsys_list, sibling)
+
+/* for_each_root() allows you to iterate across the active hierarchies */
+#define for_each_root(_root) \
+list_for_each_entry(_root, &roots, root_list)
+
+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cgrp);
+
+/* Link structure for associating css_set objects with cgroups */
+struct cg_cgroup_link {
+ /*
+ * List running through cg_cgroup_links associated with a
+ * cgroup, anchored on cgroup->css_sets
+ */
+ struct list_head cgrp_link_list;
+ /*
+ * List running through cg_cgroup_links pointing at a
+ * single css_set object, anchored on css_set->cg_links
+ */
+ struct list_head cg_link_list;
+ struct css_set *cg;
+};
+
+/* The default css_set - used by init and its children prior to any
+ * hierarchies being mounted. It contains a pointer to the root state
+ * for each subsystem. Also used to anchor the list of css_sets. Not
+ * reference-counted, to improve performance when child cgroups
+ * haven't been created.
+ */
+
+static struct css_set init_css_set;
+static struct cg_cgroup_link init_css_set_link;
+
+/* css_set_lock protects the list of css_set objects, and the
+ * chain of tasks off each css_set. Nests outside task->alloc_lock
+ * due to cgroup_iter_start() */
+static DEFINE_RWLOCK(css_set_lock);
+static int css_set_count;
+
+/* hash table for cgroup groups. This improves the performance of
+ * finding an existing css_set */
+#define CSS_SET_HASH_BITS 7
+#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
+static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+
+static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+{
+ int i;
+ int index;
+ unsigned long tmp = 0UL;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+ tmp += (unsigned long)css[i];
+ tmp = (tmp >> 16) ^ tmp;
+
+ index = hash_long(tmp, CSS_SET_HASH_BITS);
+
+ return &css_set_table[index];
+}
+
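css_set_hash() keys the table purely on the subsystem state pointers: sum them, fold the high bits into the low ones, and keep CSS_SET_HASH_BITS bits of a multiplicative hash. A userspace sketch of the same idea (the multiplier is the classic 32-bit golden-ratio constant, not necessarily what hash_long() expands to on a given architecture):

#include <stdint.h>

#define SKETCH_HASH_BITS 7

static unsigned int css_hash_sketch(const void * const css[], int n)
{
        uintptr_t tmp = 0;
        uint32_t h;
        int i;

        for (i = 0; i < n; i++)
                tmp += (uintptr_t)css[i];
        tmp = (tmp >> 16) ^ tmp;                /* fold high bits down */

        h = (uint32_t)tmp * 2654435761u;        /* Knuth multiplicative hash */
        return h >> (32 - SKETCH_HASH_BITS);    /* bucket index in [0, 127] */
}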
+/* We don't maintain the lists running through each css_set to its
+ * task until after the first call to cgroup_iter_start(). This
+ * reduces the fork()/exit() overhead for people who have cgroups
+ * compiled into their kernel but not actually in use */
+static int use_task_css_set_links __read_mostly;
+
+/* When we create or destroy a css_set, the operation simply
+ * takes/releases a reference count on all the cgroups referenced
+ * by subsystems in this css_set. This can end up multiple-counting
+ * some cgroups, but that's OK - the ref-count is just a
+ * busy/not-busy indicator; ensuring that we only count each cgroup
+ * once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+
+/*
+ * unlink a css_set from the list and free it
+ */
+static void unlink_css_set(struct css_set *cg)
+{
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ hlist_del(&cg->hlist);
+ css_set_count--;
+
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
+ list_del(&link->cg_link_list);
+ list_del(&link->cgrp_link_list);
+ kfree(link);
+ }
+}
+
+static void __put_css_set(struct css_set *cg, int taskexit)
+{
+ int i;
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cg->refcount, -1, 1))
+ return;
+ write_lock(&css_set_lock);
+ if (!atomic_dec_and_test(&cg->refcount)) {
+ write_unlock(&css_set_lock);
+ return;
+ }
+ unlink_css_set(cg);
+ write_unlock(&css_set_lock);
+
+ rcu_read_lock();
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup *cgrp = cg->subsys[i]->cgroup;
+ if (atomic_dec_and_test(&cgrp->count) &&
+ notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
+ }
+ rcu_read_unlock();
+ kfree(cg);
+}
+
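__put_css_set() uses the "decrement unless last" idiom: references are dropped without the lock while the count stays above one, and only the final put takes css_set_lock for writing so it can unlink the object while readers are excluded. A rough userspace analogue with C11 atomics and a pthread rwlock (types and names here are illustrative, not kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
        atomic_int refcount;
        /* ...state that readers may inspect under obj_lock... */
};

static pthread_rwlock_t obj_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Atomically perform "if (count != 1) count--"; report whether it happened. */
static bool dec_unless_last(atomic_int *count)
{
        int old = atomic_load(count);

        while (old != 1)
                if (atomic_compare_exchange_weak(count, &old, old - 1))
                        return true;
        return false;
}

static void put_obj(struct obj *o, void (*destroy)(struct obj *))
{
        if (dec_unless_last(&o->refcount))
                return;                         /* fast path: not the last reference */

        pthread_rwlock_wrlock(&obj_lock);
        if (atomic_fetch_sub(&o->refcount, 1) != 1) {
                pthread_rwlock_unlock(&obj_lock);
                return;                         /* lost a race with a concurrent get */
        }
        /* unlink o from shared structures here, writers excluded */
        pthread_rwlock_unlock(&obj_lock);
        destroy(o);
}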
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cg)
+{
+ atomic_inc(&cg->refcount);
+}
+
+static inline void put_css_set(struct css_set *cg)
+{
+ __put_css_set(cg, 0);
+}
+
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+ __put_css_set(cg, 1);
+}
+
+/*
+ * find_existing_css_set() is a helper for
+ * find_css_set(), and checks to see whether an existing
+ * css_set is suitable.
+ *
+ * oldcg: the cgroup group that we're using before the cgroup
+ * transition
+ *
+ * cgrp: the cgroup that we're moving into
+ *
+ * template: location in which to build the desired set of subsystem
+ * state objects for the new cgroup group
+ */
+static struct css_set *find_existing_css_set(
+ struct css_set *oldcg,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ int i;
+ struct cgroupfs_root *root = cgrp->root;
+ struct hlist_head *hhead;
+ struct hlist_node *node;
+ struct css_set *cg;
+
+ /* Build the set of subsystem state objects that we want to
+ * see in the new css_set */
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ if (root->subsys_bits & (1UL << i)) {
+ /* Subsystem is in this hierarchy. So we want
+ * the subsystem state from the new
+ * cgroup */
+ template[i] = cgrp->subsys[i];
+ } else {
+ /* Subsystem is not in this hierarchy, so we
+ * don't want to change the subsystem state */
+ template[i] = oldcg->subsys[i];
+ }
+ }
+
+ hhead = css_set_hash(template);
+ hlist_for_each_entry(cg, node, hhead, hlist) {
+ if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+ /* All subsystems matched */
+ return cg;
+ }
+ }
+
+ /* No existing cgroup group matched */
+ return NULL;
+}
+
+static void free_cg_links(struct list_head *tmp)
+{
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
+ list_del(&link->cgrp_link_list);
+ kfree(link);
+ }
+}
+
+/*
+ * allocate_cg_links() allocates "count" cg_cgroup_link structures
+ * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
+ * success or a negative error
+ */
+static int allocate_cg_links(int count, struct list_head *tmp)
+{
+ struct cg_cgroup_link *link;
+ int i;
+ INIT_LIST_HEAD(tmp);
+ for (i = 0; i < count; i++) {
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ free_cg_links(tmp);
+ return -ENOMEM;
+ }
+ list_add(&link->cgrp_link_list, tmp);
+ }
+ return 0;
+}
+
+/*
+ * find_css_set() takes an existing cgroup group and a
+ * cgroup object, and returns a css_set object that's
+ * equivalent to the old group, but with the given cgroup
+ * substituted into the appropriate hierarchy. Must be called with
+ * cgroup_mutex held
+ */
+static struct css_set *find_css_set(
+ struct css_set *oldcg, struct cgroup *cgrp)
+{
+ struct css_set *res;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ int i;
+
+ struct list_head tmp_cg_links;
+ struct cg_cgroup_link *link;
+
+ struct hlist_head *hhead;
+
+ /* First see if we already have a cgroup group that matches
+ * the desired set */
+ read_lock(&css_set_lock);
+ res = find_existing_css_set(oldcg, cgrp, template);
+ if (res)
+ get_css_set(res);
+ read_unlock(&css_set_lock);
+
+ if (res)
+ return res;
+
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return NULL;
+
+ /* Allocate all the cg_cgroup_link objects that we'll need */
+ if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
+ kfree(res);
+ return NULL;
+ }
+
+ atomic_set(&res->refcount, 1);
+ INIT_LIST_HEAD(&res->cg_links);
+ INIT_LIST_HEAD(&res->tasks);
+ INIT_HLIST_NODE(&res->hlist);
+
+ /* Copy the set of subsystem state objects generated in
+ * find_existing_css_set() */
+ memcpy(res->subsys, template, sizeof(res->subsys));
+
+ write_lock(&css_set_lock);
+ /* Add reference counts and links from the new css_set. */
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup *cgrp = res->subsys[i]->cgroup;
+ struct cgroup_subsys *ss = subsys[i];
+ atomic_inc(&cgrp->count);
+ /*
+ * We want to add a link once per cgroup, so we
+ * only do it for the first subsystem in each
+ * hierarchy
+ */
+ if (ss->root->subsys_list.next == &ss->sibling) {
+ BUG_ON(list_empty(&tmp_cg_links));
+ link = list_entry(tmp_cg_links.next,
+ struct cg_cgroup_link,
+ cgrp_link_list);
+ list_del(&link->cgrp_link_list);
+ list_add(&link->cgrp_link_list, &cgrp->css_sets);
+ link->cg = res;
+ list_add(&link->cg_link_list, &res->cg_links);
+ }
+ }
+ if (list_empty(&rootnode.subsys_list)) {
+ link = list_entry(tmp_cg_links.next,
+ struct cg_cgroup_link,
+ cgrp_link_list);
+ list_del(&link->cgrp_link_list);
+ list_add(&link->cgrp_link_list, &dummytop->css_sets);
+ link->cg = res;
+ list_add(&link->cg_link_list, &res->cg_links);
+ }
+
+ BUG_ON(!list_empty(&tmp_cg_links));
+
+ css_set_count++;
+
+ /* Add this cgroup group to the hash table */
+ hhead = css_set_hash(res->subsys);
+ hlist_add_head(&res->hlist, hhead);
+
+ write_unlock(&css_set_lock);
+
+ return res;
+}
+
+/*
+ * There is one global cgroup mutex. We also require taking
+ * task_lock() when dereferencing a task's cgroup subsys pointers.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold cgroup_mutex to modify cgroups.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding cgroup_mutex can't rely on the count
+ * field not changing. However, if the count goes to zero, then only
+ * cgroup_attach_task() can increment it again. Because a count of zero
+ * means that no tasks are currently attached, therefore there is no
+ * way a task attached to that cgroup can fork (the other way to
+ * increment the count). So code holding cgroup_mutex can safely
+ * assume that if the count is zero, it will stay zero. Similarly, if
+ * a task holds cgroup_mutex on a cgroup with zero count, it
+ * knows that the cgroup won't be removed, as cgroup_rmdir()
+ * needs that mutex.
+ *
+ * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
+ * (usually) take cgroup_mutex. These are the two most performance
+ * critical pieces of code here. The exception occurs on cgroup_exit(),
+ * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
+ * is taken, and if the cgroup count is zero, a usermode call is made
+ * to the release agent with the name of the cgroup (path relative to
+ * the root of cgroup file system) as the argument.
+ *
+ * A cgroup can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cgroups is empty. Since all
+ * tasks in the system use _some_ cgroup, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * always has children cgroups and/or using tasks. So we don't
+ * need a special hack to ensure that top_cgroup cannot be deleted.
+ *
+ * The task_lock() exception
+ *
+ * The need for this exception arises from the action of
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
+ * another. It does so using cgroup_mutex, however there are
+ * several performance critical places that need to reference
+ * task->cgroup without the expense of grabbing a system global
+ * mutex. Therefore except as noted below, when dereferencing or, as
+ * in cgroup_attach_task(), modifying a task's cgroup pointer, we use
+ * task_lock(), which acts on a spinlock (task->alloc_lock) already in
+ * the task_struct routinely used for such matters.
+ *
+ * P.S. One more locking exception. RCU is used to guard the
+ * update of a task's cgroup pointer by cgroup_attach_task()
+ */
+
+/**
+ * cgroup_lock - lock out any changes to cgroup structures
+ *
+ */
+void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+/**
+ * cgroup_unlock - release lock on cgroup changes
+ *
+ * Undo the lock taken in a previous cgroup_lock() call.
+ */
+void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
+/*
+ * A couple of forward declarations required, due to cyclic reference loop:
+ * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
+ * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
+ * -> cgroup_mkdir.
+ */
+
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
+static int cgroup_populate_dir(struct cgroup *cgrp);
+static struct inode_operations cgroup_dir_inode_operations;
+static struct file_operations proc_cgroupstats_operations;
+
+static struct backing_dev_info cgroup_backing_dev_info = {
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
+{
+ struct inode *inode = new_inode(sb);
+
+ if (inode) {
+ inode->i_mode = mode;
+ inode->i_uid = current->fsuid;
+ inode->i_gid = current->fsgid;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
+ }
+ return inode;
+}
+
+/*
+ * Call subsys's pre_destroy handler.
+ * This is called before css refcnt check.
+ */
+static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+{
+ struct cgroup_subsys *ss;
+ for_each_subsys(cgrp->root, ss)
+ if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+ ss->pre_destroy(ss, cgrp);
+ return;
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+ /* is dentry a directory ? if so, kfree() associated cgroup */
+ if (S_ISDIR(inode->i_mode)) {
+ struct cgroup *cgrp = dentry->d_fsdata;
+ struct cgroup_subsys *ss;
+ BUG_ON(!(cgroup_is_removed(cgrp)));
+ /* It's possible for external users to be holding css
+ * reference counts on a cgroup; css_put() needs to
+ * be able to access the cgroup after decrementing
+ * the reference count in order to know if it needs to
+ * queue the cgroup to be handled by the release
+ * agent */
+ synchronize_rcu();
+
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss) {
+ if (cgrp->subsys[ss->subsys_id])
+ ss->destroy(ss, cgrp);
+ }
+
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
+
+ /* Drop the active superblock reference that we took when we
+ * created the cgroup */
+ deactivate_super(cgrp->root->sb);
+
+ kfree(cgrp);
+ }
+ iput(inode);
+}
+
+static void remove_dir(struct dentry *d)
+{
+ struct dentry *parent = dget(d->d_parent);
+
+ d_delete(d);
+ simple_rmdir(parent->d_inode, d);
+ dput(parent);
+}
+
+static void cgroup_clear_directory(struct dentry *dentry)
+{
+ struct list_head *node;
+
+ BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ spin_lock(&dcache_lock);
+ node = dentry->d_subdirs.next;
+ while (node != &dentry->d_subdirs) {
+ struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+ list_del_init(node);
+ if (d->d_inode) {
+ /* This should never be called on a cgroup
+ * directory with child cgroups */
+ BUG_ON(d->d_inode->i_mode & S_IFDIR);
+ d = dget_locked(d);
+ spin_unlock(&dcache_lock);
+ d_delete(d);
+ simple_unlink(dentry->d_inode, d);
+ dput(d);
+ spin_lock(&dcache_lock);
+ }
+ node = dentry->d_subdirs.next;
+ }
+ spin_unlock(&dcache_lock);
+}
+
+/*
+ * NOTE: the dentry must have been dget()'ed
+ */
+static void cgroup_d_remove_dir(struct dentry *dentry)
+{
+ cgroup_clear_directory(dentry);
+
+ spin_lock(&dcache_lock);
+ list_del_init(&dentry->d_u.d_child);
+ spin_unlock(&dcache_lock);
+ remove_dir(dentry);
+}
+
+static int rebind_subsystems(struct cgroupfs_root *root,
+ unsigned long final_bits)
+{
+ unsigned long added_bits, removed_bits;
+ struct cgroup *cgrp = &root->top_cgroup;
+ int i;
+
+ removed_bits = root->actual_subsys_bits & ~final_bits;
+ added_bits = final_bits & ~root->actual_subsys_bits;
+ /* Check that any added subsystems are currently free */
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ unsigned long bit = 1UL << i;
+ struct cgroup_subsys *ss = subsys[i];
+ if (!(bit & added_bits))
+ continue;
+ if (ss->root != &rootnode) {
+ /* Subsystem isn't free */
+ return -EBUSY;
+ }
+ }
+
+ /* Currently we don't handle adding/removing subsystems when
+ * any child cgroups exist. This is theoretically supportable
+ * but involves complex error handling, so it's being left until
+ * later */
+ if (root->number_of_cgroups > 1)
+ return -EBUSY;
+
+ /* Process each subsystem */
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ unsigned long bit = 1UL << i;
+ if (bit & added_bits) {
+ /* We're binding this subsystem to this hierarchy */
+ BUG_ON(cgrp->subsys[i]);
+ BUG_ON(!dummytop->subsys[i]);
+ BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+ cgrp->subsys[i] = dummytop->subsys[i];
+ cgrp->subsys[i]->cgroup = cgrp;
+ list_add(&ss->sibling, &root->subsys_list);
+ rcu_assign_pointer(ss->root, root);
+ if (ss->bind)
+ ss->bind(ss, cgrp);
+
+ } else if (bit & removed_bits) {
+ /* We're removing this subsystem */
+ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+ BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+ if (ss->bind)
+ ss->bind(ss, dummytop);
+ dummytop->subsys[i]->cgroup = dummytop;
+ cgrp->subsys[i] = NULL;
+ rcu_assign_pointer(subsys[i]->root, &rootnode);
+ list_del(&ss->sibling);
+ } else if (bit & final_bits) {
+ /* Subsystem state should already exist */
+ BUG_ON(!cgrp->subsys[i]);
+ } else {
+ /* Subsystem state shouldn't exist */
+ BUG_ON(cgrp->subsys[i]);
+ }
+ }
+ root->subsys_bits = root->actual_subsys_bits = final_bits;
+ synchronize_rcu();
+
+ return 0;
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+ struct cgroup_subsys *ss;
+
+ mutex_lock(&cgroup_mutex);
+ for_each_subsys(root, ss)
+ seq_printf(seq, ",%s", ss->name);
+ if (test_bit(ROOT_NOPREFIX, &root->flags))
+ seq_puts(seq, ",noprefix");
+ if (strlen(root->release_agent_path))
+ seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+struct cgroup_sb_opts {
+ unsigned long subsys_bits;
+ unsigned long flags;
+ char *release_agent;
+};
+
+/* Convert a hierarchy specifier into a bitmask of subsystems and
+ * flags. */
+static int parse_cgroupfs_options(char *data,
+ struct cgroup_sb_opts *opts)
+{
+ char *token, *o = data ?: "all";
+
+ opts->subsys_bits = 0;
+ opts->flags = 0;
+ opts->release_agent = NULL;
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ if (!*token)
+ return -EINVAL;
+ if (!strcmp(token, "all")) {
+ /* Add all non-disabled subsystems */
+ int i;
+ opts->subsys_bits = 0;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (!ss->disabled)
+ opts->subsys_bits |= 1ul << i;
+ }
+ } else if (!strcmp(token, "noprefix")) {
+ set_bit(ROOT_NOPREFIX, &opts->flags);
+ } else if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+ opts->release_agent[PATH_MAX - 1] = 0;
+ } else {
+ struct cgroup_subsys *ss;
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ ss = subsys[i];
+ if (!strcmp(token, ss->name)) {
+ if (!ss->disabled)
+ set_bit(i, &opts->subsys_bits);
+ break;
+ }
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+ }
+
+ /* We can't have an empty hierarchy */
+ if (!opts->subsys_bits)
+ return -EINVAL;
+
+ return 0;
+}
+
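This option syntax is what userspace passes as mount data. A hypothetical mount(2) call selecting just the cpuset controller, with no controller-name prefix and a release agent (the mount point and agent path are example values, and the target directory must already exist):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("cgroup", "/dev/cgroup", "cgroup", 0,
                  "cpuset,noprefix,release_agent=/sbin/cgroup_release") != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}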
+static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+{
+ int ret = 0;
+ struct cgroupfs_root *root = sb->s_fs_info;
+ struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup_sb_opts opts;
+
+ mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+ mutex_lock(&cgroup_mutex);
+
+ /* See what subsystems are wanted */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ /* Don't allow flags to change at remount */
+ if (opts.flags != root->flags) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = rebind_subsystems(root, opts.subsys_bits);
+
+ /* (re)populate subsystem files */
+ if (!ret)
+ cgroup_populate_dir(cgrp);
+
+ if (opts.release_agent)
+ strcpy(root->release_agent_path, opts.release_agent);
+ out_unlock:
+ if (opts.release_agent)
+ kfree(opts.release_agent);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ return ret;
+}
+
+static struct super_operations cgroup_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .show_options = cgroup_show_options,
+ .remount_fs = cgroup_remount,
+};
+
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+ INIT_LIST_HEAD(&cgrp->sibling);
+ INIT_LIST_HEAD(&cgrp->children);
+ INIT_LIST_HEAD(&cgrp->css_sets);
+ INIT_LIST_HEAD(&cgrp->release_list);
+ init_rwsem(&cgrp->pids_mutex);
+}
+
+static void init_cgroup_root(struct cgroupfs_root *root)
+{
+ struct cgroup *cgrp = &root->top_cgroup;
+ INIT_LIST_HEAD(&root->subsys_list);
+ INIT_LIST_HEAD(&root->root_list);
+ root->number_of_cgroups = 1;
+ cgrp->root = root;
+ cgrp->top_cgroup = cgrp;
+ init_cgroup_housekeeping(cgrp);
+}
+
+static int cgroup_test_super(struct super_block *sb, void *data)
+{
+ struct cgroupfs_root *new = data;
+ struct cgroupfs_root *root = sb->s_fs_info;
+
+ /* First check subsystems */
+ if (new->subsys_bits != root->subsys_bits)
+ return 0;
+
+ /* Next check flags */
+ if (new->flags != root->flags)
+ return 0;
+
+ return 1;
+}
+
+static int cgroup_set_super(struct super_block *sb, void *data)
+{
+ int ret;
+ struct cgroupfs_root *root = data;
+
+ ret = set_anon_super(sb, NULL);
+ if (ret)
+ return ret;
+
+ sb->s_fs_info = root;
+ root->sb = sb;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = CGROUP_SUPER_MAGIC;
+ sb->s_op = &cgroup_ops;
+
+ return 0;
+}
+
+static int cgroup_get_rootdir(struct super_block *sb)
+{
+ struct inode *inode =
+ cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
+ struct dentry *dentry;
+
+ if (!inode)
+ return -ENOMEM;
+
+ inode->i_fop = &simple_dir_operations;
+ inode->i_op = &cgroup_dir_inode_operations;
+ /* directories start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ dentry = d_alloc_root(inode);
+ if (!dentry) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ sb->s_root = dentry;
+ return 0;
+}
+
+static int cgroup_get_sb(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ struct cgroup_sb_opts opts;
+ int ret = 0;
+ struct super_block *sb;
+ struct cgroupfs_root *root;
+ struct list_head tmp_cg_links;
+
+ /* First find the desired set of subsystems */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret) {
+ if (opts.release_agent)
+ kfree(opts.release_agent);
+ return ret;
+ }
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ if (opts.release_agent)
+ kfree(opts.release_agent);
+ return -ENOMEM;
+ }
+
+ init_cgroup_root(root);
+ root->subsys_bits = opts.subsys_bits;
+ root->flags = opts.flags;
+ if (opts.release_agent) {
+ strcpy(root->release_agent_path, opts.release_agent);
+ kfree(opts.release_agent);
+ }
+
+ sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
+
+ if (IS_ERR(sb)) {
+ kfree(root);
+ return PTR_ERR(sb);
+ }
+
+ if (sb->s_fs_info != root) {
+ /* Reusing an existing superblock */
+ BUG_ON(sb->s_root == NULL);
+ kfree(root);
+ root = NULL;
+ } else {
+ /* New superblock */
+ struct cgroup *cgrp = &root->top_cgroup;
+ struct inode *inode;
+ int i;
+
+ BUG_ON(sb->s_root != NULL);
+
+ ret = cgroup_get_rootdir(sb);
+ if (ret)
+ goto drop_new_super;
+ inode = sb->s_root->d_inode;
+
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&cgroup_mutex);
+
+ /*
+ * We're accessing css_set_count without locking
+ * css_set_lock here, but that's OK - it can only be
+ * increased by someone holding cgroup_lock, and
+ * that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cg_links(css_set_count, &tmp_cg_links);
+ if (ret) {
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
+ goto drop_new_super;
+ }
+
+ ret = rebind_subsystems(root, root->subsys_bits);
+ if (ret == -EBUSY) {
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
+ goto free_cg_links;
+ }
+
+ /* EBUSY should be the only error here */
+ BUG_ON(ret);
+
+ list_add(&root->root_list, &roots);
+ root_count++;
+
+ sb->s_root->d_fsdata = &root->top_cgroup;
+ root->top_cgroup.dentry = sb->s_root;
+
+ /* Link the top cgroup in this hierarchy into all
+ * the css_set objects */
+ write_lock(&css_set_lock);
+ for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+ struct hlist_head *hhead = &css_set_table[i];
+ struct hlist_node *node;
+ struct css_set *cg;
+
+ hlist_for_each_entry(cg, node, hhead, hlist) {
+ struct cg_cgroup_link *link;
+
+ BUG_ON(list_empty(&tmp_cg_links));
+ link = list_entry(tmp_cg_links.next,
+ struct cg_cgroup_link,
+ cgrp_link_list);
+ list_del(&link->cgrp_link_list);
+ link->cg = cg;
+ list_add(&link->cgrp_link_list,
+ &root->top_cgroup.css_sets);
+ list_add(&link->cg_link_list, &cg->cg_links);
+ }
+ }
+ write_unlock(&css_set_lock);
+
+ free_cg_links(&tmp_cg_links);
+
+ BUG_ON(!list_empty(&cgrp->sibling));
+ BUG_ON(!list_empty(&cgrp->children));
+ BUG_ON(root->number_of_cgroups != 1);
+
+ cgroup_populate_dir(cgrp);
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&cgroup_mutex);
+ }
+
+ return simple_set_mnt(mnt, sb);
+
+ free_cg_links:
+ free_cg_links(&tmp_cg_links);
+ drop_new_super:
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+ return ret;
+}
+
+static void cgroup_kill_sb(struct super_block *sb) {
+ struct cgroupfs_root *root = sb->s_fs_info;
+ struct cgroup *cgrp = &root->top_cgroup;
+ int ret;
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ BUG_ON(!root);
+
+ BUG_ON(root->number_of_cgroups != 1);
+ BUG_ON(!list_empty(&cgrp->children));
+ BUG_ON(!list_empty(&cgrp->sibling));
+
+ mutex_lock(&cgroup_mutex);
+
+ /* Rebind all subsystems back to the default hierarchy */
+ ret = rebind_subsystems(root, 0);
+ /* Shouldn't be able to fail ... */
+ BUG_ON(ret);
+
+ /*
+ * Release all the links from css_sets to this hierarchy's
+ * root cgroup
+ */
+ write_lock(&css_set_lock);
+
+ list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
+ cgrp_link_list) {
+ list_del(&link->cg_link_list);
+ list_del(&link->cgrp_link_list);
+ kfree(link);
+ }
+ write_unlock(&css_set_lock);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ root_count--;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ kfree(root);
+ kill_litter_super(sb);
+}
+
+static struct file_system_type cgroup_fs_type = {
+ .name = "cgroup",
+ .get_sb = cgroup_get_sb,
+ .kill_sb = cgroup_kill_sb,
+};
+
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+ return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+ return dentry->d_fsdata;
+}
+
+/**
+ * cgroup_path - generate the path of a cgroup
+ * @cgrp: the cgroup in question
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Called with cgroup_mutex held. Writes path of cgroup into buf.
+ * Returns 0 on success, -errno on error.
+ */
+int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+{
+ char *start;
+
+ if (cgrp == dummytop) {
+ /*
+ * Inactive subsystems have no dentry for their root
+ * cgroup
+ */
+ strcpy(buf, "/");
+ return 0;
+ }
+
+ start = buf + buflen;
+
+ *--start = '\0';
+ for (;;) {
+ int len = cgrp->dentry->d_name.len;
+ if ((start -= len) < buf)
+ return -ENAMETOOLONG;
+ memcpy(start, cgrp->dentry->d_name.name, len);
+ cgrp = cgrp->parent;
+ if (!cgrp)
+ break;
+ if (!cgrp->parent)
+ continue;
+ if (--start < buf)
+ return -ENAMETOOLONG;
+ *start = '/';
+ }
+ memmove(buf, start, buf + buflen - start);
+ return 0;
+}
+
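cgroup_path() assembles the path leaf-first: each component is written at the tail of the buffer while walking ->parent upward, and the finished string is then slid to the front with a single memmove(). The same technique on a stand-alone, illustrative node type:

#include <errno.h>
#include <string.h>

struct node {
        const char *name;
        struct node *parent;            /* NULL at the root */
};

static int node_path(const struct node *n, char *buf, int buflen)
{
        char *start = buf + buflen;

        if (buflen < 2)
                return -ENAMETOOLONG;
        if (!n->parent) {               /* the root itself prints as "/" */
                strcpy(buf, "/");
                return 0;
        }

        *--start = '\0';
        for (; n->parent; n = n->parent) {
                int len = strlen(n->name);

                start -= len;
                if (start <= buf)       /* keep room for the leading '/' */
                        return -ENAMETOOLONG;
                memcpy(start, n->name, len);
                *--start = '/';
        }
        memmove(buf, start, buf + buflen - start);
        return 0;
}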
+/*
+ * Return the first subsystem attached to a cgroup's hierarchy, and
+ * its subsystem id.
+ */
+
+static void get_first_subsys(const struct cgroup *cgrp,
+ struct cgroup_subsys_state **css, int *subsys_id)
+{
+ const struct cgroupfs_root *root = cgrp->root;
+ const struct cgroup_subsys *test_ss;
+ BUG_ON(list_empty(&root->subsys_list));
+ test_ss = list_entry(root->subsys_list.next,
+ struct cgroup_subsys, sibling);
+ if (css) {
+ *css = cgrp->subsys[test_ss->subsys_id];
+ BUG_ON(!*css);
+ }
+ if (subsys_id)
+ *subsys_id = test_ss->subsys_id;
+}
+
+/**
+ * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
+ * @cgrp: the cgroup the task is attaching to
+ * @tsk: the task to be attached
+ *
+ * Call holding cgroup_mutex. May take task_lock of
+ * the task 'tsk' during call.
+ */
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+ int retval = 0;
+ struct cgroup_subsys *ss;
+ struct cgroup *oldcgrp;
+ struct css_set *cg = tsk->cgroups;
+ struct css_set *newcg;
+ struct cgroupfs_root *root = cgrp->root;
+ int subsys_id;
+
+ get_first_subsys(cgrp, NULL, &subsys_id);
+
+ /* Nothing to do if the task is already in that cgroup */
+ oldcgrp = task_cgroup(tsk, subsys_id);
+ if (cgrp == oldcgrp)
+ return 0;
+
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, tsk);
+ if (retval)
+ return retval;
+ }
+ }
+
+ /*
+ * Locate or allocate a new css_set for this task,
+ * based on its final set of cgroups
+ */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list)) {
+ list_del(&tsk->cg_list);
+ list_add(&tsk->cg_list, &newcg->tasks);
+ }
+ write_unlock(&css_set_lock);
+
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, tsk);
+ }
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ synchronize_rcu();
+ put_css_set(cg);
+ return 0;
+}
+
+/*
+ * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
+ * held. May take task_lock of task
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ if (pid) {
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (!tsk || tsk->flags & PF_EXITING) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ if ((current->euid) && (current->euid != tsk->uid)
+ && (current->euid != tsk->suid)) {
+ put_task_struct(tsk);
+ return -EACCES;
+ }
+ } else {
+ tsk = current;
+ get_task_struct(tsk);
+ }
+
+ ret = cgroup_attach_task(cgrp, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+ int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+ ret = attach_task_by_pid(cgrp, pid);
+ cgroup_unlock();
+ return ret;
+}
+
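From userspace this handler sits behind the per-cgroup "tasks" control file: writing a pid attaches that task, and writing 0 attaches the writer itself, mirroring attach_task_by_pid() above. A hypothetical helper (the cgroup directory path is an example, not fixed by this code):

#include <stdio.h>
#include <unistd.h>

static int attach_self(const char *cgroup_dir)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/tasks", cgroup_dir);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d\n", (int)getpid());
        return fclose(f);               /* the kernel's verdict surfaces on flush/close */
}

int main(void)
{
        return attach_self("/dev/cgroup/mygroup") ? 1 : 0;
}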
+/* The various types of files and directories in a cgroup file system */
+enum cgroup_filetype {
+ FILE_ROOT,
+ FILE_DIR,
+ FILE_TASKLIST,
+ FILE_NOTIFY_ON_RELEASE,
+ FILE_RELEASE_AGENT,
+};
+
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
+ */
+bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_mutex);
+ if (cgroup_is_removed(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ return false;
+ }
+ return true;
+}
+
+static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+ strcpy(cgrp->root->release_agent_path, buffer);
+ cgroup_unlock();
+ return 0;
+}
+
+static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *seq)
+{
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+ seq_puts(seq, cgrp->root->release_agent_path);
+ seq_putc(seq, '\n');
+ cgroup_unlock();
+ return 0;
+}
+
+/* A buffer size big enough for numbers or short strings */
+#define CGROUP_LOCAL_BUFFER_SIZE 64
+
+static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
+{
+ char buffer[CGROUP_LOCAL_BUFFER_SIZE];
+ int retval = 0;
+ char *end;
+
+ if (!nbytes)
+ return -EINVAL;
+ if (nbytes >= sizeof(buffer))
+ return -E2BIG;
+ if (copy_from_user(buffer, userbuf, nbytes))
+ return -EFAULT;
+
+ buffer[nbytes] = 0; /* nul-terminate */
+ strstrip(buffer);
+ if (cft->write_u64) {
+ u64 val = simple_strtoull(buffer, &end, 0);
+ if (*end)
+ return -EINVAL;
+ retval = cft->write_u64(cgrp, cft, val);
+ } else {
+ s64 val = simple_strtoll(buffer, &end, 0);
+ if (*end)
+ return -EINVAL;
+ retval = cft->write_s64(cgrp, cft, val);
+ }
+ if (!retval)
+ retval = nbytes;
+ return retval;
+}
+
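The parsing convention above accepts a number only when the whole stripped buffer is consumed, rejecting trailing junk via the end pointer. The same convention in a small userspace helper, borrowing the kernel's negative-errno return style:

#include <errno.h>
#include <stdlib.h>

static int parse_u64_strict(const char *s, unsigned long long *out)
{
        char *end;

        errno = 0;
        *out = strtoull(s, &end, 0);
        if (errno || end == s || *end != '\0')
                return -EINVAL;
        return 0;
}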
+static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
+{
+ char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
+ int retval = 0;
+ size_t max_bytes = cft->max_write_len;
+ char *buffer = local_buffer;
+
+ if (!max_bytes)
+ max_bytes = sizeof(local_buffer) - 1;
+ if (nbytes >= max_bytes)
+ return -E2BIG;
+ /* Allocate a dynamic buffer if we need one */
+ if (nbytes >= sizeof(local_buffer)) {
+ buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+ if (buffer == NULL)
+ return -ENOMEM;
+ }
+ if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ buffer[nbytes] = 0; /* nul-terminate */
+ strstrip(buffer);
+ retval = cft->write_string(cgrp, cft, buffer);
+ if (!retval)
+ retval = nbytes;
+out:
+ if (buffer != local_buffer)
+ kfree(buffer);
+ return retval;
+}
+
+static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct cftype *cft = __d_cft(file->f_dentry);
+ struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+ if (!cft || cgroup_is_removed(cgrp))
+ return -ENODEV;
+ if (cft->write)
+ return cft->write(cgrp, cft, file, buf, nbytes, ppos);
+ if (cft->write_u64 || cft->write_s64)
+ return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+ if (cft->write_string)
+ return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
+ if (cft->trigger) {
+ int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+ return ret ? ret : nbytes;
+ }
+ return -EINVAL;
+}
+
+static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes,
+ loff_t *ppos)
+{
+ char tmp[CGROUP_LOCAL_BUFFER_SIZE];
+ u64 val = cft->read_u64(cgrp, cft);
+ int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
+static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes,
+ loff_t *ppos)
+{
+ char tmp[CGROUP_LOCAL_BUFFER_SIZE];
+ s64 val = cft->read_s64(cgrp, cft);
+ int len = sprintf(tmp, "%lld\n", (long long) val);
+
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
+static ssize_t cgroup_file_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct cftype *cft = __d_cft(file->f_dentry);
+ struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+ if (!cft || cgroup_is_removed(cgrp))
+ return -ENODEV;
+
+ if (cft->read)
+ return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+ if (cft->read_u64)
+ return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+ if (cft->read_s64)
+ return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+ return -EINVAL;
+}
+
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+ struct cftype *cft;
+ struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+ struct seq_file *sf = cb->state;
+ return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct cgroup_seqfile_state *state = m->private;
+ struct cftype *cft = state->cft;
+ if (cft->read_map) {
+ struct cgroup_map_cb cb = {
+ .fill = cgroup_map_add,
+ .state = m,
+ };
+ return cft->read_map(state->cgroup, cft, &cb);
+ }
+ return cft->read_seq_string(state->cgroup, cft, m);
+}
+
+static int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ kfree(seq->private);
+ return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations = {
+ .read = seq_read,
+ .write = cgroup_file_write,
+ .llseek = seq_lseek,
+ .release = cgroup_seqfile_release,
+};
+
+static int cgroup_file_open(struct inode *inode, struct file *file)
+{
+ int err;
+ struct cftype *cft;
+
+ err = generic_file_open(inode, file);
+ if (err)
+ return err;
+
+ cft = __d_cft(file->f_dentry);
+ if (!cft)
+ return -ENODEV;
+ if (cft->read_map || cft->read_seq_string) {
+ struct cgroup_seqfile_state *state =
+ kzalloc(sizeof(*state), GFP_USER);
+ if (!state)
+ return -ENOMEM;
+ state->cft = cft;
+ state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+ file->f_op = &cgroup_seqfile_operations;
+ err = single_open(file, cgroup_seqfile_show, state);
+ if (err < 0)
+ kfree(state);
+ } else if (cft->open)
+ err = cft->open(inode, file);
+ else
+ err = 0;
+
+ return err;
+}
+
+static int cgroup_file_release(struct inode *inode, struct file *file)
+{
+ struct cftype *cft = __d_cft(file->f_dentry);
+ if (cft->release)
+ return cft->release(inode, file);
+ return 0;
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ return -ENOTDIR;
+ if (new_dentry->d_inode)
+ return -EEXIST;
+ if (old_dir != new_dir)
+ return -EIO;
+ return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static struct file_operations cgroup_file_operations = {
+ .read = cgroup_file_read,
+ .write = cgroup_file_write,
+ .llseek = generic_file_llseek,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
+};
+
+static struct inode_operations cgroup_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
+static int cgroup_create_file(struct dentry *dentry, int mode,
+ struct super_block *sb)
+{
+ static struct dentry_operations cgroup_dops = {
+ .d_iput = cgroup_diput,
+ };
+
+ struct inode *inode;
+
+ if (!dentry)
+ return -ENOENT;
+ if (dentry->d_inode)
+ return -EEXIST;
+
+ inode = cgroup_new_inode(mode, sb);
+ if (!inode)
+ return -ENOMEM;
+
+ if (S_ISDIR(mode)) {
+ inode->i_op = &cgroup_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+
+ /* start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+
+ /* start with the directory inode held, so that we can
+ * populate it without racing with another mkdir */
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ } else if (S_ISREG(mode)) {
+ inode->i_size = 0;
+ inode->i_fop = &cgroup_file_operations;
+ }
+ dentry->d_op = &cgroup_dops;
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ return 0;
+}
+
+/*
+ * cgroup_create_dir - create a directory for an object.
+ * @cgrp: the cgroup we create the directory for. It must have a valid
+ * ->parent field. And we are going to fill its ->dentry field.
+ * @dentry: dentry of the new cgroup
+ * @mode: mode to set on new directory.
+ */
+static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
+ int mode)
+{
+ struct dentry *parent;
+ int error = 0;
+
+ parent = cgrp->parent->dentry;
+ error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
+ if (!error) {
+ dentry->d_fsdata = cgrp;
+ inc_nlink(parent->d_inode);
+ cgrp->dentry = dentry;
+ dget(dentry);
+ }
+ dput(dentry);
+
+ return error;
+}
+
+int cgroup_add_file(struct cgroup *cgrp,
+ struct cgroup_subsys *subsys,
+ const struct cftype *cft)
+{
+ struct dentry *dir = cgrp->dentry;
+ struct dentry *dentry;
+ int error;
+
+ char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+ if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+ strcpy(name, subsys->name);
+ strcat(name, ".");
+ }
+ strcat(name, cft->name);
+ BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+ dentry = lookup_one_len(name, dir, strlen(name));
+ if (!IS_ERR(dentry)) {
+ error = cgroup_create_file(dentry, 0644 | S_IFREG,
+ cgrp->root->sb);
+ if (!error)
+ dentry->d_fsdata = (void *)cft;
+ dput(dentry);
+ } else
+ error = PTR_ERR(dentry);
+ return error;
+}
+
+int cgroup_add_files(struct cgroup *cgrp,
+ struct cgroup_subsys *subsys,
+ const struct cftype cft[],
+ int count)
+{
+ int i, err;
+ for (i = 0; i < count; i++) {
+ err = cgroup_add_file(cgrp, subsys, &cft[i]);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ */
+int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cg_cgroup_link *link;
+
+ read_lock(&css_set_lock);
+ list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
+ count += atomic_read(&link->cg->refcount);
+ }
+ read_unlock(&css_set_lock);
+ return count;
+}
+
+/*
+ * Advance a list_head iterator. The iterator should be positioned at
+ * the start of a css_set
+ */
+static void cgroup_advance_iter(struct cgroup *cgrp,
+ struct cgroup_iter *it)
+{
+ struct list_head *l = it->cg_link;
+ struct cg_cgroup_link *link;
+ struct css_set *cg;
+
+ /* Advance to the next non-empty css_set */
+ do {
+ l = l->next;
+ if (l == &cgrp->css_sets) {
+ it->cg_link = NULL;
+ return;
+ }
+ link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
+ cg = link->cg;
+ } while (list_empty(&cg->tasks));
+ it->cg_link = l;
+ it->task = cg->tasks.next;
+}
+
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+static void cgroup_enable_task_cg_lists(void)
+{
+ struct task_struct *p, *g;
+ write_lock(&css_set_lock);
+ use_task_css_set_links = 1;
+ do_each_thread(g, p) {
+ task_lock(p);
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ */
+ if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
+ list_add(&p->cg_list, &p->cgroups->tasks);
+ task_unlock(p);
+ } while_each_thread(g, p);
+ write_unlock(&css_set_lock);
+}
+
+void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+{
+ /*
+ * The first time anyone tries to iterate across a cgroup,
+ * we need to enable the list linking each css_set to its
+ * tasks, and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ read_lock(&css_set_lock);
+ it->cg_link = &cgrp->css_sets;
+ cgroup_advance_iter(cgrp, it);
+}
+
+struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
+ struct cgroup_iter *it)
+{
+ struct task_struct *res;
+ struct list_head *l = it->task;
+
+ /* If the iterator's cg_link is NULL, we have no tasks */
+ if (!it->cg_link)
+ return NULL;
+ res = list_entry(l, struct task_struct, cg_list);
+ /* Advance iterator to find next entry */
+ l = l->next;
+ if (l == &res->cgroups->tasks) {
+ /* We reached the end of this task list - move on to
+ * the next cg_cgroup_link */
+ cgroup_advance_iter(cgrp, it);
+ } else {
+ it->task = l;
+ }
+ return res;
+}
+
+void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+{
+ read_unlock(&css_set_lock);
+}
+
+static inline int started_after_time(struct task_struct *t1,
+ struct timespec *time,
+ struct task_struct *t2)
+{
+ int start_diff = timespec_compare(&t1->start_time, time);
+ if (start_diff > 0) {
+ return 1;
+ } else if (start_diff < 0) {
+ return 0;
+ } else {
+ /*
+ * Arbitrarily, if two processes started at the same
+ * time, we'll say that the lower pointer value
+ * started first. Note that t2 may have exited by now
+ * so this may not be a valid pointer any longer, but
+ * that's fine - it still serves to distinguish
+ * between two tasks started (effectively) simultaneously.
+ */
+ return t1 > t2;
+ }
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+ struct task_struct *t1 = p1;
+ struct task_struct *t2 = p2;
+ return started_after_time(t1, &t2->start_time, t2);
+}
+
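started_after_time() turns "same start time" into a deterministic ordering by falling back to the pointer values, so the comparator above defines a strict total order even for tasks forked in the same tick. The same idea on a simplified item type:

#include <stdbool.h>
#include <stdint.h>

struct item {
        long start_sec;
        long start_nsec;
};

static bool newer_sketch(const struct item *a, const struct item *b)
{
        if (a->start_sec != b->start_sec)
                return a->start_sec > b->start_sec;
        if (a->start_nsec != b->start_nsec)
                return a->start_nsec > b->start_nsec;
        return (uintptr_t)a > (uintptr_t)b;     /* arbitrary but stable tie-break */
}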
+/**
+ * cgroup_scan_tasks - iterate through all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+ int retval, i;
+ struct cgroup_iter it;
+ struct task_struct *p, *dropped;
+ /* Never dereference latest_task, since it's not refcounted */
+ struct task_struct *latest_task = NULL;
+ struct ptr_heap tmp_heap;
+ struct ptr_heap *heap;
+ struct timespec latest_time = { 0, 0 };
+
+ if (scan->heap) {
+ /* The caller supplied our heap and pre-allocated its memory */
+ heap = scan->heap;
+ heap->gt = &started_after;
+ } else {
+ /* We need to allocate our own heap memory */
+ heap = &tmp_heap;
+ retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+ if (retval)
+ /* cannot allocate the heap */
+ return retval;
+ }
+
+ again:
+ /*
+ * Scan tasks in the cgroup, using the scanner's "test_task" callback
+ * to determine which are of interest, and using the scanner's
+ * "process_task" callback to process any of them that need an update.
+ * Since we don't want to hold any locks during the task updates,
+ * gather tasks to be processed in a heap structure.
+ * The heap is sorted by descending task start time.
+ * If the statically-sized heap fills up, we overflow tasks that
+ * started later, and in future iterations only consider tasks that
+ * started after the latest task in the previous pass. This
+ * guarantees forward progress and that we don't miss any tasks.
+ */
+ heap->size = 0;
+ cgroup_iter_start(scan->cg, &it);
+ while ((p = cgroup_iter_next(scan->cg, &it))) {
+ /*
+ * Only affect tasks that qualify per the caller's callback,
+ * if one was provided
+ */
+ if (scan->test_task && !scan->test_task(p, scan))
+ continue;
+ /*
+ * Only process tasks that started after the last task
+ * we processed
+ */
+ if (!started_after_time(p, &latest_time, latest_task))
+ continue;
+ dropped = heap_insert(heap, p);
+ if (dropped == NULL) {
+ /*
+ * The new task was inserted; the heap wasn't
+ * previously full
+ */
+ get_task_struct(p);
+ } else if (dropped != p) {
+ /*
+ * The new task was inserted, and pushed out a
+ * different task
+ */
+ get_task_struct(p);
+ put_task_struct(dropped);
+ }
+ /*
+ * Else the new task was newer than anything already in
+ * the heap and wasn't inserted
+ */
+ }
+ cgroup_iter_end(scan->cg, &it);
+
+ if (heap->size) {
+ for (i = 0; i < heap->size; i++) {
+ struct task_struct *q = heap->ptrs[i];
+ if (i == 0) {
+ latest_time = q->start_time;
+ latest_task = q;
+ }
+ /* Process the task per the caller's callback */
+ scan->process_task(q, scan);
+ put_task_struct(q);
+ }
+ /*
+ * If we had to process any tasks at all, scan again
+ * in case some of them were in the middle of forking
+ * children that didn't get processed.
+ * Not the most efficient way to do it, but it avoids
+ * having to take callback_mutex in the fork path
+ */
+ goto again;
+ }
+ if (heap == &tmp_heap)
+ heap_free(&tmp_heap);
+ return 0;
+}
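
A minimal sketch of how a subsystem might drive cgroup_scan_tasks(), assuming the struct cgroup_scanner fields referenced in the function above (cg, test_task, process_task, heap); the callback and function names here are hypothetical, not part of the patch:

    /* Illustrative sketch only; names are hypothetical. */
    #include <linux/cgroup.h>
    #include <linux/sched.h>

    static int example_test(struct task_struct *p, struct cgroup_scanner *scan)
    {
    	/* select only tasks that are not already exiting */
    	return !(p->flags & PF_EXITING);
    }

    static void example_process(struct task_struct *p, struct cgroup_scanner *scan)
    {
    	/* act on the selected task; css_set_lock is not held here */
    }

    static int example_scan(struct cgroup *cgrp)
    {
    	struct cgroup_scanner scan = {
    		.cg		= cgrp,
    		.test_task	= example_test,
    		.process_task	= example_process,
    		.heap		= NULL,	/* let cgroup_scan_tasks() allocate one */
    	};

    	return cgroup_scan_tasks(&scan);
    }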
+
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/*
+ * Load into 'pidarray' up to 'npids' of the tasks using cgroup
+ * 'cgrp'. Return actual number of pids loaded. No need to
+ * task_lock(p) when reading out p->cgroup, since we're in an RCU
+ * read section, so the css_set can't go away, and is
+ * immutable after creation.
+ */
+static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+{
+ int n = 0;
+ struct cgroup_iter it;
+ struct task_struct *tsk;
+ cgroup_iter_start(cgrp, &it);
+ while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ if (unlikely(n == npids))
+ break;
+ pidarray[n++] = task_pid_vnr(tsk);
+ }
+ cgroup_iter_end(cgrp, &it);
+ return n;
+}
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ int ret = -EINVAL;
+ struct cgroup *cgrp;
+ struct cgroup_iter it;
+ struct task_struct *tsk;
+
+ /*
+ * Validate dentry by checking the superblock operations,
+ * and make sure it's a directory.
+ */
+ if (dentry->d_sb->s_op != &cgroup_ops ||
+ !S_ISDIR(dentry->d_inode->i_mode))
+ goto err;
+
+ ret = 0;
+ cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+
+ cgroup_iter_start(cgrp, &it);
+ while ((tsk = cgroup_iter_next(cgrp, &it))) {
+ switch (tsk->state) {
+ case TASK_RUNNING:
+ stats->nr_running++;
+ break;
+ case TASK_INTERRUPTIBLE:
+ stats->nr_sleeping++;
+ break;
+ case TASK_UNINTERRUPTIBLE:
+ stats->nr_uninterruptible++;
+ break;
+ case TASK_STOPPED:
+ stats->nr_stopped++;
+ break;
+ default:
+ if (delayacct_is_task_waiting_on_io(tsk))
+ stats->nr_io_wait++;
+ break;
+ }
+ }
+ cgroup_iter_end(cgrp, &it);
+
+ rcu_read_unlock();
+err:
+ return ret;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+
+/*
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
+ */
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Initially we receive a position value that corresponds to
+ * one more than the last pid shown (or 0 on the first call or
+ * after a seek to the start). Use a binary-search to find the
+ * next pid to display, if any
+ */
+ struct cgroup *cgrp = s->private;
+ int index = 0, pid = *pos;
+ int *iter;
+
+ down_read(&cgrp->pids_mutex);
+ if (pid) {
+ int end = cgrp->pids_length;
+
+ while (index < end) {
+ int mid = (index + end) / 2;
+ if (cgrp->tasks_pids[mid] == pid) {
+ index = mid;
+ break;
+ } else if (cgrp->tasks_pids[mid] <= pid)
+ index = mid + 1;
+ else
+ end = mid;
+ }
+ }
+ /* If we're off the end of the array, we're done */
+ if (index >= cgrp->pids_length)
+ return NULL;
+ /* Update the abstract position to be the actual pid that we found */
+ iter = cgrp->tasks_pids + index;
+ *pos = *iter;
+ return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+ struct cgroup *cgrp = s->private;
+ up_read(&cgrp->pids_mutex);
+}
+
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct cgroup *cgrp = s->private;
+ int *p = v;
+ int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+ /*
+ * Advance to the next pid in the array. If this goes off the
+ * end, we're done
+ */
+ p++;
+ if (p >= end) {
+ return NULL;
+ } else {
+ *pos = *p;
+ return p;
+ }
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+ return seq_printf(s, "%d\n", *(int *)v);
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+ .start = cgroup_tasks_start,
+ .stop = cgroup_tasks_stop,
+ .next = cgroup_tasks_next,
+ .show = cgroup_tasks_show,
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+ down_write(&cgrp->pids_mutex);
+ BUG_ON(!cgrp->pids_use_count);
+ if (!--cgrp->pids_use_count) {
+ kfree(cgrp->tasks_pids);
+ cgrp->tasks_pids = NULL;
+ cgrp->pids_length = 0;
+ }
+ up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+ struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+ if (!(file->f_mode & FMODE_READ))
+ return 0;
+
+ release_cgroup_pid_array(cgrp);
+ return seq_release(inode, file);
+}
+
+static struct file_operations cgroup_tasks_operations = {
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = cgroup_file_write,
+ .release = cgroup_tasks_release,
+};
+
+/*
+ * Handle an open on 'tasks' file. Prepare an array containing the
+ * process id's of tasks currently attached to the cgroup being opened.
+ */
+
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+ struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ pid_t *pidarray;
+ int npids;
+ int retval;
+
+ /* Nothing to do for write-only files */
+ if (!(file->f_mode & FMODE_READ))
+ return 0;
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ npids = cgroup_task_count(cgrp);
+ pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+ if (!pidarray)
+ return -ENOMEM;
+ npids = pid_array_load(pidarray, npids, cgrp);
+ sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
+
+ /*
+ * Store the array in the cgroup, freeing the old
+ * array if necessary
+ */
+ down_write(&cgrp->pids_mutex);
+ kfree(cgrp->tasks_pids);
+ cgrp->tasks_pids = pidarray;
+ cgrp->pids_length = npids;
+ cgrp->pids_use_count++;
+ up_write(&cgrp->pids_mutex);
+
+ file->f_op = &cgroup_tasks_operations;
+
+ retval = seq_open(file, &cgroup_tasks_seq_operations);
+ if (retval) {
+ release_cgroup_pid_array(cgrp);
+ return retval;
+ }
+ ((struct seq_file *)file->private_data)->private = cgrp;
+ return 0;
+}
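
For context, consuming this file from user space is an ordinary sequential read of newline-separated pids; a minimal sketch, assuming the hierarchy is mounted at the hypothetical path /dev/cgroup and a child group named mygroup exists:

    /* Illustrative user-space sketch; the mount path is an assumption. */
    #include <stdio.h>

    int list_cgroup_tasks(void)
    {
    	char line[32];
    	FILE *f = fopen("/dev/cgroup/mygroup/tasks", "r");

    	if (!f)
    		return -1;
    	while (fgets(line, sizeof(line), f))
    		printf("pid: %s", line);	/* one pid per line */
    	fclose(f);
    	return 0;
    }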
+
+static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return notify_on_release(cgrp);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup *cgrp,
+ struct cftype *cft,
+ u64 val)
+{
+ clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ return 0;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+static struct cftype files[] = {
+ {
+ .name = "tasks",
+ .open = cgroup_tasks_open,
+ .write_u64 = cgroup_tasks_write,
+ .release = cgroup_tasks_release,
+ .private = FILE_TASKLIST,
+ },
+
+ {
+ .name = "notify_on_release",
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ .private = FILE_NOTIFY_ON_RELEASE,
+ },
+};
+
+static struct cftype cft_release_agent = {
+ .name = "release_agent",
+ .read_seq_string = cgroup_release_agent_show,
+ .write_string = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX,
+ .private = FILE_RELEASE_AGENT,
+};
+
+static int cgroup_populate_dir(struct cgroup *cgrp)
+{
+ int err;
+ struct cgroup_subsys *ss;
+
+ /* First clear out any existing files */
+ cgroup_clear_directory(cgrp->dentry);
+
+ err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+ if (err < 0)
+ return err;
+
+ if (cgrp == cgrp->top_cgroup) {
+ if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
+ return err;
+ }
+
+ for_each_subsys(cgrp->root, ss) {
+ if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static void init_cgroup_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ css->cgroup = cgrp;
+ atomic_set(&css->refcnt, 0);
+ css->flags = 0;
+ if (cgrp == dummytop)
+ set_bit(CSS_ROOT, &css->flags);
+ BUG_ON(cgrp->subsys[ss->subsys_id]);
+ cgrp->subsys[ss->subsys_id] = css;
+}
+
+/*
+ * cgroup_create - create a cgroup
+ * @parent: cgroup that will be parent of the new cgroup
+ * @dentry: dentry of the new cgroup
+ * @mode: mode to set on new inode
+ *
+ * Must be called with the mutex on the parent inode held
+ */
+static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ int mode)
+{
+ struct cgroup *cgrp;
+ struct cgroupfs_root *root = parent->root;
+ int err = 0;
+ struct cgroup_subsys *ss;
+ struct super_block *sb = root->sb;
+
+ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+ if (!cgrp)
+ return -ENOMEM;
+
+ /* Grab a reference on the superblock so the hierarchy doesn't
+ * get deleted on unmount if there are child cgroups. This
+ * can be done outside cgroup_mutex, since the sb can't
+ * disappear while someone has an open control file on the
+ * fs */
+ atomic_inc(&sb->s_active);
+
+ mutex_lock(&cgroup_mutex);
+
+ init_cgroup_housekeeping(cgrp);
+
+ cgrp->parent = parent;
+ cgrp->root = parent->root;
+ cgrp->top_cgroup = parent->top_cgroup;
+
+ if (notify_on_release(parent))
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+
+ for_each_subsys(root, ss) {
+ struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+ if (IS_ERR(css)) {
+ err = PTR_ERR(css);
+ goto err_destroy;
+ }
+ init_cgroup_css(css, ss, cgrp);
+ }
+
+ list_add(&cgrp->sibling, &cgrp->parent->children);
+ root->number_of_cgroups++;
+
+ err = cgroup_create_dir(cgrp, dentry, mode);
+ if (err < 0)
+ goto err_remove;
+
+ /* The cgroup directory was pre-locked for us */
+ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
+
+ err = cgroup_populate_dir(cgrp);
+ /* If err < 0, we have a half-filled directory - oh well ;) */
+
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+
+ return 0;
+
+ err_remove:
+
+ list_del(&cgrp->sibling);
+ root->number_of_cgroups--;
+
+ err_destroy:
+
+ for_each_subsys(root, ss) {
+ if (cgrp->subsys[ss->subsys_id])
+ ss->destroy(ss, cgrp);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+
+ /* Release the reference count that we took on the superblock */
+ deactivate_super(sb);
+
+ kfree(cgrp);
+ return err;
+}
+
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+
+ /* the vfs holds inode->i_mutex already */
+ return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+}
+
+static int cgroup_has_css_refs(struct cgroup *cgrp)
+{
+ /* Check the reference count on each subsystem. Since we
+ * already established that there are no tasks in the
+ * cgroup, if the css refcount is also 0, then there should
+ * be no outstanding references, so the subsystem is safe to
+ * destroy. We scan across all subsystems rather than using
+ * the per-hierarchy linked list of mounted subsystems since
+ * we can be called via check_for_release() with no
+ * synchronization other than RCU, and the subsystem linked
+ * list isn't RCU-safe */
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ struct cgroup_subsys_state *css;
+ /* Skip subsystems not in this hierarchy */
+ if (ss->root != cgrp->root)
+ continue;
+ css = cgrp->subsys[ss->subsys_id];
+ /* When called from check_for_release() it's possible
+ * that by this point the cgroup has been removed
+ * and the css deleted. But a false-positive doesn't
+ * matter, since it can only happen if the cgroup
+ * has been deleted and hence no longer needs the
+ * release agent to be called anyway. */
+ if (css && atomic_read(&css->refcnt))
+ return 1;
+ }
+ return 0;
+}
+
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+ struct cgroup *cgrp = dentry->d_fsdata;
+ struct dentry *d;
+ struct cgroup *parent;
+ struct super_block *sb;
+ struct cgroupfs_root *root;
+
+ /* the vfs holds both inode->i_mutex already */
+
+ mutex_lock(&cgroup_mutex);
+ if (atomic_read(&cgrp->count) != 0) {
+ mutex_unlock(&cgroup_mutex);
+ return -EBUSY;
+ }
+ if (!list_empty(&cgrp->children)) {
+ mutex_unlock(&cgroup_mutex);
+ return -EBUSY;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ /*
+ * Call pre_destroy handlers of subsys. Notify subsystems
+ * that rmdir() request comes.
+ */
+ cgroup_call_pre_destroy(cgrp);
+
+ mutex_lock(&cgroup_mutex);
+ parent = cgrp->parent;
+ root = cgrp->root;
+ sb = root->sb;
+
+ if (atomic_read(&cgrp->count)
+ || !list_empty(&cgrp->children)
+ || cgroup_has_css_refs(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ return -EBUSY;
+ }
+
+ spin_lock(&release_list_lock);
+ set_bit(CGRP_REMOVED, &cgrp->flags);
+ if (!list_empty(&cgrp->release_list))
+ list_del(&cgrp->release_list);
+ spin_unlock(&release_list_lock);
+ /* delete my sibling from parent->children */
+ list_del(&cgrp->sibling);
+ spin_lock(&cgrp->dentry->d_lock);
+ d = dget(cgrp->dentry);
+ spin_unlock(&d->d_lock);
+
+ cgroup_d_remove_dir(d);
+ dput(d);
+
+ set_bit(CGRP_RELEASABLE, &parent->flags);
+ check_for_release(parent);
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+
+ /* Create the top cgroup state for this subsystem */
+ ss->root = &rootnode;
+ css = ss->create(ss, dummytop);
+ /* We don't handle early failures gracefully */
+ BUG_ON(IS_ERR(css));
+ init_cgroup_css(css, ss, dummytop);
+
+ /* Update the init_css_set to contain a subsys
+ * pointer to this state - since the subsystem is
+ * newly registered, all tasks and hence the
+ * init_css_set is in the subsystem's top cgroup. */
+ init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+
+ need_forkexit_callback |= ss->fork || ss->exit;
+ need_mm_owner_callback |= !!ss->mm_owner_changed;
+
+ /* At system boot, before all subsystems have been
+ * registered, no tasks have been forked, so we don't
+ * need to invoke fork callbacks here. */
+ BUG_ON(!list_empty(&init_task.tasks));
+
+ ss->active = 1;
+}
+
+/**
+ * cgroup_init_early - cgroup initialization at system boot
+ *
+ * Initialize cgroups at system boot, and initialize any
+ * subsystems that request early init.
+ */
+int __init cgroup_init_early(void)
+{
+ int i;
+ atomic_set(&init_css_set.refcount, 1);
+ INIT_LIST_HEAD(&init_css_set.cg_links);
+ INIT_LIST_HEAD(&init_css_set.tasks);
+ INIT_HLIST_NODE(&init_css_set.hlist);
+ css_set_count = 1;
+ init_cgroup_root(&rootnode);
+ list_add(&rootnode.root_list, &roots);
+ root_count = 1;
+ init_task.cgroups = &init_css_set;
+
+ init_css_set_link.cg = &init_css_set;
+ list_add(&init_css_set_link.cgrp_link_list,
+ &rootnode.top_cgroup.css_sets);
+ list_add(&init_css_set_link.cg_link_list,
+ &init_css_set.cg_links);
+
+ for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&css_set_table[i]);
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
+ BUG_ON(!ss->name);
+ BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
+ BUG_ON(!ss->create);
+ BUG_ON(!ss->destroy);
+ if (ss->subsys_id != i) {
+ printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
+ ss->name, ss->subsys_id);
+ BUG();
+ }
+
+ if (ss->early_init)
+ cgroup_init_subsys(ss);
+ }
+ return 0;
+}
+
+/**
+ * cgroup_init - cgroup initialization
+ *
+ * Register cgroup filesystem and /proc file, and initialize
+ * any subsystems that didn't request early init.
+ */
+int __init cgroup_init(void)
+{
+ int err;
+ int i;
+ struct hlist_head *hhead;
+
+ err = bdi_init(&cgroup_backing_dev_info);
+ if (err)
+ return err;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (!ss->early_init)
+ cgroup_init_subsys(ss);
+ }
+
+ /* Add init_css_set to the hash table */
+ hhead = css_set_hash(init_css_set.subsys);
+ hlist_add_head(&init_css_set.hlist, hhead);
+
+ err = register_filesystem(&cgroup_fs_type);
+ if (err < 0)
+ goto out;
+
+ proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+
+out:
+ if (err)
+ bdi_destroy(&cgroup_backing_dev_info);
+
+ return err;
+}
+
+/*
+ * proc_cgroup_show()
+ * - Print task's cgroup paths into seq_file, one line for each hierarchy
+ * - Used for /proc/<pid>/cgroup.
+ * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
+ * doesn't really matter if tsk->cgroup changes after we read it,
+ * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
+ * anyway. No need to check that tsk->cgroup != NULL, thanks to
+ * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
+ * cgroup to top_cgroup.
+ */
+
+/* TODO: Use a proper seq_file iterator */
+static int proc_cgroup_show(struct seq_file *m, void *v)
+{
+ struct pid *pid;
+ struct task_struct *tsk;
+ char *buf;
+ int retval;
+ struct cgroupfs_root *root;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ retval = -ESRCH;
+ pid = m->private;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ if (!tsk)
+ goto out_free;
+
+ retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+
+ for_each_root(root) {
+ struct cgroup_subsys *ss;
+ struct cgroup *cgrp;
+ int subsys_id;
+ int count = 0;
+
+ /* Skip this hierarchy if it has no active subsystems */
+ if (!root->actual_subsys_bits)
+ continue;
+ seq_printf(m, "%lu:", root->subsys_bits);
+ for_each_subsys(root, ss)
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ seq_putc(m, ':');
+ get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
+ cgrp = task_cgroup(tsk, subsys_id);
+ retval = cgroup_path(cgrp, buf, PAGE_SIZE);
+ if (retval < 0)
+ goto out_unlock;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ }
+
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ put_task_struct(tsk);
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
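
Given the format built above (the root's subsys_bits value, a comma-separated list of subsystem names, then the task's cgroup path), a /proc/<pid>/cgroup read looks roughly like the following; the specific numbers and paths are illustrative only:

    8:freezer:/mygroup
    2:debug:/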
+
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cgroup_show, pid);
+}
+
+struct file_operations proc_cgroup_operations = {
+ .open = cgroup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ mutex_lock(&cgroup_mutex);
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ seq_printf(m, "%s\t%lu\t%d\t%d\n",
+ ss->name, ss->root->subsys_bits,
+ ss->root->number_of_cgroups, !ss->disabled);
+ }
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
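
With the header and per-subsystem format above, /proc/cgroups reads roughly as follows (tab-separated; the "hierarchy" column is the root's subsys_bits value, and the numbers shown are illustrative):

    #subsys_name	hierarchy	num_cgroups	enabled
    debug	2	1	1
    freezer	8	3	1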
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+static struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * cgroup_fork - attach a newly forked task to its parent's cgroup.
+ * @child: pointer to task_struct of the newly forked child process.
+ *
+ * Description: A task inherits its parent's cgroup at fork().
+ *
+ * A pointer to the shared css_set was automatically copied in
+ * fork.c by dup_task_struct(). However, we ignore that copy, since
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
+ * have already changed current->cgroups, allowing the previously
+ * referenced cgroup group to be removed and freed.
+ *
+ * At the point that cgroup_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+ task_lock(current);
+ child->cgroups = current->cgroups;
+ get_css_set(child->cgroups);
+ task_unlock(current);
+ INIT_LIST_HEAD(&child->cg_list);
+}
+
+/**
+ * cgroup_fork_callbacks - run fork callbacks
+ * @child: the new task
+ *
+ * Called on a new task shortly before adding it to the
+ * tasklist. No need to take any locks since no-one can
+ * be operating on this task.
+ */
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+ if (need_forkexit_callback) {
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->fork)
+ ss->fork(ss, child);
+ }
+ }
+}
+
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @p: the new owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not
+ * invoke this routine, since it assigns the mm->owner the first time
+ * and does not change it.
+ *
+ * The callbacks are invoked with mmap_sem held in read mode.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+ struct cgroup *oldcgrp, *newcgrp = NULL;
+
+ if (need_mm_owner_callback) {
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ oldcgrp = task_cgroup(old, ss->subsys_id);
+ if (new)
+ newcgrp = task_cgroup(new, ss->subsys_id);
+ if (oldcgrp == newcgrp)
+ continue;
+ if (ss->mm_owner_changed)
+ ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
+ }
+ }
+}
+#endif /* CONFIG_MM_OWNER */
+
+/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+ * Adds the task to the list running through its css_set if necessary.
+ * Has to be after the task is visible on the task list in case we race
+ * with the first call to cgroup_iter_start() - to guarantee that the
+ * new task ends up on its list.
+ */
+void cgroup_post_fork(struct task_struct *child)
+{
+ if (use_task_css_set_links) {
+ write_lock(&css_set_lock);
+ if (list_empty(&child->cg_list))
+ list_add(&child->cg_list, &child->cgroups->tasks);
+ write_unlock(&css_set_lock);
+ }
+}
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ * @run_callbacks: run exit callbacks?
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * the_top_cgroup_hack:
+ *
+ * Set the exiting task's cgroup to the root cgroup (top_cgroup).
+ *
+ * We call cgroup_exit() while the task is still competent to
+ * handle notify_on_release(), then leave the task attached to the
+ * root cgroup in each hierarchy for the remainder of its exit.
+ *
+ * To do this properly, we would increment the reference count on
+ * top_cgroup, and near the very end of the kernel/exit.c do_exit()
+ * code we would add a second cgroup function call, to drop that
+ * reference. This would just create an unnecessary hot spot on
+ * the top_cgroup reference count, to no avail.
+ *
+ * Normally, holding a reference to a cgroup without bumping its
+ * count is unsafe. The cgroup could go away, or someone could
+ * attach us to a different cgroup, decrementing the count on
+ * the first cgroup that we never incremented. But in this case,
+ * top_cgroup isn't going away, and either task has PF_EXITING set,
+ * which wards off any cgroup_attach_task() attempts, or task is a failed
+ * fork, never visible to cgroup_attach_task.
+ */
+void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+{
+ int i;
+ struct css_set *cg;
+
+ if (run_callbacks && need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit)
+ ss->exit(ss, tsk);
+ }
+ }
+
+ /*
+ * Unlink from the css_set task list if necessary.
+ * Optimistically check cg_list before taking
+ * css_set_lock
+ */
+ if (!list_empty(&tsk->cg_list)) {
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_del(&tsk->cg_list);
+ write_unlock(&css_set_lock);
+ }
+
+ /* Reassign the task to the init_css_set. */
+ task_lock(tsk);
+ cg = tsk->cgroups;
+ tsk->cgroups = &init_css_set;
+ task_unlock(tsk);
+ if (cg)
+ put_css_set_taskexit(cg);
+}
+
+/**
+ * cgroup_clone - clone the cgroup the given subsystem is attached to
+ * @tsk: the task to be moved
+ * @subsys: the given subsystem
+ * @nodename: the name for the new cgroup
+ *
+ * Duplicate the current cgroup in the hierarchy that the given
+ * subsystem is attached to, and move this task into the new
+ * child.
+ */
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
+ char *nodename)
+{
+ struct dentry *dentry;
+ int ret = 0;
+ struct cgroup *parent, *child;
+ struct inode *inode;
+ struct css_set *cg;
+ struct cgroupfs_root *root;
+ struct cgroup_subsys *ss;
+
+ /* We shouldn't be called by an unregistered subsystem */
+ BUG_ON(!subsys->active);
+
+ /* First figure out what hierarchy and cgroup we're dealing
+ * with, and pin them so we can drop cgroup_mutex */
+ mutex_lock(&cgroup_mutex);
+ again:
+ root = subsys->root;
+ if (root == &rootnode) {
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+ }
+ cg = tsk->cgroups;
+ parent = task_cgroup(tsk, subsys->subsys_id);
+
+ /* Pin the hierarchy */
+ if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+ /* We race with the final deactivate_super() */
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+ }
+
+ /* Keep the cgroup alive */
+ get_css_set(cg);
+ mutex_unlock(&cgroup_mutex);
+
+ /* Now do the VFS work to create a cgroup */
+ inode = parent->dentry->d_inode;
+
+ /* Hold the parent directory mutex across this operation to
+ * stop anyone else deleting the new cgroup */
+ mutex_lock(&inode->i_mutex);
+ dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
+ if (IS_ERR(dentry)) {
+ printk(KERN_INFO
+ "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
+ PTR_ERR(dentry));
+ ret = PTR_ERR(dentry);
+ goto out_release;
+ }
+
+ /* Create the cgroup directory, which also creates the cgroup */
+ ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+ child = __d_cgrp(dentry);
+ dput(dentry);
+ if (ret) {
+ printk(KERN_INFO
+ "Failed to create cgroup %s: %d\n", nodename,
+ ret);
+ goto out_release;
+ }
+
+ if (!child) {
+ printk(KERN_INFO
+ "Couldn't find new cgroup %s\n", nodename);
+ ret = -ENOMEM;
+ goto out_release;
+ }
+
+ /* The cgroup now exists. Retake cgroup_mutex and check
+ * that we're still in the same state that we thought we
+ * were. */
+ mutex_lock(&cgroup_mutex);
+ if ((root != subsys->root) ||
+ (parent != task_cgroup(tsk, subsys->subsys_id))) {
+ /* Aargh, we raced ... */
+ mutex_unlock(&inode->i_mutex);
+ put_css_set(cg);
+
+ deactivate_super(parent->root->sb);
+ /* The cgroup is still accessible in the VFS, but
+ * we're not going to try to rmdir() it at this
+ * point. */
+ printk(KERN_INFO
+ "Race in cgroup_clone() - leaking cgroup %s\n",
+ nodename);
+ goto again;
+ }
+
+ /* do any required auto-setup */
+ for_each_subsys(root, ss) {
+ if (ss->post_clone)
+ ss->post_clone(ss, child);
+ }
+
+ /* All seems fine. Finish by moving the task into the new cgroup */
+ ret = cgroup_attach_task(child, tsk);
+ mutex_unlock(&cgroup_mutex);
+
+ out_release:
+ mutex_unlock(&inode->i_mutex);
+
+ mutex_lock(&cgroup_mutex);
+ put_css_set(cg);
+ mutex_unlock(&cgroup_mutex);
+ deactivate_super(parent->root->sb);
+ return ret;
+}
+
+/**
+ * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * @cgrp: the cgroup in question
+ *
+ * See if @cgrp is a descendant of the current task's cgroup in
+ * the appropriate hierarchy.
+ *
+ * If we are sending in dummytop, then presumably we are creating
+ * the top cgroup in the subsystem.
+ *
+ * Called only by the ns (nsproxy) cgroup.
+ */
+int cgroup_is_descendant(const struct cgroup *cgrp)
+{
+ int ret;
+ struct cgroup *target;
+ int subsys_id;
+
+ if (cgrp == dummytop)
+ return 1;
+
+ get_first_subsys(cgrp, NULL, &subsys_id);
+ target = task_cgroup(current, subsys_id);
+ while (cgrp != target && cgrp != cgrp->top_cgroup)
+ cgrp = cgrp->parent;
+ ret = (cgrp == target);
+ return ret;
+}
+
+static void check_for_release(struct cgroup *cgrp)
+{
+ /* All of these checks rely on RCU to keep the cgroup
+ * structure alive */
+ if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
+ && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
+ /* Control Group is currently removable. If it's not
+ * already queued for a userspace notification, queue
+ * it now */
+ int need_schedule_work = 0;
+ spin_lock(&release_list_lock);
+ if (!cgroup_is_removed(cgrp) &&
+ list_empty(&cgrp->release_list)) {
+ list_add(&cgrp->release_list, &release_list);
+ need_schedule_work = 1;
+ }
+ spin_unlock(&release_list_lock);
+ if (need_schedule_work)
+ schedule_work(&release_agent_work);
+ }
+}
+
+void __css_put(struct cgroup_subsys_state *css)
+{
+ struct cgroup *cgrp = css->cgroup;
+ rcu_read_lock();
+ if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
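
As described above, the configured agent is invoked with the cgroup path (relative to the hierarchy root) as its single argument. A hypothetical user-space agent might simply try the rmdir; the mount point used here is an assumption:

    /* Illustrative release-agent sketch; invoked as: <agent> <cgroup path> */
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
    	char path[4096];

    	if (argc < 2)
    		return 1;
    	/* the hierarchy mount point below is hypothetical */
    	snprintf(path, sizeof(path), "/dev/cgroup%s", argv[1]);
    	return rmdir(path) ? 1 : 0;
    }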
+static void cgroup_release_agent(struct work_struct *work)
+{
+ BUG_ON(work != &release_agent_work);
+ mutex_lock(&cgroup_mutex);
+ spin_lock(&release_list_lock);
+ while (!list_empty(&release_list)) {
+ char *argv[3], *envp[3];
+ int i;
+ char *pathbuf = NULL, *agentbuf = NULL;
+ struct cgroup *cgrp = list_entry(release_list.next,
+ struct cgroup,
+ release_list);
+ list_del_init(&cgrp->release_list);
+ spin_unlock(&release_list_lock);
+ pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!pathbuf)
+ goto continue_free;
+ if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ goto continue_free;
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!agentbuf)
+ goto continue_free;
+
+ i = 0;
+ argv[i++] = agentbuf;
+ argv[i++] = pathbuf;
+ argv[i] = NULL;
+
+ i = 0;
+ /* minimal command environment */
+ envp[i++] = "HOME=/";
+ envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[i] = NULL;
+
+ /* Drop the lock while we invoke the usermode helper,
+ * since the exec could involve hitting disk and hence
+ * be a slow process */
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ mutex_lock(&cgroup_mutex);
+ continue_free:
+ kfree(pathbuf);
+ kfree(agentbuf);
+ spin_lock(&release_list_lock);
+ }
+ spin_unlock(&release_list_lock);
+ mutex_unlock(&cgroup_mutex);
+}
+
+static int __init cgroup_disable(char *str)
+{
+ int i;
+ char *token;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
+ if (!strcmp(token, ss->name)) {
+ ss->disabled = 1;
+ printk(KERN_INFO "Disabling %s control group"
+ " subsystem\n", ss->name);
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("cgroup_disable=", cgroup_disable);
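
The parser above accepts a comma-separated list of subsystem names on the kernel command line; for example (subsystem names chosen for illustration):

    cgroup_disable=freezer,debug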
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
new file mode 100644
index 0000000..daca620
--- /dev/null
+++ b/kernel/cgroup_debug.c
@@ -0,0 +1,107 @@
+/*
+ * kernel/cgroup_debug.c - Example cgroup subsystem that
+ * exposes debug info
+ *
+ * Copyright (C) Google Inc, 2007
+ *
+ * Developed by Paul Menage (menage@google.com)
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+
+#include <asm/atomic.h>
+
+static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+{
+ return atomic_read(&cont->count);
+}
+
+static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
+{
+ u64 count;
+
+ cgroup_lock();
+ count = cgroup_task_count(cont);
+ cgroup_unlock();
+ return count;
+}
+
+static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+{
+ return (u64)(long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup *cont,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&current->cgroups->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "cgroup_refcount",
+ .read_u64 = cgroup_refcount_read,
+ },
+ {
+ .name = "taskcount",
+ .read_u64 = taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+};
+
+static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+}
+
+struct cgroup_subsys debug_subsys = {
+ .name = "debug",
+ .create = debug_create,
+ .destroy = debug_destroy,
+ .populate = debug_populate,
+ .subsys_id = debug_subsys_id,
+};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 0000000..fb249e2
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,379 @@
+/*
+ * cgroup_freezer.c - control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+
+enum freezer_state {
+ CGROUP_THAWED = 0,
+ CGROUP_FREEZING,
+ CGROUP_FROZEN,
+};
+
+struct freezer {
+ struct cgroup_subsys_state css;
+ enum freezer_state state;
+ spinlock_t lock; /* protects _writes_ to state */
+};
+
+static inline struct freezer *cgroup_freezer(
+ struct cgroup *cgroup)
+{
+ return container_of(
+ cgroup_subsys_state(cgroup, freezer_subsys_id),
+ struct freezer, css);
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+ return container_of(task_subsys_state(task, freezer_subsys_id),
+ struct freezer, css);
+}
+
+int cgroup_frozen(struct task_struct *task)
+{
+ struct freezer *freezer;
+ enum freezer_state state;
+
+ task_lock(task);
+ freezer = task_freezer(task);
+ state = freezer->state;
+ task_unlock(task);
+
+ return state == CGROUP_FROZEN;
+}
+
+/*
+ * cgroups_write_string() limits the size of freezer state strings to
+ * CGROUP_LOCAL_BUFFER_SIZE
+ */
+static const char *freezer_state_strs[] = {
+ "THAWED",
+ "FREEZING",
+ "FROZEN",
+};
+
+/*
+ * State diagram
+ * Transitions are caused by userspace writes to the freezer.state file.
+ * The values in parentheses are state labels. The rest are edge labels.
+ *
+ * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
+ * ^ ^ | |
+ * | \_______THAWED_______/ |
+ * \__________________________THAWED____________/
+ */
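
In practice these transitions are driven by writing the state strings to the per-cgroup freezer.state file; a minimal user-space sketch, assuming a hypothetical mount point /dev/freezer and an existing child group mygroup:

    /* Illustrative user-space sketch; the mount path is an assumption. */
    #include <stdio.h>

    static int set_freezer_state(const char *state)	/* "FROZEN" or "THAWED" */
    {
    	FILE *f = fopen("/dev/freezer/mygroup/freezer.state", "w");

    	if (!f)
    		return -1;
    	fputs(state, f);
    	return fclose(f);
    }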
+
+struct cgroup_subsys freezer_subsys;
+
+/* Locks taken and their ordering
+ * ------------------------------
+ * css_set_lock
+ * cgroup_mutex (AKA cgroup_lock)
+ * task->alloc_lock (AKA task_lock)
+ * freezer->lock
+ * task->sighand->siglock
+ *
+ * cgroup code forces css_set_lock to be taken before task->alloc_lock
+ *
+ * freezer_create(), freezer_destroy():
+ * cgroup_mutex [ by cgroup core ]
+ *
+ * can_attach():
+ * cgroup_mutex
+ *
+ * cgroup_frozen():
+ * task->alloc_lock (to get task's cgroup)
+ *
+ * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
+ * task->alloc_lock (to get task's cgroup)
+ * freezer->lock
+ * sighand->siglock (if the cgroup is freezing)
+ *
+ * freezer_read():
+ * cgroup_mutex
+ * freezer->lock
+ * read_lock css_set_lock (cgroup iterator start)
+ *
+ * freezer_write() (freeze):
+ * cgroup_mutex
+ * freezer->lock
+ * read_lock css_set_lock (cgroup iterator start)
+ * sighand->siglock
+ *
+ * freezer_write() (unfreeze):
+ * cgroup_mutex
+ * freezer->lock
+ * read_lock css_set_lock (cgroup iterator start)
+ * task->alloc_lock (to prevent races with freeze_task())
+ * sighand->siglock
+ */
+static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ struct freezer *freezer;
+
+ freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+ if (!freezer)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&freezer->lock);
+ freezer->state = CGROUP_THAWED;
+ return &freezer->css;
+}
+
+static void freezer_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ kfree(cgroup_freezer(cgroup));
+}
+
+/* Task is frozen or will freeze immediately when next it gets woken */
+static bool is_task_frozen_enough(struct task_struct *task)
+{
+ return frozen(task) ||
+ (task_is_stopped_or_traced(task) && freezing(task));
+}
+
+/*
+ * The call to cgroup_lock() in the freezer.state write method prevents
+ * a write to that file racing against an attach, and hence the
+ * can_attach() result will remain valid until the attach completes.
+ */
+static int freezer_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *new_cgroup,
+ struct task_struct *task)
+{
+ struct freezer *freezer;
+
+ /*
+ * Anything frozen can't move or be moved to/from.
+ *
+ * Because orig_freezer->state == FROZEN implies that @task is already
+ * frozen, checking the task itself below covers the source cgroup;
+ * only the destination cgroup's state needs an explicit check.
+ */
+
+ if (is_task_frozen_enough(task))
+ return -EBUSY;
+
+ freezer = cgroup_freezer(new_cgroup);
+ if (freezer->state == CGROUP_FROZEN)
+ return -EBUSY;
+
+ return 0;
+}
+
+static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+ struct freezer *freezer;
+
+ /*
+ * No lock is needed, since the task isn't on tasklist yet,
+ * so it can't be moved to another cgroup, which means the
+ * freezer won't be removed and will be valid during this
+ * function call.
+ */
+ freezer = task_freezer(task);
+
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+ return;
+
+ spin_lock_irq(&freezer->lock);
+ BUG_ON(freezer->state == CGROUP_FROZEN);
+
+ /* Locking avoids race with FREEZING -> THAWED transitions. */
+ if (freezer->state == CGROUP_FREEZING)
+ freeze_task(task, true);
+ spin_unlock_irq(&freezer->lock);
+}
+
+/*
+ * caller must hold freezer->lock
+ */
+static void update_freezer_state(struct cgroup *cgroup,
+ struct freezer *freezer)
+{
+ struct cgroup_iter it;
+ struct task_struct *task;
+ unsigned int nfrozen = 0, ntotal = 0;
+
+ cgroup_iter_start(cgroup, &it);
+ while ((task = cgroup_iter_next(cgroup, &it))) {
+ ntotal++;
+ if (is_task_frozen_enough(task))
+ nfrozen++;
+ }
+
+ /*
+ * Transition to FROZEN when no new tasks can be added ensures
+ * that we never exist in the FROZEN state while there are unfrozen
+ * tasks.
+ */
+ if (nfrozen == ntotal)
+ freezer->state = CGROUP_FROZEN;
+ else if (nfrozen > 0)
+ freezer->state = CGROUP_FREEZING;
+ else
+ freezer->state = CGROUP_THAWED;
+ cgroup_iter_end(cgroup, &it);
+}
+
+static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct freezer *freezer;
+ enum freezer_state state;
+
+ if (!cgroup_lock_live_group(cgroup))
+ return -ENODEV;
+
+ freezer = cgroup_freezer(cgroup);
+ spin_lock_irq(&freezer->lock);
+ state = freezer->state;
+ if (state == CGROUP_FREEZING) {
+ /* We change from FREEZING to FROZEN lazily if the cgroup was
+ * only partially frozen when we exited the write. */
+ update_freezer_state(cgroup, freezer);
+ state = freezer->state;
+ }
+ spin_unlock_irq(&freezer->lock);
+ cgroup_unlock();
+
+ seq_puts(m, freezer_state_strs[state]);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+{
+ struct cgroup_iter it;
+ struct task_struct *task;
+ unsigned int num_cant_freeze_now = 0;
+
+ freezer->state = CGROUP_FREEZING;
+ cgroup_iter_start(cgroup, &it);
+ while ((task = cgroup_iter_next(cgroup, &it))) {
+ if (!freeze_task(task, true))
+ continue;
+ if (is_task_frozen_enough(task))
+ continue;
+ if (!freezing(task) && !freezer_should_skip(task))
+ num_cant_freeze_now++;
+ }
+ cgroup_iter_end(cgroup, &it);
+
+ return num_cant_freeze_now ? -EBUSY : 0;
+}
+
+static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+{
+ struct cgroup_iter it;
+ struct task_struct *task;
+
+ cgroup_iter_start(cgroup, &it);
+ while ((task = cgroup_iter_next(cgroup, &it))) {
+ thaw_process(task);
+ }
+ cgroup_iter_end(cgroup, &it);
+
+ freezer->state = CGROUP_THAWED;
+}
+
+static int freezer_change_state(struct cgroup *cgroup,
+ enum freezer_state goal_state)
+{
+ struct freezer *freezer;
+ int retval = 0;
+
+ freezer = cgroup_freezer(cgroup);
+
+ spin_lock_irq(&freezer->lock);
+
+ update_freezer_state(cgroup, freezer);
+ if (goal_state == freezer->state)
+ goto out;
+
+ switch (goal_state) {
+ case CGROUP_THAWED:
+ unfreeze_cgroup(cgroup, freezer);
+ break;
+ case CGROUP_FROZEN:
+ retval = try_to_freeze_cgroup(cgroup, freezer);
+ break;
+ default:
+ BUG();
+ }
+out:
+ spin_unlock_irq(&freezer->lock);
+
+ return retval;
+}
+
+static int freezer_write(struct cgroup *cgroup,
+ struct cftype *cft,
+ const char *buffer)
+{
+ int retval;
+ enum freezer_state goal_state;
+
+ if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
+ goal_state = CGROUP_THAWED;
+ else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
+ goal_state = CGROUP_FROZEN;
+ else
+ return -EINVAL;
+
+ if (!cgroup_lock_live_group(cgroup))
+ return -ENODEV;
+ retval = freezer_change_state(cgroup, goal_state);
+ cgroup_unlock();
+ return retval;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "state",
+ .read_seq_string = freezer_read,
+ .write_string = freezer_write,
+ },
+};
+
+static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+ if (!cgroup->parent)
+ return 0;
+ return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
+}
+
+struct cgroup_subsys freezer_subsys = {
+ .name = "freezer",
+ .create = freezer_create,
+ .destroy = freezer_destroy,
+ .populate = freezer_populate,
+ .subsys_id = freezer_subsys_id,
+ .can_attach = freezer_can_attach,
+ .attach = NULL,
+ .fork = freezer_fork,
+ .exit = NULL,
+};
diff --git a/kernel/compat.c b/kernel/compat.c
new file mode 100644
index 0000000..8eafe3e
--- /dev/null
+++ b/kernel/compat.c
@@ -0,0 +1,1116 @@
+/*
+ * linux/kernel/compat.c
+ *
+ * Kernel compatibility routines for e.g. 32 bit syscall support
+ * on 64 bit kernels.
+ *
+ * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/signal.h>
+#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/security.h>
+#include <linux/timex.h>
+#include <linux/migrate.h>
+#include <linux/posix-timers.h>
+#include <linux/times.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Note that the native side is already converted to a timespec, because
+ * that's what we want anyway.
+ */
+static int compat_get_timeval(struct timespec *o,
+ struct compat_timeval __user *i)
+{
+ long usec;
+
+ if (get_user(o->tv_sec, &i->tv_sec) ||
+ get_user(usec, &i->tv_usec))
+ return -EFAULT;
+ o->tv_nsec = usec * 1000;
+ return 0;
+}
+
+static int compat_put_timeval(struct compat_timeval __user *o,
+ struct timeval *i)
+{
+ return (put_user(i->tv_sec, &o->tv_sec) ||
+ put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
+}
+
+asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
+ struct timezone __user *tz)
+{
+ if (tv) {
+ struct timeval ktv;
+ do_gettimeofday(&ktv);
+ if (compat_put_timeval(tv, &ktv))
+ return -EFAULT;
+ }
+ if (tz) {
+ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
+ struct timezone __user *tz)
+{
+ struct timespec kts;
+ struct timezone ktz;
+
+ if (tv) {
+ if (compat_get_timeval(&kts, tv))
+ return -EFAULT;
+ }
+ if (tz) {
+ if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ return -EFAULT;
+ }
+
+ return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+}
+
+int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+{
+ return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
+ __get_user(ts->tv_sec, &cts->tv_sec) ||
+ __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+{
+ return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
+ __put_user(ts->tv_sec, &cts->tv_sec) ||
+ __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+ struct compat_timespec __user *rmtp;
+ struct timespec rmt;
+ mm_segment_t oldfs;
+ long ret;
+
+ restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = hrtimer_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if (ret) {
+ rmtp = restart->nanosleep.compat_rmtp;
+
+ if (rmtp && put_compat_timespec(&rmt, rmtp))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
+ struct compat_timespec __user *rmtp)
+{
+ struct timespec tu, rmt;
+ mm_segment_t oldfs;
+ long ret;
+
+ if (get_compat_timespec(&tu, rqtp))
+ return -EFAULT;
+
+ if (!timespec_valid(&tu))
+ return -EINVAL;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = hrtimer_nanosleep(&tu,
+ rmtp ? (struct timespec __user *)&rmt : NULL,
+ HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ set_fs(oldfs);
+
+ if (ret) {
+ struct restart_block *restart
+ = &current_thread_info()->restart_block;
+
+ restart->fn = compat_nanosleep_restart;
+ restart->nanosleep.compat_rmtp = rmtp;
+
+ if (rmtp && put_compat_timespec(&rmt, rmtp))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static inline long get_compat_itimerval(struct itimerval *o,
+ struct compat_itimerval __user *i)
+{
+ return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
+ (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
+ __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
+ __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
+ __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
+}
+
+static inline long put_compat_itimerval(struct compat_itimerval __user *o,
+ struct itimerval *i)
+{
+ return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
+ (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
+ __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
+ __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
+ __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
+}
+
+asmlinkage long compat_sys_getitimer(int which,
+ struct compat_itimerval __user *it)
+{
+ struct itimerval kit;
+ int error;
+
+ error = do_getitimer(which, &kit);
+ if (!error && put_compat_itimerval(it, &kit))
+ error = -EFAULT;
+ return error;
+}
+
+asmlinkage long compat_sys_setitimer(int which,
+ struct compat_itimerval __user *in,
+ struct compat_itimerval __user *out)
+{
+ struct itimerval kin, kout;
+ int error;
+
+ if (in) {
+ if (get_compat_itimerval(&kin, in))
+ return -EFAULT;
+ } else
+ memset(&kin, 0, sizeof(kin));
+
+ error = do_setitimer(which, &kin, out ? &kout : NULL);
+ if (error || !out)
+ return error;
+ if (put_compat_itimerval(out, &kout))
+ return -EFAULT;
+ return 0;
+}
+
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+ return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
+asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+{
+ if (tbuf) {
+ struct tms tms;
+ struct compat_tms tmp;
+
+ do_sys_times(&tms);
+ /* Convert our struct tms to the compat version. */
+ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+ tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+ tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+ tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
+ if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+ return -EFAULT;
+ }
+ return compat_jiffies_to_clock_t(jiffies);
+}
+
+/*
+ * Assumption: old_sigset_t and compat_old_sigset_t are both
+ * types that can be passed to put_user()/get_user().
+ */
+
+asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+{
+ old_sigset_t s;
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_sigpending((old_sigset_t __user *) &s);
+ set_fs(old_fs);
+ if (ret == 0)
+ ret = put_user(s, set);
+ return ret;
+}
+
+asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
+ compat_old_sigset_t __user *oset)
+{
+ old_sigset_t s;
+ long ret;
+ mm_segment_t old_fs;
+
+ if (set && get_user(s, set))
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_sigprocmask(how,
+ set ? (old_sigset_t __user *) &s : NULL,
+ oset ? (old_sigset_t __user *) &s : NULL);
+ set_fs(old_fs);
+ if (ret == 0)
+ if (oset)
+ ret = put_user(s, oset);
+ return ret;
+}
+
+asmlinkage long compat_sys_setrlimit(unsigned int resource,
+ struct compat_rlimit __user *rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs ();
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
+ __get_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __get_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+
+ if (r.rlim_cur == COMPAT_RLIM_INFINITY)
+ r.rlim_cur = RLIM_INFINITY;
+ if (r.rlim_max == COMPAT_RLIM_INFINITY)
+ r.rlim_max = RLIM_INFINITY;
+ set_fs(KERNEL_DS);
+ ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
+ set_fs(old_fs);
+ return ret;
+}
+
+#ifdef COMPAT_RLIM_OLD_INFINITY
+
+asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
+ struct compat_rlimit __user *rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_old_getrlimit(resource, &r);
+ set_fs(old_fs);
+
+ if (!ret) {
+ if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
+ r.rlim_cur = COMPAT_RLIM_INFINITY;
+ if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
+ r.rlim_max = COMPAT_RLIM_INFINITY;
+
+ if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+ __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+#endif
+
+asmlinkage long compat_sys_getrlimit (unsigned int resource,
+ struct compat_rlimit __user *rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
+ set_fs(old_fs);
+ if (!ret) {
+ if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+ r.rlim_cur = COMPAT_RLIM_INFINITY;
+ if (r.rlim_max > COMPAT_RLIM_INFINITY)
+ r.rlim_max = COMPAT_RLIM_INFINITY;
+
+ if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+ __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
+{
+ if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
+ __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
+ __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
+ __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
+ __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
+ __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
+ __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
+ __put_user(r->ru_idrss, &ru->ru_idrss) ||
+ __put_user(r->ru_isrss, &ru->ru_isrss) ||
+ __put_user(r->ru_minflt, &ru->ru_minflt) ||
+ __put_user(r->ru_majflt, &ru->ru_majflt) ||
+ __put_user(r->ru_nswap, &ru->ru_nswap) ||
+ __put_user(r->ru_inblock, &ru->ru_inblock) ||
+ __put_user(r->ru_oublock, &ru->ru_oublock) ||
+ __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
+ __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
+ __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
+ __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
+ __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
+{
+ struct rusage r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_getrusage(who, (struct rusage __user *) &r);
+ set_fs(old_fs);
+
+ if (ret)
+ return ret;
+
+ if (put_compat_rusage(&r, ru))
+ return -EFAULT;
+
+ return 0;
+}
+
+asmlinkage long
+compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
+ struct compat_rusage __user *ru)
+{
+ if (!ru) {
+ return sys_wait4(pid, stat_addr, options, NULL);
+ } else {
+ struct rusage r;
+ int ret;
+ unsigned int status;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs (KERNEL_DS);
+ ret = sys_wait4(pid,
+ (stat_addr ?
+ (unsigned int __user *) &status : NULL),
+ options, (struct rusage __user *) &r);
+ set_fs (old_fs);
+
+ if (ret > 0) {
+ if (put_compat_rusage(&r, ru))
+ return -EFAULT;
+ if (stat_addr && put_user(status, stat_addr))
+ return -EFAULT;
+ }
+ return ret;
+ }
+}
+
+asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
+ struct compat_siginfo __user *uinfo, int options,
+ struct compat_rusage __user *uru)
+{
+ siginfo_t info;
+ struct rusage ru;
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ memset(&info, 0, sizeof(info));
+
+ set_fs(KERNEL_DS);
+ ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
+ uru ? (struct rusage __user *)&ru : NULL);
+ set_fs(old_fs);
+
+ if ((ret < 0) || (info.si_signo == 0))
+ return ret;
+
+ if (uru) {
+ ret = put_compat_rusage(&ru, uru);
+ if (ret)
+ return ret;
+ }
+
+ BUG_ON(info.si_code & __SI_MASK);
+ info.si_code |= __SI_CHLD;
+ return copy_siginfo_to_user32(uinfo, &info);
+}
+
+static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
+ unsigned len, cpumask_t *new_mask)
+{
+ unsigned long *k;
+
+ if (len < sizeof(cpumask_t))
+ memset(new_mask, 0, sizeof(cpumask_t));
+ else if (len > sizeof(cpumask_t))
+ len = sizeof(cpumask_t);
+
+ k = cpus_addr(*new_mask);
+ return compat_get_bitmap(k, user_mask_ptr, len * 8);
+}
+
+asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
+ unsigned int len,
+ compat_ulong_t __user *user_mask_ptr)
+{
+ cpumask_t new_mask;
+ int retval;
+
+ retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+ if (retval)
+ return retval;
+
+ return sched_setaffinity(pid, &new_mask);
+}
+
+asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
+ compat_ulong_t __user *user_mask_ptr)
+{
+ int ret;
+ cpumask_t mask;
+ unsigned long *k;
+ unsigned int min_length = sizeof(cpumask_t);
+
+ if (NR_CPUS <= BITS_PER_COMPAT_LONG)
+ min_length = sizeof(compat_ulong_t);
+
+ if (len < min_length)
+ return -EINVAL;
+
+ ret = sched_getaffinity(pid, &mask);
+ if (ret < 0)
+ return ret;
+
+ k = cpus_addr(mask);
+ ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
+ if (ret)
+ return ret;
+
+ return min_length;
+}
+
+int get_compat_itimerspec(struct itimerspec *dst,
+ const struct compat_itimerspec __user *src)
+{
+ if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
+ get_compat_timespec(&dst->it_value, &src->it_value))
+ return -EFAULT;
+ return 0;
+}
+
+int put_compat_itimerspec(struct compat_itimerspec __user *dst,
+ const struct itimerspec *src)
+{
+ if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
+ put_compat_timespec(&src->it_value, &dst->it_value))
+ return -EFAULT;
+ return 0;
+}
+
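+/*
+ * The sigevent cannot simply be converted on the kernel stack here, because
+ * sys_timer_create() reads it with user-space accessors. Instead the compat
+ * structure is converted to a native struct sigevent and staged in a scratch
+ * user-space area obtained from compat_alloc_user_space() before the native
+ * syscall is invoked.
+ */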
+long compat_sys_timer_create(clockid_t which_clock,
+ struct compat_sigevent __user *timer_event_spec,
+ timer_t __user *created_timer_id)
+{
+ struct sigevent __user *event = NULL;
+
+ if (timer_event_spec) {
+ struct sigevent kevent;
+
+ event = compat_alloc_user_space(sizeof(*event));
+ if (get_compat_sigevent(&kevent, timer_event_spec) ||
+ copy_to_user(event, &kevent, sizeof(*event)))
+ return -EFAULT;
+ }
+
+ return sys_timer_create(which_clock, event, created_timer_id);
+}
+
+long compat_sys_timer_settime(timer_t timer_id, int flags,
+ struct compat_itimerspec __user *new,
+ struct compat_itimerspec __user *old)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct itimerspec newts, oldts;
+
+ if (!new)
+ return -EINVAL;
+ if (get_compat_itimerspec(&newts, new))
+ return -EFAULT;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_timer_settime(timer_id, flags,
+ (struct itimerspec __user *) &newts,
+ (struct itimerspec __user *) &oldts);
+ set_fs(oldfs);
+ if (!err && old && put_compat_itimerspec(old, &oldts))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_timer_gettime(timer_t timer_id,
+ struct compat_itimerspec __user *setting)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct itimerspec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_timer_gettime(timer_id,
+ (struct itimerspec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && put_compat_itimerspec(setting, &ts))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_clock_settime(clockid_t which_clock,
+ struct compat_timespec __user *tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ if (get_compat_timespec(&ts, tp))
+ return -EFAULT;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_settime(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ return err;
+}
+
+long compat_sys_clock_gettime(clockid_t which_clock,
+ struct compat_timespec __user *tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_gettime(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && put_compat_timespec(&ts, tp))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_clock_getres(clockid_t which_clock,
+ struct compat_timespec __user *tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_getres(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && tp && put_compat_timespec(&ts, tp))
+ return -EFAULT;
+ return err;
+}
+
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec tu;
+ struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+
+ restart->nanosleep.rmtp = (struct timespec __user *) &tu;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = clock_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ put_compat_timespec(&tu, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->nanosleep.compat_rmtp = rmtp;
+ }
+ return err;
+}
+
+long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
+ struct compat_timespec __user *rqtp,
+ struct compat_timespec __user *rmtp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec in, out;
+ struct restart_block *restart;
+
+ if (get_compat_timespec(&in, rqtp))
+ return -EFAULT;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_nanosleep(which_clock, flags,
+ (struct timespec __user *) &in,
+ (struct timespec __user *) &out);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ put_compat_timespec(&out, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart = &current_thread_info()->restart_block;
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->nanosleep.compat_rmtp = rmtp;
+ }
+ return err;
+}
+
+/*
+ * We currently only need the following fields from the sigevent
+ * structure: sigev_value, sigev_signo, sigev_notify and (sometimes
+ * sigev_notify_thread_id). The others are handled in user mode.
+ * We also assume that copying sigev_value.sival_int is sufficient
+ * to keep all the bits of sigev_value.sival_ptr intact.
+ */
+int get_compat_sigevent(struct sigevent *event,
+ const struct compat_sigevent __user *u_event)
+{
+ memset(event, 0, sizeof(*event));
+ return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+ __get_user(event->sigev_value.sival_int,
+ &u_event->sigev_value.sival_int) ||
+ __get_user(event->sigev_signo, &u_event->sigev_signo) ||
+ __get_user(event->sigev_notify, &u_event->sigev_notify) ||
+ __get_user(event->sigev_notify_thread_id,
+ &u_event->sigev_notify_thread_id))
+ ? -EFAULT : 0;
+}
+
+long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
+ unsigned long bitmap_size)
+{
+ int i, j;
+ unsigned long m;
+ compat_ulong_t um;
+ unsigned long nr_compat_longs;
+
+ /* align bitmap up to nearest compat_long_t boundary */
+ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+ if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+ return -EFAULT;
+
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+ for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+ m = 0;
+
+ for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+ /*
+			 * We don't want to read past the end of the userspace
+			 * bitmap. We must, however, ensure the end of the
+ * kernel bitmap is zeroed.
+ */
+ if (nr_compat_longs-- > 0) {
+ if (__get_user(um, umask))
+ return -EFAULT;
+ } else {
+ um = 0;
+ }
+
+ umask++;
+ m |= (long)um << (j * BITS_PER_COMPAT_LONG);
+ }
+ *mask++ = m;
+ }
+
+ return 0;
+}
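+
+/*
+ * Illustration (not part of the conversion itself): on a 64-bit kernel with
+ * a 32-bit compat_ulong_t, each kernel word is assembled from two consecutive
+ * compat words, e.g.
+ *
+ *	mask[0] = ((unsigned long)umask[1] << 32) | umask[0];
+ *
+ * compat_put_bitmap() below performs the inverse split.
+ */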
+
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+ unsigned long bitmap_size)
+{
+ int i, j;
+ unsigned long m;
+ compat_ulong_t um;
+ unsigned long nr_compat_longs;
+
+ /* align bitmap up to nearest compat_long_t boundary */
+ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+ if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+ return -EFAULT;
+
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+ for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+ m = *mask++;
+
+ for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+ um = m;
+
+ /*
+			 * We don't want to write past the end of the userspace
+ * bitmap.
+ */
+ if (nr_compat_longs-- > 0) {
+ if (__put_user(um, umask))
+ return -EFAULT;
+ }
+
+ umask++;
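+			/*
+			 * The shift below is done in two halves, presumably so
+			 * the shift count can never reach the full width of 'm'
+			 * (which would be undefined behaviour) if unsigned long
+			 * and compat_ulong_t ever have the same size.
+			 */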
+ m >>= 4*sizeof(um);
+ m >>= 4*sizeof(um);
+ }
+ }
+
+ return 0;
+}
+
+void
+sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+{
+ switch (_NSIG_WORDS) {
+ case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
+ case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
+ case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
+ case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+ }
+}
+
+asmlinkage long
+compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
+ struct compat_siginfo __user *uinfo,
+ struct compat_timespec __user *uts, compat_size_t sigsetsize)
+{
+ compat_sigset_t s32;
+ sigset_t s;
+ int sig;
+ struct timespec t;
+ siginfo_t info;
+ long ret, timeout = 0;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&s, &s32);
+ sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
+ signotset(&s);
+
+ if (uts) {
+ if (get_compat_timespec (&t, uts))
+ return -EFAULT;
+ if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
+ || t.tv_sec < 0)
+ return -EINVAL;
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &s, &info);
+ if (!sig) {
+ timeout = MAX_SCHEDULE_TIMEOUT;
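+		/*
+		 * If the caller supplied a timeout, convert it to jiffies;
+		 * the extra (tv_sec || tv_nsec) term appears to round any
+		 * non-zero timeout up by one jiffy so the wait is never
+		 * shorter than requested.
+		 */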
+ if (uts)
+ timeout = timespec_to_jiffies(&t)
+ +(t.tv_sec || t.tv_nsec);
+ if (timeout) {
+ current->real_blocked = current->blocked;
+ sigandsets(&current->blocked, &current->blocked, &s);
+
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ timeout = schedule_timeout_interruptible(timeout);
+
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &s, &info);
+ current->blocked = current->real_blocked;
+ siginitset(&current->real_blocked, 0);
+ recalc_sigpending();
+ }
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (sig) {
+ ret = sig;
+ if (uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+	} else {
+		ret = timeout ? -EINTR : -EAGAIN;
+	}
+	return ret;
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+
+/* compat_time_t is a 32 bit "long" and needs to get converted. */
+
+asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+{
+ compat_time_t i;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ i = tv.tv_sec;
+
+ if (tloc) {
+ if (put_user(i,tloc))
+ i = -EFAULT;
+ }
+ return i;
+}
+
+asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+{
+ struct timespec tv;
+ int err;
+
+ if (get_user(tv.tv_sec, tptr))
+ return -EFAULT;
+
+ tv.tv_nsec = 0;
+
+ err = security_settime(&tv, NULL);
+ if (err)
+ return err;
+
+ do_settimeofday(&tv);
+ return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
+{
+ sigset_t newset;
+ compat_sigset_t newset32;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&newset, &newset32);
+ sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ spin_lock_irq(&current->sighand->siglock);
+ current->saved_sigmask = current->blocked;
+ current->blocked = newset;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_restore_sigmask();
+ return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
+
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+{
+ struct timex txc;
+ int ret;
+
+ memset(&txc, 0, sizeof(struct timex));
+
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc.modes, &utp->modes) ||
+ __get_user(txc.offset, &utp->offset) ||
+ __get_user(txc.freq, &utp->freq) ||
+ __get_user(txc.maxerror, &utp->maxerror) ||
+ __get_user(txc.esterror, &utp->esterror) ||
+ __get_user(txc.status, &utp->status) ||
+ __get_user(txc.constant, &utp->constant) ||
+ __get_user(txc.precision, &utp->precision) ||
+ __get_user(txc.tolerance, &utp->tolerance) ||
+ __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc.tick, &utp->tick) ||
+ __get_user(txc.ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc.jitter, &utp->jitter) ||
+ __get_user(txc.shift, &utp->shift) ||
+ __get_user(txc.stabil, &utp->stabil) ||
+ __get_user(txc.jitcnt, &utp->jitcnt) ||
+ __get_user(txc.calcnt, &utp->calcnt) ||
+ __get_user(txc.errcnt, &utp->errcnt) ||
+ __get_user(txc.stbcnt, &utp->stbcnt))
+ return -EFAULT;
+
+ ret = do_adjtimex(&txc);
+
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc.modes, &utp->modes) ||
+ __put_user(txc.offset, &utp->offset) ||
+ __put_user(txc.freq, &utp->freq) ||
+ __put_user(txc.maxerror, &utp->maxerror) ||
+ __put_user(txc.esterror, &utp->esterror) ||
+ __put_user(txc.status, &utp->status) ||
+ __put_user(txc.constant, &utp->constant) ||
+ __put_user(txc.precision, &utp->precision) ||
+ __put_user(txc.tolerance, &utp->tolerance) ||
+ __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc.tick, &utp->tick) ||
+ __put_user(txc.ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc.jitter, &utp->jitter) ||
+ __put_user(txc.shift, &utp->shift) ||
+ __put_user(txc.stabil, &utp->stabil) ||
+ __put_user(txc.jitcnt, &utp->jitcnt) ||
+ __put_user(txc.calcnt, &utp->calcnt) ||
+ __put_user(txc.errcnt, &utp->errcnt) ||
+ __put_user(txc.stbcnt, &utp->stbcnt) ||
+ __put_user(txc.tai, &utp->tai))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+#ifdef CONFIG_NUMA
+asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
+ compat_uptr_t __user *pages32,
+ const int __user *nodes,
+ int __user *status,
+ int flags)
+{
+ const void __user * __user *pages;
+ int i;
+
+ pages = compat_alloc_user_space(nr_pages * sizeof(void *));
+ for (i = 0; i < nr_pages; i++) {
+ compat_uptr_t p;
+
+ if (get_user(p, pages32 + i) ||
+ put_user(compat_ptr(p), pages + i))
+ return -EFAULT;
+ }
+ return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+
+asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+ compat_ulong_t maxnode,
+ const compat_ulong_t __user *old_nodes,
+ const compat_ulong_t __user *new_nodes)
+{
+ unsigned long __user *old = NULL;
+ unsigned long __user *new = NULL;
+ nodemask_t tmp_mask;
+ unsigned long nr_bits;
+ unsigned long size;
+
+ nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+ size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+ if (old_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+ return -EFAULT;
+ old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+ if (new_nodes)
+ new = old + size / sizeof(unsigned long);
+ if (copy_to_user(old, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ if (new_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+ return -EFAULT;
+ if (new == NULL)
+ new = compat_alloc_user_space(size);
+ if (copy_to_user(new, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ return sys_migrate_pages(pid, nr_bits + 1, old, new);
+}
+#endif
+
+struct compat_sysinfo {
+ s32 uptime;
+ u32 loads[3];
+ u32 totalram;
+ u32 freeram;
+ u32 sharedram;
+ u32 bufferram;
+ u32 totalswap;
+ u32 freeswap;
+ u16 procs;
+ u16 pad;
+ u32 totalhigh;
+ u32 freehigh;
+ u32 mem_unit;
+ char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+asmlinkage long
+compat_sys_sysinfo(struct compat_sysinfo __user *info)
+{
+ struct sysinfo s;
+
+ do_sysinfo(&s);
+
+ /* Check to see if any memory value is too large for 32-bit and scale
+ * down if needed
+ */
+ if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ int bitcount = 0;
+
+ while (s.mem_unit < PAGE_SIZE) {
+ s.mem_unit <<= 1;
+ bitcount++;
+ }
+
+ s.totalram >>= bitcount;
+ s.freeram >>= bitcount;
+ s.sharedram >>= bitcount;
+ s.bufferram >>= bitcount;
+ s.totalswap >>= bitcount;
+ s.freeswap >>= bitcount;
+ s.totalhigh >>= bitcount;
+ s.freehigh >>= bitcount;
+ }
+
+ if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+ __put_user (s.uptime, &info->uptime) ||
+ __put_user (s.loads[0], &info->loads[0]) ||
+ __put_user (s.loads[1], &info->loads[1]) ||
+ __put_user (s.loads[2], &info->loads[2]) ||
+ __put_user (s.totalram, &info->totalram) ||
+ __put_user (s.freeram, &info->freeram) ||
+ __put_user (s.sharedram, &info->sharedram) ||
+ __put_user (s.bufferram, &info->bufferram) ||
+ __put_user (s.totalswap, &info->totalswap) ||
+ __put_user (s.freeswap, &info->freeswap) ||
+ __put_user (s.procs, &info->procs) ||
+ __put_user (s.totalhigh, &info->totalhigh) ||
+ __put_user (s.freehigh, &info->freehigh) ||
+ __put_user (s.mem_unit, &info->mem_unit))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/kernel/configs.c b/kernel/configs.c
new file mode 100644
index 0000000..abaee68
--- /dev/null
+++ b/kernel/configs.c
@@ -0,0 +1,98 @@
+/*
+ * kernel/configs.c
+ * Echo the kernel .config file used to build the kernel
+ *
+ * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
+ * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
+ * Copyright (C) 2002 Hewlett-Packard Company
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/**************************************************/
+/* the actual current config file */
+
+/*
+ * Define kernel_config_data and kernel_config_data_size, which contain the
+ * wrapped and compressed configuration file. The file is first compressed
+ * with gzip and then bounded by two eight-byte magic numbers to allow
+ * extraction from a binary kernel image:
+ *
+ * IKCFG_ST
+ * <image>
+ * IKCFG_ED
+ */
+#define MAGIC_START "IKCFG_ST"
+#define MAGIC_END "IKCFG_ED"
+#include "config_data.h"
+
+
+#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
+#define kernel_config_data_size \
+ (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+
+#ifdef CONFIG_IKCONFIG_PROC
+
+static ssize_t
+ikconfig_read_current(struct file *file, char __user *buf,
+ size_t len, loff_t * offset)
+{
+ return simple_read_from_buffer(buf, len, offset,
+ kernel_config_data + MAGIC_SIZE,
+ kernel_config_data_size);
+}
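+
+/*
+ * With CONFIG_IKCONFIG_PROC enabled, userspace can read the configuration
+ * simply via "zcat /proc/config.gz"; the IKCFG_ST/IKCFG_ED markers above
+ * also let tools such as scripts/extract-ikconfig recover the same data
+ * from a kernel image when the proc file is unavailable.
+ */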
+
+static const struct file_operations ikconfig_file_ops = {
+ .owner = THIS_MODULE,
+ .read = ikconfig_read_current,
+};
+
+static int __init ikconfig_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ /* create the current config file */
+ entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
+ &ikconfig_file_ops);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->size = kernel_config_data_size;
+
+ return 0;
+}
+
+static void __exit ikconfig_cleanup(void)
+{
+ remove_proc_entry("config.gz", NULL);
+}
+
+module_init(ikconfig_init);
+module_exit(ikconfig_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Randy Dunlap");
+MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
+
+#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpu.c b/kernel/cpu.c
new file mode 100644
index 0000000..8ea32e8
--- /dev/null
+++ b/kernel/cpu.c
@@ -0,0 +1,504 @@
+/* CPU control.
+ * (C) 2001, 2002, 2003, 2004 Rusty Russell
+ *
+ * This code is licenced under the GPL.
+ */
+#include <linux/proc_fs.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/unistd.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/stop_machine.h>
+#include <linux/mutex.h>
+
+/*
+ * Represents all CPUs present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new CPUs are detected in the system via any platform-specific
+ * method, e.g. ACPI.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+
+/*
+ * Represents all CPUs that are currently online.
+ */
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+
+#else /* CONFIG_SMP */
+
+/* Serializes the updates to cpu_online_map, cpu_present_map */
+static DEFINE_MUTEX(cpu_add_remove_lock);
+
+static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
+static struct {
+ struct task_struct *active_writer;
+	/*
+	 * Synchronizes accesses to refcount and also blocks the new
+	 * readers during an ongoing cpu hotplug operation.
+	 */
+	struct mutex lock;
+ int refcount;
+} cpu_hotplug;
+
+void __init cpu_hotplug_init(void)
+{
+ cpu_hotplug.active_writer = NULL;
+ mutex_init(&cpu_hotplug.lock);
+ cpu_hotplug.refcount = 0;
+}
+
+cpumask_t cpu_active_map;
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void get_online_cpus(void)
+{
+ might_sleep();
+ if (cpu_hotplug.active_writer == current)
+ return;
+ mutex_lock(&cpu_hotplug.lock);
+ cpu_hotplug.refcount++;
+ mutex_unlock(&cpu_hotplug.lock);
+
+}
+EXPORT_SYMBOL_GPL(get_online_cpus);
+
+void put_online_cpus(void)
+{
+ if (cpu_hotplug.active_writer == current)
+ return;
+ mutex_lock(&cpu_hotplug.lock);
+ if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
+ wake_up_process(cpu_hotplug.active_writer);
+ mutex_unlock(&cpu_hotplug.lock);
+
+}
+EXPORT_SYMBOL_GPL(put_online_cpus);
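+
+/*
+ * Typical reader-side usage (illustrative sketch only; do_something() is a
+ * stand-in for the caller's work): code that walks the online CPUs and must
+ * not race with hotplug brackets the walk with the refcounted read side
+ * above, e.g.
+ *
+ *	int cpu;
+ *
+ *	get_online_cpus();
+ *	for_each_online_cpu(cpu)
+ *		do_something(cpu);
+ *	put_online_cpus();
+ */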
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The following two APIs must be used when attempting
+ * to serialize updates to cpu_online_map and cpu_present_map.
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+
+/*
+ * This ensures that the hotplug operation can begin only when the
+ * refcount goes to zero.
+ *
+ * Note that during a cpu-hotplug operation, the new readers, if any,
+ * will be blocked by the cpu_hotplug.lock
+ *
+ * Since cpu_hotplug_begin() is always called after invoking
+ * cpu_maps_update_begin(), we can be sure that only one writer is active.
+ *
+ * Note that theoretically, there is a possibility of a livelock:
+ * - Refcount goes to zero, last reader wakes up the sleeping
+ * writer.
+ * - Last reader unlocks the cpu_hotplug.lock.
+ * - A new reader arrives at this moment, bumps up the refcount.
+ * - The writer acquires the cpu_hotplug.lock, finds the refcount
+ *   non-zero and goes to sleep again.
+ *
+ * However, this is very difficult to achieve in practice since
+ * get_online_cpus() is not an API that is called all that often.
+ *
+ */
+static void cpu_hotplug_begin(void)
+{
+ cpu_hotplug.active_writer = current;
+
+ for (;;) {
+ mutex_lock(&cpu_hotplug.lock);
+ if (likely(!cpu_hotplug.refcount))
+ break;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&cpu_hotplug.lock);
+ schedule();
+ }
+}
+
+static void cpu_hotplug_done(void)
+{
+ cpu_hotplug.active_writer = NULL;
+ mutex_unlock(&cpu_hotplug.lock);
+}
+/* Need to know about CPUs going up/down? */
+int __ref register_cpu_notifier(struct notifier_block *nb)
+{
+ int ret;
+ cpu_maps_update_begin();
+ ret = raw_notifier_chain_register(&cpu_chain, nb);
+ cpu_maps_update_done();
+ return ret;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+EXPORT_SYMBOL(register_cpu_notifier);
+
+void __ref unregister_cpu_notifier(struct notifier_block *nb)
+{
+ cpu_maps_update_begin();
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+ cpu_maps_update_done();
+}
+EXPORT_SYMBOL(unregister_cpu_notifier);
+
+static inline void check_for_tasks(int cpu)
+{
+ struct task_struct *p;
+
+ write_lock_irq(&tasklist_lock);
+ for_each_process(p) {
+ if (task_cpu(p) == cpu &&
+ (!cputime_eq(p->utime, cputime_zero) ||
+ !cputime_eq(p->stime, cputime_zero)))
+			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
+				"(state = %ld, flags = %x)\n",
+ p->comm, task_pid_nr(p), cpu,
+ p->state, p->flags);
+ }
+ write_unlock_irq(&tasklist_lock);
+}
+
+struct take_cpu_down_param {
+ unsigned long mod;
+ void *hcpu;
+};
+
+/* Take this CPU down. */
+static int __ref take_cpu_down(void *_param)
+{
+ struct take_cpu_down_param *param = _param;
+ int err;
+
+ /* Ensure this CPU doesn't handle any more interrupts. */
+ err = __cpu_disable();
+ if (err < 0)
+ return err;
+
+ raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+ param->hcpu);
+
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();
+ return 0;
+}
+
+/* Requires cpu_add_remove_lock to be held */
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+{
+ int err, nr_calls = 0;
+ cpumask_t old_allowed, tmp;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct take_cpu_down_param tcd_param = {
+ .mod = mod,
+ .hcpu = hcpu,
+ };
+
+ if (num_online_cpus() == 1)
+ return -EBUSY;
+
+ if (!cpu_online(cpu))
+ return -EINVAL;
+
+ cpu_hotplug_begin();
+ err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
+ hcpu, -1, &nr_calls);
+ if (err == NOTIFY_BAD) {
+ nr_calls--;
+ __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu, nr_calls, NULL);
+ printk("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
+ err = -EINVAL;
+ goto out_release;
+ }
+
+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ cpus_setall(tmp);
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed_ptr(current, &tmp);
+ tmp = cpumask_of_cpu(cpu);
+
+ err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
+ if (err) {
+ /* CPU didn't die: tell everyone. Can't complain. */
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu) == NOTIFY_BAD)
+ BUG();
+
+ goto out_allowed;
+ }
+ BUG_ON(cpu_online(cpu));
+
+ /* Wait for it to sleep (leaving idle task). */
+ while (!idle_cpu(cpu))
+ yield();
+
+ /* This actually kills the CPU. */
+ __cpu_die(cpu);
+
+ /* CPU is completely dead: tell everyone. Too late to complain. */
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
+ hcpu) == NOTIFY_BAD)
+ BUG();
+
+ check_for_tasks(cpu);
+
+out_allowed:
+ set_cpus_allowed_ptr(current, &old_allowed);
+out_release:
+ cpu_hotplug_done();
+ if (!err) {
+ if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
+ hcpu) == NOTIFY_BAD)
+ BUG();
+ }
+ return err;
+}
+
+int __ref cpu_down(unsigned int cpu)
+{
+ int err = 0;
+
+ cpu_maps_update_begin();
+
+ if (cpu_hotplug_disabled) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ cpu_clear(cpu, cpu_active_map);
+
+ /*
+	 * Make sure all cpus did the reschedule and are not
+	 * using a stale version of cpu_active_map.
+	 * This is not strictly necessary because stop_machine()
+ * that we run down the line already provides the required
+ * synchronization. But it's really a side effect and we do not
+ * want to depend on the innards of the stop_machine here.
+ */
+ synchronize_sched();
+
+ err = _cpu_down(cpu, 0);
+
+ if (cpu_online(cpu))
+ cpu_set(cpu, cpu_active_map);
+
+out:
+ cpu_maps_update_done();
+ return err;
+}
+EXPORT_SYMBOL(cpu_down);
+#endif /*CONFIG_HOTPLUG_CPU*/
+
+/* Requires cpu_add_remove_lock to be held */
+static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+{
+ int ret, nr_calls = 0;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+
+ if (cpu_online(cpu) || !cpu_present(cpu))
+ return -EINVAL;
+
+ cpu_hotplug_begin();
+ ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
+ -1, &nr_calls);
+ if (ret == NOTIFY_BAD) {
+ nr_calls--;
+ printk("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
+ ret = -EINVAL;
+ goto out_notify;
+ }
+
+ /* Arch-specific enabling code. */
+ ret = __cpu_up(cpu);
+ if (ret != 0)
+ goto out_notify;
+ BUG_ON(!cpu_online(cpu));
+
+ cpu_set(cpu, cpu_active_map);
+
+ /* Now call notifier in preparation. */
+ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
+
+out_notify:
+ if (ret != 0)
+ __raw_notifier_call_chain(&cpu_chain,
+ CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ cpu_hotplug_done();
+
+ return ret;
+}
+
+int __cpuinit cpu_up(unsigned int cpu)
+{
+ int err = 0;
+ if (!cpu_isset(cpu, cpu_possible_map)) {
+ printk(KERN_ERR "can't online cpu %d because it is not "
+ "configured as may-hotadd at boot time\n", cpu);
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+ printk(KERN_ERR "please check additional_cpus= boot "
+ "parameter\n");
+#endif
+ return -EINVAL;
+ }
+
+ cpu_maps_update_begin();
+
+ if (cpu_hotplug_disabled) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = _cpu_up(cpu, 0);
+
+out:
+ cpu_maps_update_done();
+ return err;
+}
+
+#ifdef CONFIG_PM_SLEEP_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+ int cpu, first_cpu, error = 0;
+
+ cpu_maps_update_begin();
+ first_cpu = first_cpu(cpu_online_map);
+	/* We take down all of the non-boot CPUs in one shot to avoid races
+	 * with userspace trying to use CPU hotplug at the same time.
+	 */
+ cpus_clear(frozen_cpus);
+ printk("Disabling non-boot CPUs ...\n");
+ for_each_online_cpu(cpu) {
+ if (cpu == first_cpu)
+ continue;
+ error = _cpu_down(cpu, 1);
+ if (!error) {
+ cpu_set(cpu, frozen_cpus);
+ printk("CPU%d is down\n", cpu);
+ } else {
+ printk(KERN_ERR "Error taking CPU%d down: %d\n",
+ cpu, error);
+ break;
+ }
+ }
+ if (!error) {
+ BUG_ON(num_online_cpus() > 1);
+ /* Make sure the CPUs won't be enabled by someone else */
+ cpu_hotplug_disabled = 1;
+ } else {
+ printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+ }
+ cpu_maps_update_done();
+ return error;
+}
+
+void __ref enable_nonboot_cpus(void)
+{
+ int cpu, error;
+
+ /* Allow everyone to use the CPU hotplug again */
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ if (cpus_empty(frozen_cpus))
+ goto out;
+
+ printk("Enabling non-boot CPUs ...\n");
+ for_each_cpu_mask_nr(cpu, frozen_cpus) {
+ error = _cpu_up(cpu, 1);
+ if (!error) {
+ printk("CPU%d is up\n", cpu);
+ continue;
+ }
+ printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+ }
+ cpus_clear(frozen_cpus);
+out:
+ cpu_maps_update_done();
+}
+#endif /* CONFIG_PM_SLEEP_SMP */
+
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void __cpuinit notify_cpu_starting(unsigned int cpu)
+{
+ unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+ if (cpu_isset(cpu, frozen_cpus))
+ val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+ raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * cpu_bit_bitmap[] is a special, "compressed" data structure that
+ * represents all NR_CPUS bits binary values of 1<<nr.
+ *
+ * It is used by cpumask_of_cpu() to get a constant address to a CPU
+ * mask value that has a single bit set only.
+ */
+
+/* cpu_bit_bitmap[0] is empty - so we can back into it */
+#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
+#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
+#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
+
+const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
+
+ MASK_DECLARE_8(0), MASK_DECLARE_8(8),
+ MASK_DECLARE_8(16), MASK_DECLARE_8(24),
+#if BITS_PER_LONG > 32
+ MASK_DECLARE_8(32), MASK_DECLARE_8(40),
+ MASK_DECLARE_8(48), MASK_DECLARE_8(56),
+#endif
+};
+EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
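+
+/*
+ * Rough sketch of how this is consumed (see cpumask.h): cpumask_of_cpu(cpu)
+ * takes the row for (cpu % BITS_PER_LONG) and steps the pointer back by
+ * (cpu / BITS_PER_LONG) longs; because only column 0 of each row is non-zero
+ * and row 0 is empty, the resulting window still contains exactly one set
+ * bit - bit 'cpu'.
+ */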
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
new file mode 100644
index 0000000..96c0ba1
--- /dev/null
+++ b/kernel/cpuset.c
@@ -0,0 +1,2467 @@
+/*
+ * kernel/cpuset.c
+ *
+ * Processor and Memory placement constraints for sets of tasks.
+ *
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2007 Silicon Graphics, Inc.
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * 2006 Rework by Paul Menage to use generic cgroups
+ * 2008 Rework of the scheduler domains and CPU hotplug handling
+ * by Max Krasnyansky
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/memory.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
+
+/*
+ * Tracks how many cpusets are currently defined in the system.
+ * When there is only one cpuset (the root cpuset) we can
+ * short circuit some hooks.
+ */
+int number_of_cpusets __read_mostly;
+
+/* Forward declare cgroup structures */
+struct cgroup_subsys cpuset_subsys;
+struct cpuset;
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
+
+struct cpuset {
+ struct cgroup_subsys_state css;
+
+ unsigned long flags; /* "unsigned long" so bitops work */
+ cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
+ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ struct cpuset *parent; /* my parent */
+
+ /*
+ * Copy of global cpuset_mems_generation as of the most
+ * recent time this cpuset changed its mems_allowed.
+ */
+ int mems_generation;
+
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /* partition number for rebuild_sched_domains() */
+ int pn;
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
+	/* used for walking a cpuset hierarchy */
+ struct list_head stack_list;
+};
+
+/* Retrieve the cpuset for a cgroup */
+static inline struct cpuset *cgroup_cs(struct cgroup *cont)
+{
+ return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
+ struct cpuset, css);
+}
+
+/* Retrieve the cpuset for a task */
+static inline struct cpuset *task_cs(struct task_struct *task)
+{
+ return container_of(task_subsys_state(task, cpuset_subsys_id),
+ struct cpuset, css);
+}
+struct cpuset_hotplug_scanner {
+ struct cgroup_scanner scan;
+ struct cgroup *to;
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* convenient tests for these bits */
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
+/*
+ * Increment this integer every time any cpuset changes its
+ * mems_allowed value. Users of cpusets can track this generation
+ * number, and avoid having to lock and reload mems_allowed unless
+ * the cpuset they're using changes generation.
+ *
+ * A single, global generation is needed because cpuset_attach_task() could
+ * reattach a task to a different cpuset, which must not have its
+ * generation numbers aliased with those of that task's previous cpuset.
+ *
+ * Generations are needed for mems_allowed because one task cannot
+ * modify another's memory placement. So we must enable every task,
+ * on every visit to __alloc_pages(), to efficiently check whether
+ * its current->cpuset->mems_allowed has changed, requiring an update
+ * of its current->mems_allowed.
+ *
+ * Since writes to cpuset_mems_generation are guarded by the cgroup lock
+ * there is no need to mark it atomic.
+ */
+static int cpuset_mems_generation;
+
+static struct cpuset top_cpuset = {
+ .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .cpus_allowed = CPU_MASK_ALL,
+ .mems_allowed = NODE_MASK_ALL,
+};
+
+/*
+ * There are two global mutexes guarding cpuset structures. The first
+ * is the main control groups cgroup_mutex, accessed via
+ * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
+ * callback_mutex, below. They can nest. It is ok to first take
+ * cgroup_mutex, then nest callback_mutex. We also require taking
+ * task_lock() when dereferencing a task's cpuset pointer. See "The
+ * task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task
+ * holds cgroup_mutex, then it blocks others wanting that mutex,
+ * ensuring that it is the only task able to also acquire callback_mutex
+ * and be able to modify cpusets. It can perform various checks on
+ * the cpuset structure first, knowing nothing will change. It can
+ * also allocate memory while just holding cgroup_mutex. While it is
+ * performing these checks, various callback routines can briefly
+ * acquire callback_mutex to query cpusets. Once it is ready to make
+ * the changes, it takes callback_mutex, blocking everyone else.
+ *
+ * Calls to the kernel memory allocator cannot be made while holding
+ * callback_mutex, as that would risk double tripping on callback_mutex
+ * from one of the callbacks into the cpuset code from within
+ * __alloc_pages().
+ *
+ * If a task is only holding callback_mutex, then it has read-only
+ * access to cpusets.
+ *
+ * The task_struct fields mems_allowed and mems_generation may only
+ * be accessed in the context of that task, so require no locks.
+ *
+ * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ *
+ * Accessing a task's cpuset should be done in accordance with the
+ * guidelines for accessing subsystem state in kernel/cgroup.c
+ */
+
+static DEFINE_MUTEX(callback_mutex);
+
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
+ */
+static int cpuset_get_sb(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ struct file_system_type *cgroup_fs = get_fs_type("cgroup");
+ int ret = -ENODEV;
+ if (cgroup_fs) {
+ char mountopts[] =
+ "cpuset,noprefix,"
+ "release_agent=/sbin/cpuset_release_agent";
+ ret = cgroup_fs->get_sb(cgroup_fs, flags,
+ unused_dev_name, mountopts, mnt);
+ put_filesystem(cgroup_fs);
+ }
+ return ret;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .get_sb = cpuset_get_sb,
+};
+
+/*
+ * Return in *pmask the portion of a cpuset's cpus_allowed that
+ * are online. If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online cpus. If we get
+ * all the way to the top and still haven't found any online cpus,
+ * return cpu_online_map. Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_map.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of cpu_online_map.
+ *
+ * Call with callback_mutex held.
+ */
+
+static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
+{
+ while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
+ cs = cs->parent;
+ if (cs)
+ cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
+ else
+ *pmask = cpu_online_map;
+ BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
+}
+
+/*
+ * Return in *pmask the portion of a cpuset's mems_allowed that
+ * are online, with memory. If none are online with memory, walk
+ * up the cpuset hierarchy until we find one that does have some
+ * online mems. If we get all the way to the top and still haven't
+ * found any online mems, return node_states[N_HIGH_MEMORY].
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of node_states[N_HIGH_MEMORY].
+ *
+ * Call with callback_mutex held.
+ */
+
+static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+{
+ while (cs && !nodes_intersects(cs->mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ cs = cs->parent;
+ if (cs)
+ nodes_and(*pmask, cs->mems_allowed,
+ node_states[N_HIGH_MEMORY]);
+ else
+ *pmask = node_states[N_HIGH_MEMORY];
+ BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+}
+
+/**
+ * cpuset_update_task_memory_state - update task memory placement
+ *
+ * If the current task's cpuset's mems_allowed changed behind our
+ * back, update current->mems_allowed, mems_generation and task NUMA
+ * mempolicy to the new value.
+ *
+ * Task mempolicy is updated by rebinding it relative to the
+ * current->cpuset if a task has its memory placement changed.
+ * Do not call this routine if in_interrupt().
+ *
+ * Call without callback_mutex or task_lock() held. May be
+ * called with or without cgroup_mutex held. Thanks in part to
+ * 'the_top_cpuset_hack', the task's cpuset pointer will never
+ * be NULL. This routine also might acquire callback_mutex during
+ * call.
+ *
+ * Reading current->cpuset->mems_generation doesn't need task_lock
+ * to guard the current->cpuset dereference, because it is guarded
+ * from concurrent freeing of current->cpuset using RCU.
+ *
+ * The rcu_dereference() is technically probably not needed,
+ * as I don't actually mind if I see a new cpuset pointer but
+ * an old value of mems_generation. However this really only
+ * matters on alpha systems using cpusets heavily. If I dropped
+ * that rcu_dereference(), it would save them a memory barrier.
+ * For all other arch's, rcu_dereference is a no-op anyway, and for
+ * alpha systems not using cpusets, another planned optimization,
+ * avoiding the rcu critical section for tasks in the root cpuset
+ * which is statically allocated, so can't vanish, will make this
+ * irrelevant. Better to use RCU as intended, than to engage in
+ * some cute trick to save a memory barrier that is impossible to
+ * test, for alpha systems using cpusets heavily, which might not
+ * even exist.
+ *
+ * This routine is needed to update the per-task mems_allowed data,
+ * within the tasks context, when it is trying to allocate memory
+ * (in various mm/mempolicy.c routines) and notices that some other
+ * task has been modifying its cpuset.
+ */
+
+void cpuset_update_task_memory_state(void)
+{
+ int my_cpusets_mem_gen;
+ struct task_struct *tsk = current;
+ struct cpuset *cs;
+
+ if (task_cs(tsk) == &top_cpuset) {
+ /* Don't need rcu for top_cpuset. It's never freed. */
+ my_cpusets_mem_gen = top_cpuset.mems_generation;
+ } else {
+ rcu_read_lock();
+ my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
+ rcu_read_unlock();
+ }
+
+ if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
+ mutex_lock(&callback_mutex);
+ task_lock(tsk);
+ cs = task_cs(tsk); /* Maybe changed when task not locked */
+ guarantee_online_mems(cs, &tsk->mems_allowed);
+ tsk->cpuset_mems_generation = cs->mems_generation;
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
+ task_unlock(tsk);
+ mutex_unlock(&callback_mutex);
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
+ }
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set. Call holding cgroup_mutex.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+ return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+ nodes_subset(p->mems_allowed, q->mems_allowed) &&
+ is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+ is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * validate_change() - Used to validate that any proposed cpuset change
+ * follows the structural rules for cpusets.
+ *
+ * If we replaced the flag and mask values of the current cpuset
+ * (cur) with those values in the trial cpuset (trial), would
+ * our various subset and exclusive rules still be valid? Presumes
+ * cgroup_mutex held.
+ *
+ * 'cur' is the address of an actual, in-use cpuset. Operations
+ * such as list traversal that depend on the actual address of the
+ * cpuset in the list must use cur below, not trial.
+ *
+ * 'trial' is the address of bulk structure copy of cur, with
+ * perhaps one or more of the fields cpus_allowed, mems_allowed,
+ * or flags changed to new, trial values.
+ *
+ * Return 0 if valid, -errno if not.
+ */
+
+static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+{
+ struct cgroup *cont;
+ struct cpuset *c, *par;
+
+ /* Each of our child cpusets must be a subset of us */
+ list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
+ if (!is_cpuset_subset(cgroup_cs(cont), trial))
+ return -EBUSY;
+ }
+
+ /* Remaining checks don't apply to root cpuset */
+ if (cur == &top_cpuset)
+ return 0;
+
+ par = cur->parent;
+
+ /* We must be a subset of our parent cpuset */
+ if (!is_cpuset_subset(trial, par))
+ return -EACCES;
+
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
+ c = cgroup_cs(cont);
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
+ return -EINVAL;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ return -EINVAL;
+ }
+
+ /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
+ if (cgroup_task_count(cur->css.cgroup)) {
+ if (cpus_empty(trial->cpus_allowed) ||
+ nodes_empty(trial->mems_allowed)) {
+ return -ENOSPC;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping cpus_allowed masks?
+ */
+static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+ return;
+}
+
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ LIST_HEAD(q);
+
+ list_add(&c->stack_list, &q);
+ while (!list_empty(&q)) {
+ struct cpuset *cp;
+ struct cgroup *cont;
+ struct cpuset *child;
+
+ cp = list_first_entry(&q, struct cpuset, stack_list);
+ list_del(q.next);
+
+ if (cpus_empty(cp->cpus_allowed))
+ continue;
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, &q);
+ }
+ }
+}
+
+/*
+ * generate_sched_domains()
+ *
+ * This function builds a partial partition of the system's CPUs.
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
+ *
+ * See "What is sched_load_balance" in Documentation/cpusets.txt
+ * for a background explanation of this.
+ *
+ * Does not return errors, on the theory that the callers of this
+ * routine would rather not worry about failures to rebuild sched
+ * domains when operating in the severe memory shortage situations
+ * that could cause allocation failures below.
+ *
+ * Must be called with cgroup_lock held.
+ *
+ * The three key local variables below are:
+ * q - a linked-list queue of cpuset pointers, used to implement a
+ * top-down scan of all cpusets. This scan loads a pointer
+ * to each cpuset marked is_sched_load_balance into the
+ * array 'csa'. For our purposes, rebuilding the schedulers
+ * sched domains, we can ignore !is_sched_load_balance cpusets.
+ * csa - (for CpuSet Array) Array of pointers to all the cpusets
+ * that need to be load balanced, for convenient iterative
+ * access by the subsequent code that finds the best partition,
+ *	i.e. the set of domains (subsets) of CPUs such that the
+ * cpus_allowed of every cpuset marked is_sched_load_balance
+ * is a subset of one of these domains, while there are as
+ * many such domains as possible, each as small as possible.
+ * doms - Conversion of 'csa' to an array of cpumasks, for passing to
+ * the kernel/sched.c routine partition_sched_domains() in a
+ * convenient format, that can be easily compared to the prior
+ * value to determine what partition elements (sched domains)
+ * were changed (added or removed.)
+ *
+ * Finding the best partition (set of domains):
+ * The triple nested loops below over i, j, k scan over the
+ * load balanced cpusets (using the array of cpuset pointers in
+ * csa[]) looking for pairs of cpusets that have overlapping
+ * cpus_allowed, but which don't have the same 'pn' partition
+ *	number, and puts them in the same partition number. It keeps
+ * looping on the 'restart' label until it can no longer find
+ * any such pairs.
+ *
+ * The union of the cpus_allowed masks from the set of
+ * all cpusets having the same 'pn' value then form the one
+ * element of the partition (one sched domain) to be passed to
+ * partition_sched_domains().
+ */
+static int generate_sched_domains(cpumask_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ LIST_HEAD(q); /* queue of cpusets to be scanned */
+ struct cpuset *cp; /* scans q */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j, k; /* indices for partition finding loops */
+ cpumask_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] cpumask_t slot */
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (is_sched_load_balance(&top_cpuset)) {
+ doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ *doms = top_cpuset.cpus_allowed;
+
+ ndoms = 1;
+ goto done;
+ }
+
+ csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ list_add(&top_cpuset.stack_list, &q);
+ while (!list_empty(&q)) {
+ struct cgroup *cont;
+ struct cpuset *child; /* scans child cpusets of cp */
+
+ cp = list_first_entry(&q, struct cpuset, stack_list);
+ list_del(q.next);
+
+ if (cpus_empty(cp->cpus_allowed))
+ continue;
+
+ /*
+ * All child cpusets contain a subset of the parent's cpus, so
+ * just skip them, and then we call update_domain_attr_tree()
+ * to calc relax_domain_level of the corresponding sched
+ * domain.
+ */
+ if (is_sched_load_balance(cp)) {
+ csa[csn++] = cp;
+ continue;
+ }
+
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, &q);
+ }
+ }
+
+ for (i = 0; i < csn; i++)
+ csa[i]->pn = i;
+ ndoms = csn;
+
+restart:
+ /* Find the best partition (set of sched domains) */
+ for (i = 0; i < csn; i++) {
+ struct cpuset *a = csa[i];
+ int apn = a->pn;
+
+ for (j = 0; j < csn; j++) {
+ struct cpuset *b = csa[j];
+ int bpn = b->pn;
+
+ if (apn != bpn && cpusets_overlap(a, b)) {
+ for (k = 0; k < csn; k++) {
+ struct cpuset *c = csa[k];
+
+ if (c->pn == bpn)
+ c->pn = apn;
+ }
+ ndoms--; /* one less element */
+ goto restart;
+ }
+ }
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ struct cpuset *a = csa[i];
+ cpumask_t *dp;
+ int apn = a->pn;
+
+ if (apn < 0) {
+ /* Skip completed partitions */
+ continue;
+ }
+
+ dp = doms + nslot;
+
+ if (nslot == ndoms) {
+ static int warnings = 10;
+ if (warnings) {
+ printk(KERN_WARNING
+ "rebuild_sched_domains confused:"
+ " nslot %d, ndoms %d, csn %d, i %d,"
+ " apn %d\n",
+ nslot, ndoms, csn, i, apn);
+ warnings--;
+ }
+ continue;
+ }
+
+ cpus_clear(*dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ for (j = i; j < csn; j++) {
+ struct cpuset *b = csa[j];
+
+ if (apn == b->pn) {
+ cpus_or(*dp, *dp, b->cpus_allowed);
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, b);
+
+ /* Done with this partition */
+ b->pn = -1;
+ }
+ }
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
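+
+/*
+ * Worked example (illustrative): with load balancing disabled on the root
+ * cpuset and two child cpusets A = {0,1} and B = {2,3} that both have
+ * sched_load_balance set and non-overlapping cpus_allowed, the scan above
+ * ends with ndoms == 2 and doms[] == { {0,1}, {2,3} }, i.e. two independent
+ * sched domains are handed to partition_sched_domains().
+ */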
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+ struct sched_domain_attr *attr;
+ cpumask_t *doms;
+ int ndoms;
+
+ get_online_cpus();
+
+ /* Generate domain masks and attrs */
+ cgroup_lock();
+ ndoms = generate_sched_domains(&doms, &attr);
+ cgroup_unlock();
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+
+ put_online_cpus();
+}
+
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
+
+/*
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+ schedule_work(&rebuild_sched_domains_work);
+}
+
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+ do_rebuild_sched_domains(NULL);
+}
+
+/**
+ * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
+ * @tsk: task to test
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Called for each task in a cgroup by cgroup_scan_tasks().
+ * Return nonzero if this task's cpus_allowed mask should be changed (in other
+ * words, if its mask is not equal to its cpuset's mask).
+ */
+static int cpuset_test_cpumask(struct task_struct *tsk,
+ struct cgroup_scanner *scan)
+{
+ return !cpus_equal(tsk->cpus_allowed,
+ (cgroup_cs(scan->cg))->cpus_allowed);
+}
+
+/**
+ * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
+ * @tsk: task to test
+ * @scan: struct cgroup_scanner containing the cgroup of the task
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup whose
+ * cpus_allowed mask needs to be changed.
+ *
+ * We don't need to re-check for the cgroup/cpuset membership, since we're
+ * holding cgroup_lock() at this point.
+ */
+static void cpuset_change_cpumask(struct task_struct *tsk,
+ struct cgroup_scanner *scan)
+{
+ set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
+}
+
+/**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
+ */
+static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
+{
+ struct cgroup_scanner scan;
+
+ scan.cg = cs->css.cgroup;
+ scan.test_task = cpuset_test_cpumask;
+ scan.process_task = cpuset_change_cpumask;
+ scan.heap = heap;
+ cgroup_scan_tasks(&scan);
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, const char *buf)
+{
+ struct ptr_heap heap;
+ struct cpuset trialcs;
+ int retval;
+ int is_load_balanced;
+
+ /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
+ trialcs = *cs;
+
+ /*
+ * An empty cpus_allowed is ok only if the cpuset has no tasks.
+ * Since cpulist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have cpus.
+ */
+ if (!*buf) {
+ cpus_clear(trialcs.cpus_allowed);
+ } else {
+ retval = cpulist_parse(buf, trialcs.cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+ return -EINVAL;
+ }
+ retval = validate_change(cs, &trialcs);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+ return 0;
+
+ retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (retval)
+ return retval;
+
+ is_load_balanced = is_sched_load_balance(&trialcs);
+
+ mutex_lock(&callback_mutex);
+ cs->cpus_allowed = trialcs.cpus_allowed;
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * Scan tasks in the cpuset, and update the cpumasks of any
+ * that need an update.
+ */
+ update_tasks_cpumask(cs, &heap);
+
+ heap_free(&heap);
+
+ if (is_load_balanced)
+ async_rebuild_sched_domains();
+ return 0;
+}
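+
+/*
+ * For example, writing "0-3,6" to a cpuset's 'cpus' file reaches
+ * update_cpumask() via cpuset_write_resmask(): cpulist_parse() turns the
+ * string into a cpumask, validate_change() rejects changes that would,
+ * for instance, leave a cpuset with tasks but no cpus, every task in the
+ * cpuset then gets the new mask, and the sched domains are rebuilt
+ * asynchronously if the cpuset takes part in load balancing.
+ */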
+
+/*
+ * cpuset_migrate_mm
+ *
+ * Migrate memory region from one set of nodes to another.
+ *
+ * Temporarily set the task's mems_allowed to the target nodes of migration,
+ * so that the migration code can allocate pages on these nodes.
+ *
+ * Call holding cgroup_mutex, so current's cpuset won't change
+ * during this call, as cgroup_mutex holds off any cpuset_attach()
+ * calls. Therefore we don't need to take task_lock around the
+ * call to guarantee_online_mems(), as we know no one is changing
+ * our task's cpuset.
+ *
+ * Hold callback_mutex around the two modifications of our tasks
+ * mems_allowed to synchronize with cpuset_mems_allowed().
+ *
+ * While the mm_struct we are migrating is typically from some
+ * other task, the task_struct mems_allowed that we are hacking
+ * is for our current task, which must allocate new pages for that
+ * migrating memory region.
+ *
+ * We call cpuset_update_task_memory_state() before hacking
+ * our tasks mems_allowed, so that we are assured of being in
+ * sync with our tasks cpuset, and in particular, callbacks to
+ * cpuset_update_task_memory_state() from nested page allocations
+ * won't see any mismatch of our cpuset and task mems_generation
+ * values, so won't overwrite our hacked tasks mems_allowed
+ * nodemask.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to)
+{
+ struct task_struct *tsk = current;
+
+ cpuset_update_task_memory_state();
+
+ mutex_lock(&callback_mutex);
+ tsk->mems_allowed = *to;
+ mutex_unlock(&callback_mutex);
+
+ do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+
+ mutex_lock(&callback_mutex);
+	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
+ mutex_unlock(&callback_mutex);
+}
+
+static void *cpuset_being_rebound;
+
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+{
+ struct task_struct *p;
+ struct mm_struct **mmarray;
+ int i, n, ntasks;
+ int migrate;
+ int fudge;
+ struct cgroup_iter it;
+ int retval;
+
+ cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
+
+ fudge = 10; /* spare mmarray[] slots */
+ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+ retval = -ENOMEM;
+
+ /*
+ * Allocate mmarray[] to hold mm reference for each task
+ * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
+ * tasklist_lock. We could use GFP_ATOMIC, but with a
+ * few more lines of code, we can retry until we get a big
+ * enough mmarray[] w/o using GFP_ATOMIC.
+ */
+ while (1) {
+ ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
+ ntasks += fudge;
+ mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+ if (!mmarray)
+ goto done;
+ read_lock(&tasklist_lock); /* block fork */
+ if (cgroup_task_count(cs->css.cgroup) <= ntasks)
+ break; /* got enough */
+ read_unlock(&tasklist_lock); /* try again */
+ kfree(mmarray);
+ }
+
+ n = 0;
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+ cgroup_iter_start(cs->css.cgroup, &it);
+ while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
+ struct mm_struct *mm;
+
+ if (n >= ntasks) {
+ printk(KERN_WARNING
+ "Cpuset mempolicy rebind incomplete.\n");
+ break;
+ }
+ mm = get_task_mm(p);
+ if (!mm)
+ continue;
+ mmarray[n++] = mm;
+ }
+ cgroup_iter_end(cs->css.cgroup, &it);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Now that we've dropped the tasklist spinlock, we can
+ * rebind the vma mempolicies of each mm in mmarray[] to their
+ * new cpuset, and release that mm. The mpol_rebind_mm()
+ * call takes mmap_sem, which we couldn't take while holding
+ * tasklist_lock. Forks can happen again now - the mpol_dup()
+ * cpuset_being_rebound check will catch such forks, and rebind
+ * their vma mempolicies too. Because we still hold the global
+ * cgroup_mutex, we know that no other rebind effort will
+ * be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ migrate = is_memory_migrate(cs);
+ for (i = 0; i < n; i++) {
+ struct mm_struct *mm = mmarray[i];
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+ mmput(mm);
+ }
+
+ /* We're done rebinding vmas to this cpuset's new mems_allowed. */
+ kfree(mmarray);
+ cpuset_being_rebound = NULL;
+ retval = 0;
+done:
+ return retval;
+}
+
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+ struct cpuset trialcs;
+ nodemask_t oldmem;
+ int retval;
+
+ /*
+	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
+ * it's read-only
+ */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
+ trialcs = *cs;
+
+ /*
+ * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+ * Since nodelist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have memory.
+ */
+ if (!*buf) {
+ nodes_clear(trialcs.mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs.mems_allowed);
+ if (retval < 0)
+ goto done;
+
+ if (!nodes_subset(trialcs.mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ return -EINVAL;
+ }
+ oldmem = cs->mems_allowed;
+ if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+ retval = 0; /* Too easy - nothing to do */
+ goto done;
+ }
+ retval = validate_change(cs, &trialcs);
+ if (retval < 0)
+ goto done;
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = trialcs.mems_allowed;
+ cs->mems_generation = cpuset_mems_generation++;
+ mutex_unlock(&callback_mutex);
+
+ retval = update_tasks_nodemask(cs, &oldmem);
+done:
+ return retval;
+}
+
+int current_cpuset_is_being_rebound(void)
+{
+ return task_cs(current) == cpuset_being_rebound;
+}
+
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+ if (val < -1 || val >= SD_LV_MAX)
+ return -EINVAL;
+
+ if (val != cs->relax_domain_level) {
+ cs->relax_domain_level = val;
+ if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+ async_rebuild_sched_domains();
+ }
+
+ return 0;
+}
+
+/*
+ * update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit: the bit to update (see cpuset_flagbits_t)
+ * cs: the cpuset to update
+ * turning_on: whether the flag is being set or cleared
+ *
+ * Call with cgroup_mutex held.
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on)
+{
+ struct cpuset trialcs;
+ int err;
+ int balance_flag_changed;
+
+ trialcs = *cs;
+ if (turning_on)
+ set_bit(bit, &trialcs.flags);
+ else
+ clear_bit(bit, &trialcs.flags);
+
+ err = validate_change(cs, &trialcs);
+ if (err < 0)
+ return err;
+
+ balance_flag_changed = (is_sched_load_balance(cs) !=
+ is_sched_load_balance(&trialcs));
+
+ mutex_lock(&callback_mutex);
+ cs->flags = trialcs.flags;
+ mutex_unlock(&callback_mutex);
+
+ if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
+ async_rebuild_sched_domains();
+
+ return 0;
+}
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000. At constant
+ * rates between one per msec and one per second, it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event. At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933 /* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
+#define FM_SCALE 1000 /* faux fixed point scale */
+
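+/*
+ * Worked example of the arithmetic above: each elapsed second multiplies
+ * the stored value by FM_COEF/FM_SCALE = 0.933, and 0.933^10 is roughly
+ * 0.5, which is where the ten second half-life comes from.  A steady rate
+ * of one event per second adds (FM_SCALE - FM_COEF) = 67 per tick and
+ * settles where decay balances input, i.e. at about 1 * FM_SCALE = 1000.
+ */
+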
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+ fmp->cnt = 0;
+ fmp->val = 0;
+ fmp->time = 0;
+ spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+ time_t now = get_seconds();
+ time_t ticks = now - fmp->time;
+
+ if (ticks == 0)
+ return;
+
+ ticks = min(FM_MAXTICKS, ticks);
+ while (ticks-- > 0)
+ fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+ fmp->time = now;
+
+ fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+ fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+ spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+ int val;
+
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ val = fmp->val;
+ spin_unlock(&fmp->lock);
+ return val;
+}
+
+/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont, struct task_struct *tsk)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ return -ENOSPC;
+ if (tsk->flags & PF_THREAD_BOUND) {
+ cpumask_t mask;
+
+ mutex_lock(&callback_mutex);
+ mask = cs->cpus_allowed;
+ mutex_unlock(&callback_mutex);
+ if (!cpus_equal(tsk->cpus_allowed, mask))
+ return -EINVAL;
+ }
+
+ return security_task_setscheduler(tsk, 0, NULL);
+}
+
+static void cpuset_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont, struct cgroup *oldcont,
+ struct task_struct *tsk)
+{
+ cpumask_t cpus;
+ nodemask_t from, to;
+ struct mm_struct *mm;
+ struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *oldcs = cgroup_cs(oldcont);
+ int err;
+
+ mutex_lock(&callback_mutex);
+ guarantee_online_cpus(cs, &cpus);
+ err = set_cpus_allowed_ptr(tsk, &cpus);
+ mutex_unlock(&callback_mutex);
+ if (err)
+ return;
+
+ from = oldcs->mems_allowed;
+ to = cs->mems_allowed;
+ mm = get_task_mm(tsk);
+ if (mm) {
+ mpol_rebind_mm(mm, &to);
+ if (is_memory_migrate(cs))
+ cpuset_migrate_mm(mm, &from, &to);
+ mmput(mm);
+ }
+
+}
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_MEMORY_MIGRATE,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_MEM_HARDWALL,
+ FILE_SCHED_LOAD_BALANCE,
+ FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ FILE_MEMORY_PRESSURE_ENABLED,
+ FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
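+/*
+ * Each value above is stored in the .private field of the matching entry
+ * in files[] below (FILE_CPULIST for "cpus", FILE_MEMLIST for "mems", and
+ * so on); the common read and write handlers switch on cft->private to
+ * pick the right update or query routine.
+ */
+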
+static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ int retval = 0;
+ struct cpuset *cs = cgroup_cs(cgrp);
+ cpuset_filetype_t type = cft->private;
+
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_HARDWALL:
+ retval = update_flag(CS_MEM_HARDWALL, cs, val);
+ break;
+ case FILE_SCHED_LOAD_BALANCE:
+ retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
+ break;
+ case FILE_MEMORY_MIGRATE:
+ retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
+ break;
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ cpuset_memory_pressure_enabled = !!val;
+ break;
+ case FILE_MEMORY_PRESSURE:
+ retval = -EACCES;
+ break;
+ case FILE_SPREAD_PAGE:
+ retval = update_flag(CS_SPREAD_PAGE, cs, val);
+ cs->mems_generation = cpuset_mems_generation++;
+ break;
+ case FILE_SPREAD_SLAB:
+ retval = update_flag(CS_SPREAD_SLAB, cs, val);
+ cs->mems_generation = cpuset_mems_generation++;
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+ cgroup_unlock();
+ return retval;
+}
+
+static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+{
+ int retval = 0;
+ struct cpuset *cs = cgroup_cs(cgrp);
+ cpuset_filetype_t type = cft->private;
+
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+ cgroup_unlock();
+ return retval;
+}
+
+/*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+ const char *buf)
+{
+ int retval = 0;
+
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
+ switch (cft->private) {
+ case FILE_CPULIST:
+ retval = update_cpumask(cgroup_cs(cgrp), buf);
+ break;
+ case FILE_MEMLIST:
+ retval = update_nodemask(cgroup_cs(cgrp), buf);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+ cgroup_unlock();
+ return retval;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map. If read in smaller
+ * chunks, there is no guarantee of atomicity. Since the display format
+ * used, list of ranges of sequential numbers, is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
+ * A single large read to a buffer that crosses a page boundary is
+ * ok, because the result being copied to user land is not recomputed
+ * across a page fault.
+ */
+
+static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+{
+ cpumask_t mask;
+
+ mutex_lock(&callback_mutex);
+ mask = cs->cpus_allowed;
+ mutex_unlock(&callback_mutex);
+
+ return cpulist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+{
+ nodemask_t mask;
+
+ mutex_lock(&callback_mutex);
+ mask = cs->mems_allowed;
+ mutex_unlock(&callback_mutex);
+
+ return nodelist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static ssize_t cpuset_common_file_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct file *file,
+ char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+ cpuset_filetype_t type = cft->private;
+ char *page;
+ ssize_t retval = 0;
+ char *s;
+
+ if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
+ return -ENOMEM;
+
+ s = page;
+
+ switch (type) {
+ case FILE_CPULIST:
+ s += cpuset_sprintf_cpulist(s, cs);
+ break;
+ case FILE_MEMLIST:
+ s += cpuset_sprintf_memlist(s, cs);
+ break;
+ default:
+ retval = -EINVAL;
+ goto out;
+ }
+ *s++ = '\n';
+
+ retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
+out:
+ free_page((unsigned long)page);
+ return retval;
+}
+
+static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ return is_cpu_exclusive(cs);
+ case FILE_MEM_EXCLUSIVE:
+ return is_mem_exclusive(cs);
+ case FILE_MEM_HARDWALL:
+ return is_mem_hardwall(cs);
+ case FILE_SCHED_LOAD_BALANCE:
+ return is_sched_load_balance(cs);
+ case FILE_MEMORY_MIGRATE:
+ return is_memory_migrate(cs);
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ return cpuset_memory_pressure_enabled;
+ case FILE_MEMORY_PRESSURE:
+ return fmeter_getrate(&cs->fmeter);
+ case FILE_SPREAD_PAGE:
+ return is_spread_page(cs);
+ case FILE_SPREAD_SLAB:
+ return is_spread_slab(cs);
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+
+	/* Unreachable but makes gcc happy */
+ return 0;
+}
+
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype files[] = {
+ {
+ .name = "cpus",
+ .read = cpuset_common_file_read,
+ .write_string = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ },
+
+ {
+ .name = "mems",
+ .read = cpuset_common_file_read,
+ .write_string = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ },
+
+ {
+ .name = "cpu_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_CPU_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_hardwall",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_HARDWALL,
+ },
+
+ {
+ .name = "sched_load_balance",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_LOAD_BALANCE,
+ },
+
+ {
+ .name = "sched_relax_domain_level",
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
+ .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ },
+
+ {
+ .name = "memory_migrate",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_MIGRATE,
+ },
+
+ {
+ .name = "memory_pressure",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE,
+ },
+
+ {
+ .name = "memory_spread_page",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_PAGE,
+ },
+
+ {
+ .name = "memory_spread_slab",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_SLAB,
+ },
+};
+
+static struct cftype cft_memory_pressure_enabled = {
+ .name = "memory_pressure_enabled",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+};
+
+static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ int err;
+
+ err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+ if (err)
+ return err;
+ /* memory_pressure_enabled is in root cpuset only */
+ if (!cont->parent)
+ err = cgroup_add_file(cont, ss,
+ &cft_memory_pressure_enabled);
+ return err;
+}
+
+/*
+ * post_clone() is called at the end of cgroup_clone().
+ * 'cgroup' was just created automatically as a result of
+ * a cgroup_clone(), and the current task is about to
+ * be moved into 'cgroup'.
+ *
+ * Currently we refuse to set up the cgroup - thereby
+ * refusing the task to be entered, and as a result refusing
+ * the sys_unshare() or clone() which initiated it - if any
+ * sibling cpusets have exclusive cpus or mem.
+ *
+ * If this becomes a problem for some users who wish to
+ * allow that scenario, then cpuset_post_clone() could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
+ * held.
+ */
+static void cpuset_post_clone(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ struct cgroup *parent, *child;
+ struct cpuset *cs, *parent_cs;
+
+ parent = cgroup->parent;
+ list_for_each_entry(child, &parent->children, sibling) {
+ cs = cgroup_cs(child);
+ if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
+ return;
+ }
+ cs = cgroup_cs(cgroup);
+ parent_cs = cgroup_cs(parent);
+
+ cs->mems_allowed = parent_cs->mems_allowed;
+ cs->cpus_allowed = parent_cs->cpus_allowed;
+ return;
+}
+
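+/*
+ * Note that cpuset_post_clone()'s refusal is indirect: returning early
+ * leaves the new cpuset's cpus_allowed and mems_allowed empty, and
+ * cpuset_can_attach() returns -ENOSPC for an empty cpuset, so the task
+ * cannot subsequently be moved into it.
+ */
+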
+/*
+ * cpuset_create - create a cpuset
+ * ss: cpuset cgroup subsystem
+ * cont: control group that the new cpuset will be part of
+ */
+
+static struct cgroup_subsys_state *cpuset_create(
+ struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct cpuset *cs;
+ struct cpuset *parent;
+
+ if (!cont->parent) {
+ /* This is early initialization for the top cgroup */
+ top_cpuset.mems_generation = cpuset_mems_generation++;
+ return &top_cpuset.css;
+ }
+ parent = cgroup_cs(cont->parent);
+ cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ cpuset_update_task_memory_state();
+ cs->flags = 0;
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ cpus_clear(cs->cpus_allowed);
+ nodes_clear(cs->mems_allowed);
+ cs->mems_generation = cpuset_mems_generation++;
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+
+ cs->parent = parent;
+ number_of_cpusets++;
+	return &cs->css;
+}
+
+/*
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call async_rebuild_sched_domains().
+ */
+
+static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ cpuset_update_task_memory_state();
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ number_of_cpusets--;
+ kfree(cs);
+}
+
+struct cgroup_subsys cpuset_subsys = {
+ .name = "cpuset",
+ .create = cpuset_create,
+ .destroy = cpuset_destroy,
+ .can_attach = cpuset_can_attach,
+ .attach = cpuset_attach,
+ .populate = cpuset_populate,
+ .post_clone = cpuset_post_clone,
+ .subsys_id = cpuset_subsys_id,
+ .early_init = 1,
+};
+
+/*
+ * cpuset_init_early - just enough so that the calls to
+ * cpuset_update_task_memory_state() in early init code
+ * are harmless.
+ */
+
+int __init cpuset_init_early(void)
+{
+ top_cpuset.mems_generation = cpuset_mems_generation++;
+ return 0;
+}
+
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset and the cpuset internal file system,
+ **/
+
+int __init cpuset_init(void)
+{
+ int err = 0;
+
+ cpus_setall(top_cpuset.cpus_allowed);
+ nodes_setall(top_cpuset.mems_allowed);
+
+ fmeter_init(&top_cpuset.fmeter);
+ top_cpuset.mems_generation = cpuset_mems_generation++;
+ set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
+ top_cpuset.relax_domain_level = -1;
+
+ err = register_filesystem(&cpuset_fs_type);
+ if (err < 0)
+ return err;
+
+ number_of_cpusets = 1;
+ return 0;
+}
+
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: pointer to the task_struct of the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Attaches the task to the destination cgroup recorded in the scanner.
+ */
+static void cpuset_do_move_task(struct task_struct *tsk,
+ struct cgroup_scanner *scan)
+{
+ struct cpuset_hotplug_scanner *chsp;
+
+ chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+ cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+ struct cpuset_hotplug_scanner scan;
+
+ scan.scan.cg = from->css.cgroup;
+ scan.scan.test_task = NULL; /* select all tasks in cgroup */
+ scan.scan.process_task = cpuset_do_move_task;
+ scan.scan.heap = NULL;
+ scan.to = to->css.cgroup;
+
+ if (cgroup_scan_tasks(&scan.scan))
+ printk(KERN_ERR "move_member_tasks_to_cpuset: "
+ "cgroup_scan_tasks failed\n");
+}
+
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ *
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * The cgroup's css_sets list is in use if there are tasks
+ * in the cpuset; the list is empty if there are none;
+	 * the cs->css.refcnt seems to always be 0.
+ */
+ if (list_empty(&cs->css.cgroup->css_sets))
+ return;
+
+ /*
+	 * Find its next-highest non-empty parent (the top cpuset
+	 * has online cpus, so it can't be empty).
+ */
+ parent = cs->parent;
+ while (cpus_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent->parent;
+
+ move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such cpuset must be moved to a parent cpuset.
+ *
+ * Called with cgroup_mutex held. We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
+ *
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'. But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ */
+static void scan_for_empty_cpusets(struct cpuset *root)
+{
+ LIST_HEAD(queue);
+ struct cpuset *cp; /* scans cpusets being updated */
+ struct cpuset *child; /* scans child cpusets of cp */
+ struct cgroup *cont;
+ nodemask_t oldmems;
+
+ list_add_tail((struct list_head *)&root->stack_list, &queue);
+
+ while (!list_empty(&queue)) {
+ cp = list_first_entry(&queue, struct cpuset, stack_list);
+ list_del(queue.next);
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, &queue);
+ }
+
+ /* Continue past cpusets with all cpus, mems online */
+ if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+ nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
+ continue;
+
+ oldmems = cp->mems_allowed;
+
+ /* Remove offline cpus and mems from this cpuset. */
+ mutex_lock(&callback_mutex);
+ cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+ nodes_and(cp->mems_allowed, cp->mems_allowed,
+ node_states[N_HIGH_MEMORY]);
+ mutex_unlock(&callback_mutex);
+
+ /* Move tasks from the empty cpuset to a parent */
+ if (cpus_empty(cp->cpus_allowed) ||
+ nodes_empty(cp->mems_allowed))
+ remove_tasks_in_empty_cpuset(cp);
+ else {
+ update_tasks_cpumask(cp, NULL);
+ update_tasks_nodemask(cp, &oldmems);
+ }
+ }
+}
+
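+/*
+ * For example, if the only CPU in some cpuset is taken offline, the AND
+ * with cpu_online_map above empties that cpuset's cpus_allowed, and its
+ * tasks are pushed up to the nearest ancestor that still has both cpus
+ * and mems.
+ */
+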
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no effect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus(). Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
+ */
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
+ unsigned long phase, void *unused_cpu)
+{
+ struct sched_domain_attr *attr;
+ cpumask_t *doms;
+ int ndoms;
+
+ switch (phase) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ cgroup_lock();
+ top_cpuset.cpus_allowed = cpu_online_map;
+ scan_for_empty_cpusets(&top_cpuset);
+ ndoms = generate_sched_domains(&doms, &attr);
+ cgroup_unlock();
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+
+ return NOTIFY_OK;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * See also the previous routine cpuset_track_online_cpus().
+ */
+static int cpuset_track_online_nodes(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ cgroup_lock();
+ switch (action) {
+ case MEM_ONLINE:
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ break;
+ case MEM_OFFLINE:
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ scan_for_empty_cpusets(&top_cpuset);
+ break;
+ default:
+ break;
+ }
+ cgroup_unlock();
+ return NOTIFY_OK;
+}
+#endif
+
+/**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Finish top cpuset after cpu, node maps are initialized
+ **/
+
+void __init cpuset_init_smp(void)
+{
+ top_cpuset.cpus_allowed = cpu_online_map;
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+
+ hotcpu_notifier(cpuset_track_online_cpus, 0);
+ hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
+{
+ mutex_lock(&callback_mutex);
+ cpuset_cpus_allowed_locked(tsk, pmask);
+ mutex_unlock(&callback_mutex);
+}
+
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
+ * Must be called with callback_mutex held.
+ **/
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
+{
+ task_lock(tsk);
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ task_unlock(tsk);
+}
+
+void cpuset_init_current_mems_allowed(void)
+{
+ nodes_setall(current->mems_allowed);
+}
+
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * tasks cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+ nodemask_t mask;
+
+ mutex_lock(&callback_mutex);
+ task_lock(tsk);
+ guarantee_online_mems(task_cs(tsk), &mask);
+ task_unlock(tsk);
+ mutex_unlock(&callback_mutex);
+
+ return mask;
+}
+
+/**
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
+ * @nodemask: the nodemask to be checked
+ *
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
+ */
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
+{
+ return nodes_intersects(*nodemask, current->mems_allowed);
+}
+
+/*
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset. Call holding
+ * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
+ */
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
+{
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
+ cs = cs->parent;
+ return cs;
+}
+
+/**
+ * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate. If
+ * __GFP_THISNODE is set, yes, we can always allocate. If zone
+ * z's node is in our task's mems_allowed, yes. If it's not a
+ * __GFP_HARDWALL request and this zone's node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes.
+ * If the task has been OOM killed and has access to memory reserves
+ * as specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
+ *
+ * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
+ * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
+ * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
+ * from an enclosing cpuset.
+ *
+ * cpuset_zone_allowed_hardwall() only handles the simpler case of
+ * hardwall cpusets, and never sleeps.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first. By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest enclosing hardwalled ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires callback_mutex. The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current tasks mems_allowed came up empty on the first pass over
+ * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_mutex
+ * mutex.
+ *
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * effect that:
+ * in_interrupt - any node ok (current task context irrelevant)
+ * GFP_ATOMIC - any node ok
+ * TIF_MEMDIE - any node ok
+ * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
+ * GFP_USER - only nodes in current tasks mems allowed ok.
+ *
+ * Rule:
+ * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
+ * the code that might scan up ancestor cpusets and sleep.
+ */
+
+int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+ int node; /* node that zone z is on */
+ const struct cpuset *cs; /* current cpuset ancestors */
+ int allowed; /* is allocation in zone z allowed? */
+
+ if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+ return 1;
+ node = zone_to_nid(z);
+ might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
+ if (node_isset(node, current->mems_allowed))
+ return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return 0;
+
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return 1;
+
+ /* Not hardwall and node outside mems_allowed: scan up cpusets */
+ mutex_lock(&callback_mutex);
+
+ task_lock(current);
+ cs = nearest_hardwall_ancestor(task_cs(current));
+ task_unlock(current);
+
+ allowed = node_isset(node, cs->mems_allowed);
+ mutex_unlock(&callback_mutex);
+ return allowed;
+}
+
+/*
+ * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate.
+ * If __GFP_THISNODE is set, yes, we can always allocate. If zone
+ * z's node is in our tasks mems_allowed, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the
+ * TIF_MEMDIE flag, yes. Otherwise, no.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first. By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
+ * Unlike the cpuset_zone_allowed_softwall() variant, above,
+ * this variant requires that the zone be in the current tasks
+ * mems_allowed or that we're in interrupt. It does not scan up the
+ * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
+ * It never sleeps.
+ */
+
+int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+{
+ int node; /* node that zone z is on */
+
+ if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+ return 1;
+ node = zone_to_nid(z);
+ if (node_isset(node, current->mems_allowed))
+ return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
+ return 0;
+}
+
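+/*
+ * Worked example of the difference between the two checks above: a
+ * GFP_KERNEL (not __GFP_HARDWALL) allocation asks about a node that is
+ * outside current->mems_allowed but inside the nearest hardwalled
+ * ancestor's mems_allowed.  The softwall variant scans up to that
+ * ancestor and allows the allocation; the hardwall variant refuses,
+ * since it never looks past the current task's own mems_allowed.
+ */
+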
+/**
+ * cpuset_lock - lock out any changes to cpuset structures
+ *
+ * The out of memory (oom) code needs to mutex_lock cpusets
+ * from being changed while it scans the tasklist looking for a
+ * task in an overlapping cpuset. Expose callback_mutex via this
+ * cpuset_lock() routine, so the oom code can lock it, before
+ * locking the task list. The tasklist_lock is a spinlock, so
+ * must be taken inside callback_mutex.
+ */
+
+void cpuset_lock(void)
+{
+ mutex_lock(&callback_mutex);
+}
+
+/**
+ * cpuset_unlock - release lock on cpuset changes
+ *
+ * Undo the lock taken in a previous cpuset_lock() call.
+ */
+
+void cpuset_unlock(void)
+{
+ mutex_unlock(&callback_mutex);
+}
+
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+ int node;
+
+ node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ if (node == MAX_NUMNODES)
+ node = first_node(current->mems_allowed);
+ current->cpuset_mem_spread_rotor = node;
+ return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
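+/*
+ * For example, with mems_allowed = { 0, 2, 5 } and a rotor of 2, the next
+ * call to cpuset_mem_spread_node() returns node 5; the call after that
+ * runs past the last allowed node and wraps back to node 0, so successive
+ * spread allocations rotate through the allowed nodes.
+ */
+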
+/**
+ * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
+ * @tsk1: pointer to task_struct of some task.
+ * @tsk2: pointer to task_struct of some other task.
+ *
+ * Description: Return true if @tsk1's mems_allowed intersects the
+ * mems_allowed of @tsk2. Used by the OOM killer to determine if
+ * one of the task's memory usage might impact the memory available
+ * to the other.
+ **/
+
+int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
+ const struct task_struct *tsk2)
+{
+ return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+}
+
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernel's page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure". Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+ task_lock(current);
+ fmeter_markevent(&task_cs(current)->fmeter);
+ task_unlock(current);
+}
+
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
+ * doesn't really matter if tsk->cpuset changes after we read it,
+ * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * anyway.
+ */
+static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+{
+ struct pid *pid;
+ struct task_struct *tsk;
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ retval = -ESRCH;
+ pid = m->private;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ if (!tsk)
+ goto out_free;
+
+ retval = -EINVAL;
+ cgroup_lock();
+ css = task_subsys_state(tsk, cpuset_subsys_id);
+ retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ if (retval < 0)
+ goto out_unlock;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+out_unlock:
+ cgroup_unlock();
+ put_task_struct(tsk);
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cpuset_show, pid);
+}
+
+const struct file_operations proc_cpuset_operations = {
+ .open = cpuset_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_PROC_PID_CPUSET */
+
+/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
+{
+ seq_printf(m, "Cpus_allowed:\t");
+ seq_cpumask(m, &task->cpus_allowed);
+ seq_printf(m, "\n");
+ seq_printf(m, "Cpus_allowed_list:\t");
+ seq_cpumask_list(m, &task->cpus_allowed);
+ seq_printf(m, "\n");
+ seq_printf(m, "Mems_allowed:\t");
+ seq_nodemask(m, &task->mems_allowed);
+ seq_printf(m, "\n");
+ seq_printf(m, "Mems_allowed_list:\t");
+ seq_nodemask_list(m, &task->mems_allowed);
+ seq_printf(m, "\n");
+}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 0000000..b3179da
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,183 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+struct kmem_cache *delayacct_cache;
+
+static int __init delayacct_setup_disable(char *str)
+{
+ delayacct_on = 0;
+ return 1;
+}
+__setup("nodelayacct", delayacct_setup_disable);
+
+void delayacct_init(void)
+{
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+ tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
+ if (tsk->delays)
+ spin_lock_init(&tsk->delays->lock);
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+ do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumulator (@total) and @count
+ */
+
+static void delayacct_end(struct timespec *start, struct timespec *end,
+ u64 *total, u32 *count)
+{
+ struct timespec ts;
+ s64 ns;
+ unsigned long flags;
+
+ do_posix_clock_monotonic_gettime(end);
+ ts = timespec_sub(*end, *start);
+ ns = timespec_to_ns(&ts);
+ if (ns < 0)
+ return;
+
+ spin_lock_irqsave(&current->delays->lock, flags);
+ *total += ns;
+ (*count)++;
+ spin_unlock_irqrestore(&current->delays->lock, flags);
+}
+
+void __delayacct_blkio_start(void)
+{
+ delayacct_start(&current->delays->blkio_start);
+}
+
+void __delayacct_blkio_end(void)
+{
+ if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+ /* Swapin block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_end,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+ else /* Other block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_end,
+ &current->delays->blkio_delay,
+ &current->delays->blkio_count);
+}
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+ s64 tmp;
+ unsigned long t1;
+ unsigned long long t2, t3;
+ unsigned long flags;
+ struct timespec ts;
+
+	/* Though tsk->delays is accessed later, an early exit here
+	 * avoids needlessly filling in the rest of the data
+ */
+ if (!tsk->delays)
+ goto done;
+
+ tmp = (s64)d->cpu_run_real_total;
+ cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ tmp += timespec_to_ns(&ts);
+ d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+ tmp = (s64)d->cpu_scaled_run_real_total;
+ cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ tmp += timespec_to_ns(&ts);
+ d->cpu_scaled_run_real_total =
+ (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
+
+ /*
+ * No locking available for sched_info (and too expensive to add one)
+ * Mitigate by taking snapshot of values
+ */
+ t1 = tsk->sched_info.pcount;
+ t2 = tsk->sched_info.run_delay;
+ t3 = tsk->sched_info.cpu_time;
+
+ d->cpu_count += t1;
+
+ tmp = (s64)d->cpu_delay_total + t2;
+ d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+ tmp = (s64)d->cpu_run_virtual_total + t3;
+ d->cpu_run_virtual_total =
+ (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+
+ /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+ spin_lock_irqsave(&tsk->delays->lock, flags);
+ tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+ d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+ tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+ d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+ tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
+ d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
+ d->blkio_count += tsk->delays->blkio_count;
+ d->swapin_count += tsk->delays->swapin_count;
+ d->freepages_count += tsk->delays->freepages_count;
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
+
+done:
+ return 0;
+}
+
+__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
+{
+ __u64 ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->delays->lock, flags);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay +
+ tsk->delays->swapin_delay);
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ return ret;
+}
+
+void __delayacct_freepages_start(void)
+{
+ delayacct_start(&current->delays->freepages_start);
+}
+
+void __delayacct_freepages_end(void)
+{
+ delayacct_end(&current->delays->freepages_start,
+ &current->delays->freepages_end,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
+}
+
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 0000000..f013a0c
--- /dev/null
+++ b/kernel/dma-coherent.c
@@ -0,0 +1,155 @@
+/*
+ * Coherent per-device memory handling.
+ * Borrowed from i386
+ */
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+
+struct dma_coherent_mem {
+ void *virt_base;
+ u32 device_base;
+ int size;
+ int flags;
+ unsigned long *bitmap;
+};
+
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+{
+ void __iomem *mem_base = NULL;
+ int pages = size >> PAGE_SHIFT;
+ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+ goto out;
+ if (!size)
+ goto out;
+ if (dev->dma_mem)
+ goto out;
+
+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+ mem_base = ioremap(bus_addr, size);
+ if (!mem_base)
+ goto out;
+
+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ if (!dev->dma_mem)
+ goto out;
+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dev->dma_mem->bitmap)
+ goto free1_out;
+
+ dev->dma_mem->virt_base = mem_base;
+ dev->dma_mem->device_base = device_addr;
+ dev->dma_mem->size = pages;
+ dev->dma_mem->flags = flags;
+
+ if (flags & DMA_MEMORY_MAP)
+ return DMA_MEMORY_MAP;
+
+ return DMA_MEMORY_IO;
+
+ free1_out:
+ kfree(dev->dma_mem);
+ out:
+ if (mem_base)
+ iounmap(mem_base);
+ return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
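/*
 * Illustrative sketch of how a platform might hand a dedicated memory
 * window to one device as its coherent pool via the function above; the
 * bus/device addresses and the 64 KiB size are invented for the example.
 */
static int example_declare_pool(struct device *dev)
{
	int rc;

	rc = dma_declare_coherent_memory(dev,
					 0x88000000,	/* bus_addr (hypothetical) */
					 0x88000000,	/* device_addr as seen by the device */
					 64 * 1024,
					 DMA_MEMORY_MAP);
	/* Success returns DMA_MEMORY_MAP or DMA_MEMORY_IO; failure returns 0. */
	return rc ? 0 : -ENODEV;
}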
+
+void dma_release_declared_memory(struct device *dev)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+
+ if (!mem)
+ return;
+ dev->dma_mem = NULL;
+ iounmap(mem->virt_base);
+ kfree(mem->bitmap);
+ kfree(mem);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+ dma_addr_t device_addr, size_t size)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+ int pos, err;
+
+ size += device_addr & ~PAGE_MASK;
+
+ if (!mem)
+ return ERR_PTR(-EINVAL);
+
+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
+ if (err != 0)
+ return ERR_PTR(err);
+ return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+/**
+ * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
+ *
+ * @dev: device from which we allocate memory
+ * @size: size of requested memory area
+ * @dma_handle: This will be filled with the correct dma handle
+ * @ret: This pointer will be filled with the virtual address
+ * to allocated area.
+ *
+ * This function should be only called from per-arch dma_alloc_coherent()
+ * to support allocation from per-device coherent memory pools.
+ *
+ * Returns 0 if dma_alloc_coherent should continue with allocating from
+ * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
+ */
+int dma_alloc_from_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle, void **ret)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+ int order = get_order(size);
+
+ if (mem) {
+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
+ order);
+ if (page >= 0) {
+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+ *ret = mem->virt_base + (page << PAGE_SHIFT);
+ memset(*ret, 0, size);
+ } else if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+ *ret = NULL;
+ }
+ return (mem != NULL);
+}
+EXPORT_SYMBOL(dma_alloc_from_coherent);
+
+/**
+ * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
+ * @dev: device from which the memory was allocated
+ * @order: the order of pages allocated
+ * @vaddr: virtual address of allocated pages
+ *
+ * This checks whether the memory was allocated from the per-device
+ * coherent memory pool and if so, releases that memory.
+ *
+ * Returns 1 if we correctly released the memory, or 0 if
+ * dma_release_coherent() should proceed with releasing memory from
+ * generic pools.
+ */
+int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+
+ if (mem && vaddr >= mem->virt_base && vaddr <
+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+ bitmap_release_region(mem->bitmap, page, order);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(dma_release_from_coherent);
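A minimal sketch of the calling convention the kernel-doc above describes: an architecture's dma_alloc_coherent() consults the per-device pool first and only falls back to its generic allocator when dma_alloc_from_coherent() returns 0 (the generic path is omitted here rather than invented):

#include <linux/dma-mapping.h>

static void *example_dma_alloc_coherent(struct device *dev, size_t size,
					dma_addr_t *dma_handle, gfp_t gfp)
{
	void *ret;

	/* The per-device pool takes precedence when one has been declared. */
	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
		return ret;	/* may be NULL for DMA_MEMORY_EXCLUSIVE pools */

	/* Otherwise the arch would allocate from generic memory (omitted). */
	return NULL;
}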
diff --git a/kernel/dma.c b/kernel/dma.c
new file mode 100644
index 0000000..f903189
--- /dev/null
+++ b/kernel/dma.c
@@ -0,0 +1,161 @@
+/*
+ * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
+ *
+ * Written by Hennus Bergman, 1992.
+ *
+ * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
+ * In the previous version the reported device could end up being wrong,
+ * if a device requested a DMA channel that was already in use.
+ * [It also happened to remove the sizeof(char *) == sizeof(int)
+ * assumption introduced because of those /proc/dma patches. -- Hennus]
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <asm/dma.h>
+#include <asm/system.h>
+
+
+
+/* A note on resource allocation:
+ *
+ * All drivers needing DMA channels, should allocate and release them
+ * through the public routines `request_dma()' and `free_dma()'.
+ *
+ * In order to avoid problems, all processes should allocate resources in
+ * the same sequence and release them in the reverse order.
+ *
+ * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
+ * When releasing them, first release the DMA, then release the IRQ.
+ * If you don't, you may cause allocation requests to fail unnecessarily.
+ * This doesn't really matter now, but it will once we get real semaphores
+ * in the kernel.
+ */
+
+
+DEFINE_SPINLOCK(dma_spin_lock);
+
+/*
+ * If our port doesn't define this, it has no PC-like DMA
+ */
+
+#ifdef MAX_DMA_CHANNELS
+
+
+/* Channel n is busy iff dma_chan_busy[n].lock != 0.
+ * DMA0 used to be reserved for DRAM refresh, but apparently not any more...
+ * DMA4 is reserved for cascading.
+ */
+
+struct dma_chan {
+ int lock;
+ const char *device_id;
+};
+
+static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
+ [4] = { 1, "cascade" },
+};
+
+
+/**
+ * request_dma - request and reserve a system DMA channel
+ * @dmanr: DMA channel number
+ * @device_id: reserving device ID string, used in /proc/dma
+ */
+int request_dma(unsigned int dmanr, const char * device_id)
+{
+ if (dmanr >= MAX_DMA_CHANNELS)
+ return -EINVAL;
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
+ return -EBUSY;
+
+ dma_chan_busy[dmanr].device_id = device_id;
+
+ /* old flag was 0, now contains 1 to indicate busy */
+ return 0;
+} /* request_dma */
+
+/**
+ * free_dma - free a reserved system DMA channel
+ * @dmanr: DMA channel number
+ */
+void free_dma(unsigned int dmanr)
+{
+ if (dmanr >= MAX_DMA_CHANNELS) {
+ printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
+ return;
+ }
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
+ printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
+ return;
+ }
+
+} /* free_dma */
+
+#else
+
+int request_dma(unsigned int dmanr, const char *device_id)
+{
+ return -EINVAL;
+}
+
+void free_dma(unsigned int dmanr)
+{
+}
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#ifdef MAX_DMA_CHANNELS
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
+ if (dma_chan_busy[i].lock) {
+ seq_printf(m, "%2d: %s\n", i,
+ dma_chan_busy[i].device_id);
+ }
+ }
+ return 0;
+}
+#else
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "No DMA\n");
+ return 0;
+}
+#endif /* MAX_DMA_CHANNELS */
+
+static int proc_dma_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_dma_show, NULL);
+}
+
+static const struct file_operations proc_dma_operations = {
+ .open = proc_dma_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_dma_init(void)
+{
+ proc_create("dma", 0, NULL, &proc_dma_operations);
+ return 0;
+}
+
+__initcall(proc_dma_init);
+#endif
+
+EXPORT_SYMBOL(request_dma);
+EXPORT_SYMBOL(free_dma);
+EXPORT_SYMBOL(dma_spin_lock);
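A short sketch of the ordering the comment at the top of this file recommends: acquire the IRQ before the DMA channel and release them in reverse order. The IRQ/DMA numbers and the "example" name are invented:

#include <linux/interrupt.h>
#include <asm/dma.h>

#define EXAMPLE_IRQ	7	/* hypothetical */
#define EXAMPLE_DMA	3	/* hypothetical */

static irqreturn_t example_isr(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int example_probe(void)
{
	int err;

	err = request_irq(EXAMPLE_IRQ, example_isr, 0, "example", NULL);
	if (err)
		return err;

	err = request_dma(EXAMPLE_DMA, "example");	/* -EBUSY if already taken */
	if (err) {
		free_irq(EXAMPLE_IRQ, NULL);
		return err;
	}
	return 0;
}

static void example_remove(void)
{
	free_dma(EXAMPLE_DMA);		/* release the DMA channel first ... */
	free_irq(EXAMPLE_IRQ, NULL);	/* ... then the IRQ */
}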
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
new file mode 100644
index 0000000..667c841
--- /dev/null
+++ b/kernel/exec_domain.c
@@ -0,0 +1,228 @@
+/*
+ * Handling of different ABIs (personalities).
+ *
+ * We group personalities into execution domains which have their
+ * own handlers for kernel entry points, signal mapping, etc...
+ *
+ * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/sysctl.h>
+#include <linux/types.h>
+
+
+static void default_handler(int, struct pt_regs *);
+
+static struct exec_domain *exec_domains = &default_exec_domain;
+static DEFINE_RWLOCK(exec_domains_lock);
+
+
+static u_long ident_map[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+};
+
+struct exec_domain default_exec_domain = {
+ .name = "Linux", /* name */
+ .handler = default_handler, /* lcall7 causes a seg fault. */
+ .pers_low = 0, /* PER_LINUX personality. */
+ .pers_high = 0, /* PER_LINUX personality. */
+ .signal_map = ident_map, /* Identity map signals. */
+ .signal_invmap = ident_map, /* - both ways. */
+};
+
+
+static void
+default_handler(int segment, struct pt_regs *regp)
+{
+ set_personality(0);
+
+ if (current_thread_info()->exec_domain->handler != default_handler)
+ current_thread_info()->exec_domain->handler(segment, regp);
+ else
+ send_sig(SIGSEGV, current, 1);
+}
+
+static struct exec_domain *
+lookup_exec_domain(u_long personality)
+{
+ struct exec_domain * ep;
+ u_long pers = personality(personality);
+
+ read_lock(&exec_domains_lock);
+ for (ep = exec_domains; ep; ep = ep->next) {
+ if (pers >= ep->pers_low && pers <= ep->pers_high)
+ if (try_module_get(ep->module))
+ goto out;
+ }
+
+#ifdef CONFIG_MODULES
+ read_unlock(&exec_domains_lock);
+ request_module("personality-%ld", pers);
+ read_lock(&exec_domains_lock);
+
+ for (ep = exec_domains; ep; ep = ep->next) {
+ if (pers >= ep->pers_low && pers <= ep->pers_high)
+ if (try_module_get(ep->module))
+ goto out;
+ }
+#endif
+
+ ep = &default_exec_domain;
+out:
+ read_unlock(&exec_domains_lock);
+ return (ep);
+}
+
+int
+register_exec_domain(struct exec_domain *ep)
+{
+ struct exec_domain *tmp;
+ int err = -EBUSY;
+
+ if (ep == NULL)
+ return -EINVAL;
+
+ if (ep->next != NULL)
+ return -EBUSY;
+
+ write_lock(&exec_domains_lock);
+ for (tmp = exec_domains; tmp; tmp = tmp->next) {
+ if (tmp == ep)
+ goto out;
+ }
+
+ ep->next = exec_domains;
+ exec_domains = ep;
+ err = 0;
+
+out:
+ write_unlock(&exec_domains_lock);
+ return (err);
+}
+
+int
+unregister_exec_domain(struct exec_domain *ep)
+{
+ struct exec_domain **epp;
+
+ epp = &exec_domains;
+ write_lock(&exec_domains_lock);
+ for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
+ if (ep == *epp)
+ goto unregister;
+ }
+ write_unlock(&exec_domains_lock);
+ return -EINVAL;
+
+unregister:
+ *epp = ep->next;
+ ep->next = NULL;
+ write_unlock(&exec_domains_lock);
+ return 0;
+}
+
+int
+__set_personality(u_long personality)
+{
+ struct exec_domain *ep, *oep;
+
+ ep = lookup_exec_domain(personality);
+ if (ep == current_thread_info()->exec_domain) {
+ current->personality = personality;
+ module_put(ep->module);
+ return 0;
+ }
+
+ if (atomic_read(&current->fs->count) != 1) {
+ struct fs_struct *fsp, *ofsp;
+
+ fsp = copy_fs_struct(current->fs);
+ if (fsp == NULL) {
+ module_put(ep->module);
+ return -ENOMEM;
+ }
+
+ task_lock(current);
+ ofsp = current->fs;
+ current->fs = fsp;
+ task_unlock(current);
+
+ put_fs_struct(ofsp);
+ }
+
+ /*
+ * At that point we are guaranteed to be the sole owner of
+ * current->fs.
+ */
+
+ current->personality = personality;
+ oep = current_thread_info()->exec_domain;
+ current_thread_info()->exec_domain = ep;
+
+ module_put(oep->module);
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static int execdomains_proc_show(struct seq_file *m, void *v)
+{
+ struct exec_domain *ep;
+
+ read_lock(&exec_domains_lock);
+ for (ep = exec_domains; ep; ep = ep->next)
+ seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
+ ep->pers_low, ep->pers_high, ep->name,
+ module_name(ep->module));
+ read_unlock(&exec_domains_lock);
+ return 0;
+}
+
+static int execdomains_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, execdomains_proc_show, NULL);
+}
+
+static const struct file_operations execdomains_proc_fops = {
+ .open = execdomains_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_execdomains_init(void)
+{
+ proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+ return 0;
+}
+module_init(proc_execdomains_init);
+#endif
+
+SYSCALL_DEFINE1(personality, u_long, personality)
+{
+ u_long old = current->personality;
+
+ if (personality != 0xffffffff) {
+ set_personality(personality);
+ if (current->personality != personality)
+ return -EINVAL;
+ }
+
+ return (long)old;
+}
+
+
+EXPORT_SYMBOL(register_exec_domain);
+EXPORT_SYMBOL(unregister_exec_domain);
+EXPORT_SYMBOL(__set_personality);
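From userspace, the 0xffffffff special case in sys_personality() above is what makes a read-only query possible. A small sketch, assuming the glibc wrapper from <sys/personality.h>:

#include <stdio.h>
#include <sys/personality.h>

int main(void)
{
	unsigned long old = personality(0xffffffff);	/* query without changing */

	printf("current personality: 0x%lx\n", old);
	personality(PER_LINUX32);			/* switch ABI personality */
	personality(old);				/* restore the original */
	return 0;
}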
diff --git a/kernel/exit.c b/kernel/exit.c
new file mode 100644
index 0000000..10e393b
--- /dev/null
+++ b/kernel/exit.c
@@ -0,0 +1,1841 @@
+/*
+ * linux/kernel/exit.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/mnt_namespace.h>
+#include <linux/iocontext.h>
+#include <linux/key.h>
+#include <linux/security.h>
+#include <linux/cpu.h>
+#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/binfmts.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/kthread.h>
+#include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
+#include <linux/freezer.h>
+#include <linux/cgroup.h>
+#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <linux/posix-timers.h>
+#include <linux/cn_proc.h>
+#include <linux/mutex.h>
+#include <linux/futex.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/audit.h> /* for audit_free() */
+#include <linux/resource.h>
+#include <linux/blkdev.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
+#include <trace/sched.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+static void exit_mm(struct task_struct * tsk);
+
+static inline int task_detached(struct task_struct *p)
+{
+ return p->exit_signal == -1;
+}
+
+static void __unhash_process(struct task_struct *p)
+{
+ nr_threads--;
+ detach_pid(p, PIDTYPE_PID);
+ if (thread_group_leader(p)) {
+ detach_pid(p, PIDTYPE_PGID);
+ detach_pid(p, PIDTYPE_SID);
+
+ list_del_rcu(&p->tasks);
+ __get_cpu_var(process_counts)--;
+ }
+ list_del_rcu(&p->thread_group);
+ list_del_init(&p->sibling);
+}
+
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+static void __exit_signal(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct sighand_struct *sighand;
+
+ BUG_ON(!sig);
+ BUG_ON(!atomic_read(&sig->count));
+
+ sighand = rcu_dereference(tsk->sighand);
+ spin_lock(&sighand->siglock);
+
+ posix_cpu_timers_exit(tsk);
+ if (atomic_dec_and_test(&sig->count))
+ posix_cpu_timers_exit_group(tsk);
+ else {
+ /*
+ * If there is any task waiting for the group exit
+ * then notify it:
+ */
+ if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
+ wake_up_process(sig->group_exit_task);
+
+ if (tsk == sig->curr_target)
+ sig->curr_target = next_thread(tsk);
+ /*
+ * Accumulate here the counters for all threads but the
+ * group leader as they die, so they can be added into
+ * the process-wide totals when those are taken.
+ * The group leader stays around as a zombie as long
+ * as there are other threads. When it gets reaped,
+ * the exit.c code will add its counts into these totals.
+ * We won't ever get here for the group leader, since it
+ * will have been the last reference on the signal_struct.
+ */
+ sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+ sig->min_flt += tsk->min_flt;
+ sig->maj_flt += tsk->maj_flt;
+ sig->nvcsw += tsk->nvcsw;
+ sig->nivcsw += tsk->nivcsw;
+ sig->inblock += task_io_get_inblock(tsk);
+ sig->oublock += task_io_get_oublock(tsk);
+ task_io_accounting_add(&sig->ioac, &tsk->ioac);
+ sig = NULL; /* Marker for below. */
+ }
+
+ __unhash_process(tsk);
+
+ /*
+ * Do this under ->siglock, we can race with another thread
+ * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
+ */
+ flush_sigqueue(&tsk->pending);
+
+ tsk->signal = NULL;
+ tsk->sighand = NULL;
+ spin_unlock(&sighand->siglock);
+
+ __cleanup_sighand(sighand);
+ clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+ if (sig) {
+ flush_sigqueue(&sig->shared_pending);
+ taskstats_tgid_free(sig);
+ /*
+ * Make sure ->signal can't go away under rq->lock,
+ * see account_group_exec_runtime().
+ */
+ task_rq_unlock_wait(tsk);
+ __cleanup_signal(sig);
+ }
+}
+
+static void delayed_put_task_struct(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ trace_sched_process_free(tsk);
+ put_task_struct(tsk);
+}
+
+
+void release_task(struct task_struct * p)
+{
+ struct task_struct *leader;
+ int zap_leader;
+repeat:
+ tracehook_prepare_release_task(p);
+ atomic_dec(&p->user->processes);
+ proc_flush_task(p);
+ write_lock_irq(&tasklist_lock);
+ tracehook_finish_release_task(p);
+ __exit_signal(p);
+
+ /*
+ * If we are the last non-leader member of the thread
+ * group, and the leader is zombie, then notify the
+ * group leader's parent process. (if it wants notification.)
+ */
+ zap_leader = 0;
+ leader = p->group_leader;
+ if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+ BUG_ON(task_detached(leader));
+ do_notify_parent(leader, leader->exit_signal);
+ /*
+ * If we were the last child thread and the leader has
+ * exited already, and the leader's parent ignores SIGCHLD,
+ * then we are the one who should release the leader.
+ *
+ * do_notify_parent() will have marked it self-reaping in
+ * that case.
+ */
+ zap_leader = task_detached(leader);
+
+ /*
+ * This maintains the invariant that release_task()
+ * only runs on a task in EXIT_DEAD, just for sanity.
+ */
+ if (zap_leader)
+ leader->exit_state = EXIT_DEAD;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ release_thread(p);
+ call_rcu(&p->rcu, delayed_put_task_struct);
+
+ p = leader;
+ if (unlikely(zap_leader))
+ goto repeat;
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ *
+ * The caller must hold rcu lock or the tasklist lock.
+ */
+struct pid *session_of_pgrp(struct pid *pgrp)
+{
+ struct task_struct *p;
+ struct pid *sid = NULL;
+
+ p = pid_task(pgrp, PIDTYPE_PGID);
+ if (p == NULL)
+ p = pid_task(pgrp, PIDTYPE_PID);
+ if (p != NULL)
+ sid = task_session(p);
+
+ return sid;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52. Orphaned process groups are not to be affected
+ * by terminal-generated stop signals. Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
+{
+ struct task_struct *p;
+
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ if ((p == ignored_task) ||
+ (p->exit_state && thread_group_empty(p)) ||
+ is_global_init(p->real_parent))
+ continue;
+
+ if (task_pgrp(p->real_parent) != pgrp &&
+ task_session(p->real_parent) == task_session(p))
+ return 0;
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+
+ return 1;
+}
+
+int is_current_pgrp_orphaned(void)
+{
+ int retval;
+
+ read_lock(&tasklist_lock);
+ retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+static int has_stopped_jobs(struct pid *pgrp)
+{
+ int retval = 0;
+ struct task_struct *p;
+
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ if (!task_is_stopped(p))
+ continue;
+ retval = 1;
+ break;
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ return retval;
+}
+
+/*
+ * Check to see if any process groups have become orphaned as
+ * a result of our exiting, and if they have any stopped jobs,
+ * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void
+kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
+{
+ struct pid *pgrp = task_pgrp(tsk);
+ struct task_struct *ignored_task = tsk;
+
+ if (!parent)
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
+ parent = tsk->real_parent;
+ else
+ /* reparent: our child is in a different pgrp than
+ * we are, and it was the only connection outside.
+ */
+ ignored_task = NULL;
+
+ if (task_pgrp(parent) != pgrp &&
+ task_session(parent) == task_session(tsk) &&
+ will_become_orphaned_pgrp(pgrp, ignored_task) &&
+ has_stopped_jobs(pgrp)) {
+ __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+ __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+ }
+}
+
+/**
+ * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to kthreadd so it
+ * isn't in the way of other processes and is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
+ */
+static void reparent_to_kthreadd(void)
+{
+ write_lock_irq(&tasklist_lock);
+
+ ptrace_unlink(current);
+ /* Reparent to init */
+ current->real_parent = current->parent = kthreadd_task;
+ list_move_tail(&current->sibling, &current->real_parent->children);
+
+ /* Set the exit signal to SIGCHLD so we signal init on exit */
+ current->exit_signal = SIGCHLD;
+
+ if (task_nice(current) < 0)
+ set_user_nice(current, 0);
+ /* cpus_allowed? */
+ /* rt_priority? */
+ /* signals? */
+ security_task_reparent_to_init(current);
+ memcpy(current->signal->rlim, init_task.signal->rlim,
+ sizeof(current->signal->rlim));
+ atomic_inc(&(INIT_USER->__count));
+ write_unlock_irq(&tasklist_lock);
+ switch_uid(INIT_USER);
+}
+
+void __set_special_pids(struct pid *pid)
+{
+ struct task_struct *curr = current->group_leader;
+ pid_t nr = pid_nr(pid);
+
+ if (task_session(curr) != pid) {
+ change_pid(curr, PIDTYPE_SID, pid);
+ set_task_session(curr, nr);
+ }
+ if (task_pgrp(curr) != pid) {
+ change_pid(curr, PIDTYPE_PGID, pid);
+ set_task_pgrp(curr, nr);
+ }
+}
+
+static void set_special_pids(struct pid *pid)
+{
+ write_lock_irq(&tasklist_lock);
+ __set_special_pids(pid);
+ write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+ if (!valid_signal(sig) || sig < 1)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigdelset(&current->blocked, sig);
+ if (!current->mm) {
+ /* Kernel threads handle their own signals.
+ Let the signal code know it'll be handled, so
+ that they don't get converted to SIGKILL or
+ just silently dropped */
+ current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
+ }
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+ if (!valid_signal(sig) || sig < 1)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ * Put all the gunge required to become a kernel thread without
+ * attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+ va_list args;
+ struct fs_struct *fs;
+ sigset_t blocked;
+
+ va_start(args, name);
+ vsnprintf(current->comm, sizeof(current->comm), name, args);
+ va_end(args);
+
+ /*
+ * If we were started as result of loading a module, close all of the
+ * user space pages. We don't need them, and if we didn't close them
+ * they would be locked into memory.
+ */
+ exit_mm(current);
+ /*
+ * We don't want to have TIF_FREEZE set if the system-wide hibernation
+ * or suspend transition begins right now.
+ */
+ current->flags |= (PF_NOFREEZE | PF_KTHREAD);
+
+ if (current->nsproxy != &init_nsproxy) {
+ get_nsproxy(&init_nsproxy);
+ switch_task_namespaces(current, &init_nsproxy);
+ }
+ set_special_pids(&init_struct_pid);
+ proc_clear_tty(current);
+
+ /* Block and flush all signals */
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ /* Become as one with the init task */
+
+ exit_fs(current); /* current->fs->count--; */
+ fs = init_task.fs;
+ current->fs = fs;
+ atomic_inc(&fs->count);
+
+ exit_files(current);
+ current->files = init_task.files;
+ atomic_inc(&current->files->count);
+
+ reparent_to_kthreadd();
+}
+
+EXPORT_SYMBOL(daemonize);
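/*
 * Illustrative sketch of the classic pattern daemonize() and allow_signal()
 * serve: a 2.6-era kernel thread sheds its user-space resources, then
 * re-enables the one signal it wants to react to.  The loop body is a
 * stand-in, not taken from any real driver.
 */
static int example_kernel_thread(void *unused)
{
	daemonize("example-kthread");
	allow_signal(SIGKILL);			/* daemonize() blocked all signals */

	while (!signal_pending(current))
		schedule_timeout_interruptible(HZ);

	return 0;				/* SIGKILL arrived: exit cleanly */
}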
+
+static void close_files(struct files_struct * files)
+{
+ int i, j;
+ struct fdtable *fdt;
+
+ j = 0;
+
+ /*
+ * It is safe to dereference the fd table without RCU or
+ * ->file_lock because this is the last reference to the
+ * files structure.
+ */
+ fdt = files_fdtable(files);
+ for (;;) {
+ unsigned long set;
+ i = j * __NFDBITS;
+ if (i >= fdt->max_fds)
+ break;
+ set = fdt->open_fds->fds_bits[j++];
+ while (set) {
+ if (set & 1) {
+ struct file * file = xchg(&fdt->fd[i], NULL);
+ if (file) {
+ filp_close(file, files);
+ cond_resched();
+ }
+ }
+ i++;
+ set >>= 1;
+ }
+ }
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+ struct files_struct *files;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ atomic_inc(&files->count);
+ task_unlock(task);
+
+ return files;
+}
+
+void put_files_struct(struct files_struct *files)
+{
+ struct fdtable *fdt;
+
+ if (atomic_dec_and_test(&files->count)) {
+ close_files(files);
+ /*
+ * Free the fd and fdset arrays if we expanded them.
+ * If the fdtable was embedded, pass files for freeing
+ * at the end of the RCU grace period. Otherwise,
+ * you can free files immediately.
+ */
+ fdt = files_fdtable(files);
+ if (fdt != &files->fdtab)
+ kmem_cache_free(files_cachep, files);
+ free_fdtable(fdt);
+ }
+}
+
+void reset_files_struct(struct files_struct *files)
+{
+ struct task_struct *tsk = current;
+ struct files_struct *old;
+
+ old = tsk->files;
+ task_lock(tsk);
+ tsk->files = files;
+ task_unlock(tsk);
+ put_files_struct(old);
+}
+
+void exit_files(struct task_struct *tsk)
+{
+ struct files_struct * files = tsk->files;
+
+ if (files) {
+ task_lock(tsk);
+ tsk->files = NULL;
+ task_unlock(tsk);
+ put_files_struct(files);
+ }
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+ /* No need to hold fs->lock if we are killing it */
+ if (atomic_dec_and_test(&fs->count)) {
+ path_put(&fs->root);
+ path_put(&fs->pwd);
+ kmem_cache_free(fs_cachep, fs);
+ }
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+ struct fs_struct * fs = tsk->fs;
+
+ if (fs) {
+ task_lock(tsk);
+ tsk->fs = NULL;
+ task_unlock(tsk);
+ put_fs_struct(fs);
+ }
+}
+
+EXPORT_SYMBOL_GPL(exit_fs);
+
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm; let's find a new owner for it
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+ /*
+ * If there are other users of the mm and the owner (us) is exiting
+ * we need to find a new owner to take on the responsibility.
+ */
+ if (atomic_read(&mm->mm_users) <= 1)
+ return 0;
+ if (mm->owner != p)
+ return 0;
+ return 1;
+}
+
+void mm_update_next_owner(struct mm_struct *mm)
+{
+ struct task_struct *c, *g, *p = current;
+
+retry:
+ if (!mm_need_new_owner(mm, p))
+ return;
+
+ read_lock(&tasklist_lock);
+ /*
+ * Search in the children
+ */
+ list_for_each_entry(c, &p->children, sibling) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ }
+
+ /*
+ * Search in the siblings
+ */
+ list_for_each_entry(c, &p->parent->children, sibling) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ }
+
+ /*
+ * Search through everything else. We should not get
+ * here often
+ */
+ do_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ } while_each_thread(g, c);
+
+ read_unlock(&tasklist_lock);
+ /*
+ * We found no owner yet mm_users > 1: this implies that we are
+ * most likely racing with swapoff (try_to_unuse()) or /proc or
+ * ptrace or page migration (get_task_mm()). Mark owner as NULL,
+ * so that subsystems can understand the callback and take action.
+ */
+ down_write(&mm->mmap_sem);
+ cgroup_mm_owner_callbacks(mm->owner, NULL);
+ mm->owner = NULL;
+ up_write(&mm->mmap_sem);
+ return;
+
+assign_new_owner:
+ BUG_ON(c == p);
+ get_task_struct(c);
+ read_unlock(&tasklist_lock);
+ down_write(&mm->mmap_sem);
+ /*
+ * The task_lock protects c->mm from changing.
+ * We always want mm->owner->mm == mm
+ */
+ task_lock(c);
+ if (c->mm != mm) {
+ task_unlock(c);
+ up_write(&mm->mmap_sem);
+ put_task_struct(c);
+ goto retry;
+ }
+ cgroup_mm_owner_callbacks(mm->owner, c);
+ mm->owner = c;
+ task_unlock(c);
+ up_write(&mm->mmap_sem);
+ put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+static void exit_mm(struct task_struct * tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct core_state *core_state;
+
+ mm_release(tsk, mm);
+ if (!mm)
+ return;
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold mmap_sem around checking core_state
+ * and clearing tsk->mm. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group with ->mm != NULL.
+ */
+ down_read(&mm->mmap_sem);
+ core_state = mm->core_state;
+ if (core_state) {
+ struct core_thread self;
+ up_read(&mm->mmap_sem);
+
+ self.task = tsk;
+ self.next = xchg(&core_state->dumper.next, &self);
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
+ }
+ __set_task_state(tsk, TASK_RUNNING);
+ down_read(&mm->mmap_sem);
+ }
+ atomic_inc(&mm->mm_count);
+ BUG_ON(mm != tsk->active_mm);
+ /* more a memory barrier than a real lock */
+ task_lock(tsk);
+ tsk->mm = NULL;
+ up_read(&mm->mmap_sem);
+ enter_lazy_tlb(mm, current);
+ /* We don't want this task to be frozen prematurely */
+ clear_freeze_flag(tsk);
+ task_unlock(tsk);
+ mm_update_next_owner(mm);
+ mmput(mm);
+}
+
+/*
+ * Return nonzero if @parent's children should reap themselves.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static int ignoring_children(struct task_struct *parent)
+{
+ int ret;
+ struct sighand_struct *psig = parent->sighand;
+ unsigned long flags;
+ spin_lock_irqsave(&psig->siglock, flags);
+ ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+ (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
+ spin_unlock_irqrestore(&psig->siglock, flags);
+ return ret;
+}
+
+/*
+ * Detach all tasks we were using ptrace on.
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
+{
+ struct task_struct *p, *n;
+ int ign = -1;
+
+ list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
+ __ptrace_unlink(p);
+
+ if (p->exit_state != EXIT_ZOMBIE)
+ continue;
+
+ /*
+ * If it's a zombie, our attachedness prevented normal
+ * parent notification or self-reaping. Do notification
+ * now if it would have happened earlier. If it should
+ * reap itself, add it to the @dead list. We can't call
+ * release_task() here because we already hold tasklist_lock.
+ *
+ * If it's our own child, there is no notification to do.
+ * But if our normal children self-reap, then this child
+ * was prevented by ptrace and we must reap it now.
+ */
+ if (!task_detached(p) && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, parent))
+ do_notify_parent(p, p->exit_signal);
+ else {
+ if (ign < 0)
+ ign = ignoring_children(parent);
+ if (ign)
+ p->exit_signal = -1;
+ }
+ }
+
+ if (task_detached(p)) {
+ /*
+ * Mark it as in the process of being reaped.
+ */
+ p->exit_state = EXIT_DEAD;
+ list_add(&p->ptrace_entry, dead);
+ }
+ }
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+ struct list_head *dead)
+{
+ struct task_struct *p, *n;
+
+ BUG_ON(!list_empty(&parent->ptraced));
+
+ list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+ if (p->pdeath_signal)
+ /* We already hold the tasklist_lock here. */
+ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+ list_move_tail(&p->sibling, &p->real_parent->children);
+
+ /* If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (same_thread_group(p->real_parent, father))
+ return;
+
+ /* We don't want people slaying init. */
+ if (!task_detached(p))
+ p->exit_signal = SIGCHLD;
+
+ /* If we'd notified the old parent about this child's death,
+ * also notify the new parent.
+ */
+ if (!ptrace_reparented(p) &&
+ p->exit_state == EXIT_ZOMBIE &&
+ !task_detached(p) && thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+
+ kill_orphaned_pgrp(p, father);
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our thread
+ * group, and if no such member exists, give it to
+ * the child reaper process (ie "init") in our pid
+ * space.
+ */
+static struct task_struct *find_new_reaper(struct task_struct *father)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *thread;
+
+ thread = father;
+ while_each_thread(father, thread) {
+ if (thread->flags & PF_EXITING)
+ continue;
+ if (unlikely(pid_ns->child_reaper == father))
+ pid_ns->child_reaper = thread;
+ return thread;
+ }
+
+ if (unlikely(pid_ns->child_reaper == father)) {
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns))
+ panic("Attempted to kill init!");
+
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+ /*
+ * We can not clear ->child_reaper or leave it alone.
+		 * There may be stealth EXIT_DEAD tasks on ->children,
+ * forget_original_parent() must move them somewhere.
+ */
+ pid_ns->child_reaper = init_pid_ns.child_reaper;
+ }
+
+ return pid_ns->child_reaper;
+}
+
+static void forget_original_parent(struct task_struct *father)
+{
+ struct task_struct *p, *n, *reaper;
+ LIST_HEAD(ptrace_dead);
+
+ write_lock_irq(&tasklist_lock);
+ reaper = find_new_reaper(father);
+ /*
+ * First clean up ptrace if we were using it.
+ */
+ ptrace_exit(father, &ptrace_dead);
+
+ list_for_each_entry_safe(p, n, &father->children, sibling) {
+ p->real_parent = reaper;
+ if (p->parent == father) {
+ BUG_ON(p->ptrace);
+ p->parent = p->real_parent;
+ }
+ reparent_thread(p, father);
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ BUG_ON(!list_empty(&father->children));
+
+ ptrace_exit_finish(father, &ptrace_dead);
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(struct task_struct *tsk, int group_dead)
+{
+ int signal;
+ void *cookie;
+
+ /*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+ forget_original_parent(tsk);
+ exit_task_namespaces(tsk);
+
+ write_lock_irq(&tasklist_lock);
+ if (group_dead)
+ kill_orphaned_pgrp(tsk->group_leader, NULL);
+
+ /* Let father know we died
+ *
+ * Thread signals are configurable, but you aren't going to use
+	 * that to send signals to arbitrary processes.
+ * That stops right now.
+ *
+ * If the parent exec id doesn't match the exec id we saved
+ * when we started then we know the parent has changed security
+ * domain.
+ *
+ * If our self_exec id doesn't match our parent_exec_id then
+ * we have changed execution domain as these two values started
+ * the same after a fork.
+ */
+ if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+ (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
+ tsk->self_exec_id != tsk->parent_exec_id) &&
+ !capable(CAP_KILL))
+ tsk->exit_signal = SIGCHLD;
+
+ signal = tracehook_notify_death(tsk, &cookie, group_dead);
+ if (signal >= 0)
+ signal = do_notify_parent(tsk, signal);
+
+ tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+
+ /* mt-exec, de_thread() is waiting for us */
+ if (thread_group_leader(tsk) &&
+ tsk->signal->group_exit_task &&
+ tsk->signal->notify_count < 0)
+ wake_up_process(tsk->signal->group_exit_task);
+
+ write_unlock_irq(&tasklist_lock);
+
+ tracehook_report_death(tsk, signal, cookie, group_dead);
+
+ /* If the process is dead, release it - nobody will wait for it */
+ if (signal == DEATH_REAP)
+ release_task(tsk);
+}
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static void check_stack_usage(void)
+{
+ static DEFINE_SPINLOCK(low_water_lock);
+ static int lowest_to_date = THREAD_SIZE;
+ unsigned long *n = end_of_stack(current);
+ unsigned long free;
+
+ while (*n == 0)
+ n++;
+ free = (unsigned long)n - (unsigned long)end_of_stack(current);
+
+ if (free >= lowest_to_date)
+ return;
+
+ spin_lock(&low_water_lock);
+ if (free < lowest_to_date) {
+ printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
+ "left\n",
+ current->comm, free);
+ lowest_to_date = free;
+ }
+ spin_unlock(&low_water_lock);
+}
+#else
+static inline void check_stack_usage(void) {}
+#endif
+
+NORET_TYPE void do_exit(long code)
+{
+ struct task_struct *tsk = current;
+ int group_dead;
+
+ profile_task_exit(tsk);
+
+ WARN_ON(atomic_read(&tsk->fs_excl));
+
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+
+ tracehook_report_exit(&code);
+
+ /*
+ * We're taking recursive faults here in do_exit. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ printk(KERN_ALERT
+ "Fixing recursive fault but reboot is needed!\n");
+ /*
+ * We can do this unlocked here. The futex code uses
+ * this flag just to verify whether the pi state
+ * cleanup has been done or not. In the worst case it
+ * loops once more. We pretend that the cleanup was
+ * done as there is no way to return. Either the
+ * OWNER_DIED bit is set by now or we push the blocked
+	 * task into the wait-forever nirvana as well.
+ */
+ tsk->flags |= PF_EXITPIDONE;
+ if (tsk->io_context)
+ exit_io_context();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ }
+
+ exit_signals(tsk); /* sets PF_EXITING */
+ /*
+ * tsk->flags are checked in the futex code to protect against
+ * an exiting task cleaning up the robust pi futexes.
+ */
+ smp_mb();
+ spin_unlock_wait(&tsk->pi_lock);
+
+ if (unlikely(in_atomic()))
+ printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+
+ acct_update_integrals(tsk);
+ if (tsk->mm) {
+ update_hiwater_rss(tsk->mm);
+ update_hiwater_vm(tsk->mm);
+ }
+ group_dead = atomic_dec_and_test(&tsk->signal->live);
+ if (group_dead) {
+ hrtimer_cancel(&tsk->signal->real_timer);
+ exit_itimers(tsk->signal);
+ }
+ acct_collect(code, group_dead);
+ if (group_dead)
+ tty_audit_exit();
+ if (unlikely(tsk->audit_context))
+ audit_free(tsk);
+
+ tsk->exit_code = code;
+ taskstats_exit(tsk, group_dead);
+
+ exit_mm(tsk);
+
+ if (group_dead)
+ acct_process();
+ trace_sched_process_exit(tsk);
+
+ exit_sem(tsk);
+ exit_files(tsk);
+ exit_fs(tsk);
+ check_stack_usage();
+ exit_thread();
+ cgroup_exit(tsk, 1);
+ exit_keys(tsk);
+
+ if (group_dead && tsk->signal->leader)
+ disassociate_ctty(1);
+
+ module_put(task_thread_info(tsk)->exec_domain->module);
+ if (tsk->binfmt)
+ module_put(tsk->binfmt->module);
+
+ proc_exit_connector(tsk);
+ exit_notify(tsk, group_dead);
+#ifdef CONFIG_NUMA
+ mpol_put(tsk->mempolicy);
+ tsk->mempolicy = NULL;
+#endif
+#ifdef CONFIG_FUTEX
+ /*
+ * This must happen late, after the PID is not
+ * hashed anymore:
+ */
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+ if (unlikely(current->pi_state_cache))
+ kfree(current->pi_state_cache);
+#endif
+ /*
+ * Make sure we are holding no locks:
+ */
+ debug_check_no_locks_held(tsk);
+ /*
+ * We can do this unlocked here. The futex code uses this flag
+ * just to verify whether the pi state cleanup has been done
+ * or not. In the worst case it loops once more.
+ */
+ tsk->flags |= PF_EXITPIDONE;
+
+ if (tsk->io_context)
+ exit_io_context();
+
+ if (tsk->splice_pipe)
+ __free_pipe_info(tsk->splice_pipe);
+
+ preempt_disable();
+ /* causes final put_task_struct in finish_task_switch(). */
+ tsk->state = TASK_DEAD;
+
+ schedule();
+ BUG();
+ /* Avoid "noreturn function does return". */
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
+}
+
+EXPORT_SYMBOL_GPL(do_exit);
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
+SYSCALL_DEFINE1(exit, int, error_code)
+{
+ do_exit((error_code&0xff)<<8);
+}
+
+/*
+ * Take down every thread in the group. This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
+ struct signal_struct *sig = current->signal;
+
+ BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+ if (signal_group_exit(sig))
+ exit_code = sig->group_exit_code;
+ else if (!thread_group_empty(current)) {
+ struct sighand_struct *const sighand = current->sighand;
+ spin_lock_irq(&sighand->siglock);
+ if (signal_group_exit(sig))
+ /* Another thread got here before we took the lock. */
+ exit_code = sig->group_exit_code;
+ else {
+ sig->group_exit_code = exit_code;
+ sig->flags = SIGNAL_GROUP_EXIT;
+ zap_other_threads(current);
+ }
+ spin_unlock_irq(&sighand->siglock);
+ }
+
+ do_exit(exit_code);
+ /* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+SYSCALL_DEFINE1(exit_group, int, error_code)
+{
+ do_group_exit((error_code & 0xff) << 8);
+ /* NOTREACHED */
+ return 0;
+}
+
+static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+{
+ struct pid *pid = NULL;
+ if (type == PIDTYPE_PID)
+ pid = task->pids[type].pid;
+ else if (type < PIDTYPE_MAX)
+ pid = task->group_leader->pids[type].pid;
+ return pid;
+}
+
+static int eligible_child(enum pid_type type, struct pid *pid, int options,
+ struct task_struct *p)
+{
+ int err;
+
+ if (type < PIDTYPE_MAX) {
+ if (task_pid_type(p, type) != pid)
+ return 0;
+ }
+
+ /* Wait for all children (clone and not) if __WALL is set;
+ * otherwise, wait for clone children *only* if __WCLONE is
+ * set; otherwise, wait for non-clone children *only*. (Note:
+ * A "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD.) */
+ if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+ && !(options & __WALL))
+ return 0;
+
+ err = security_task_wait(p);
+ if (err)
+ return err;
+
+ return 1;
+}
+
+static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
+ int why, int status,
+ struct siginfo __user *infop,
+ struct rusage __user *rusagep)
+{
+ int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+
+ put_task_struct(p);
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ if (!retval)
+ retval = pid;
+ return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(struct task_struct *p, int options,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ unsigned long state;
+ int retval, status, traced;
+ pid_t pid = task_pid_vnr(p);
+
+ if (!likely(options & WEXITED))
+ return 0;
+
+ if (unlikely(options & WNOWAIT)) {
+ uid_t uid = p->uid;
+ int exit_code = p->exit_code;
+ int why, status;
+
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+ if ((exit_code & 0x7f) == 0) {
+ why = CLD_EXITED;
+ status = exit_code >> 8;
+ } else {
+ why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ status = exit_code & 0x7f;
+ }
+ return wait_noreap_copyout(p, pid, uid, why,
+ status, infop, ru);
+ }
+
+ /*
+ * Try to move the task's state to DEAD
+ * only one thread is allowed to do this:
+ */
+ state = xchg(&p->exit_state, EXIT_DEAD);
+ if (state != EXIT_ZOMBIE) {
+ BUG_ON(state != EXIT_DEAD);
+ return 0;
+ }
+
+ traced = ptrace_reparented(p);
+
+ if (likely(!traced)) {
+ struct signal_struct *psig;
+ struct signal_struct *sig;
+ struct task_cputime cputime;
+
+ /*
+ * The resource counters for the group leader are in its
+ * own task_struct. Those for dead threads in the group
+ * are in its signal_struct, as are those for the child
+ * processes it has previously reaped. All these
+ * accumulate in the parent's signal_struct c* fields.
+ *
+ * We don't bother to take a lock here to protect these
+ * p->signal fields, because they are only touched by
+ * __exit_signal, which runs with tasklist_lock
+ * write-locked anyway, and so is excluded here. We do
+ * need to protect the access to p->parent->signal fields,
+ * as other threads in the parent group can be right
+ * here reaping other children at the same time.
+ *
+ * We use thread_group_cputime() to get times for the thread
+ * group, which consolidates times for all threads in the
+ * group including the group leader.
+ */
+ spin_lock_irq(&p->parent->sighand->siglock);
+ psig = p->parent->signal;
+ sig = p->signal;
+ thread_group_cputime(p, &cputime);
+ psig->cutime =
+ cputime_add(psig->cutime,
+ cputime_add(cputime.utime,
+ sig->cutime));
+ psig->cstime =
+ cputime_add(psig->cstime,
+ cputime_add(cputime.stime,
+ sig->cstime));
+ psig->cgtime =
+ cputime_add(psig->cgtime,
+ cputime_add(p->gtime,
+ cputime_add(sig->gtime,
+ sig->cgtime)));
+ psig->cmin_flt +=
+ p->min_flt + sig->min_flt + sig->cmin_flt;
+ psig->cmaj_flt +=
+ p->maj_flt + sig->maj_flt + sig->cmaj_flt;
+ psig->cnvcsw +=
+ p->nvcsw + sig->nvcsw + sig->cnvcsw;
+ psig->cnivcsw +=
+ p->nivcsw + sig->nivcsw + sig->cnivcsw;
+ psig->cinblock +=
+ task_io_get_inblock(p) +
+ sig->inblock + sig->cinblock;
+ psig->coublock +=
+ task_io_get_oublock(p) +
+ sig->oublock + sig->coublock;
+ task_io_accounting_add(&psig->ioac, &p->ioac);
+ task_io_accounting_add(&psig->ioac, &sig->ioac);
+ spin_unlock_irq(&p->parent->sighand->siglock);
+ }
+
+ /*
+ * Now we are sure this task is interesting, and no other
+ * thread can reap it because we set its state to EXIT_DEAD.
+ */
+ read_unlock(&tasklist_lock);
+
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
+ if (!retval && stat_addr)
+ retval = put_user(status, stat_addr);
+ if (!retval && infop)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval && infop)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval && infop) {
+ int why;
+
+ if ((status & 0x7f) == 0) {
+ why = CLD_EXITED;
+ status >>= 8;
+ } else {
+ why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ status &= 0x7f;
+ }
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
+ if (!retval && infop)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval && infop)
+ retval = put_user(p->uid, &infop->si_uid);
+ if (!retval)
+ retval = pid;
+
+ if (traced) {
+ write_lock_irq(&tasklist_lock);
+ /* We dropped tasklist, ptracer could die and untrace */
+ ptrace_unlink(p);
+ /*
+ * If this is not a detached task, notify the parent.
+ * If it's still not detached after that, don't release
+ * it now.
+ */
+ if (!task_detached(p)) {
+ do_notify_parent(p, p->exit_signal);
+ if (!task_detached(p)) {
+ p->exit_state = EXIT_ZOMBIE;
+ p = NULL;
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+ }
+ if (p != NULL)
+ release_task(p);
+
+ return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_stopped(int ptrace, struct task_struct *p,
+ int options, struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int retval, exit_code, why;
+ uid_t uid = 0; /* unneeded, required by compiler */
+ pid_t pid;
+
+ if (!(options & WUNTRACED))
+ return 0;
+
+ exit_code = 0;
+ spin_lock_irq(&p->sighand->siglock);
+
+ if (unlikely(!task_is_stopped_or_traced(p)))
+ goto unlock_sig;
+
+ if (!ptrace && p->signal->group_stop_count > 0)
+ /*
+ * A group stop is in progress and this is the group leader.
+ * We won't report until all threads have stopped.
+ */
+ goto unlock_sig;
+
+ exit_code = p->exit_code;
+ if (!exit_code)
+ goto unlock_sig;
+
+ if (!unlikely(options & WNOWAIT))
+ p->exit_code = 0;
+
+ uid = p->uid;
+unlock_sig:
+ spin_unlock_irq(&p->sighand->siglock);
+ if (!exit_code)
+ return 0;
+
+ /*
+ * Now we are pretty sure this task is interesting.
+ * Make sure it doesn't get reaped out from under us while we
+ * give up the lock and then examine it below. We don't want to
+ * keep holding onto the tasklist_lock while we call getrusage and
+ * possibly take page faults for user memory.
+ */
+ get_task_struct(p);
+ pid = task_pid_vnr(p);
+ why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
+ read_unlock(&tasklist_lock);
+
+ if (unlikely(options & WNOWAIT))
+ return wait_noreap_copyout(p, pid, uid,
+ why, exit_code,
+ infop, ru);
+
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!retval && stat_addr)
+ retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+ if (!retval && infop)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval && infop)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval && infop)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval && infop)
+ retval = put_user(exit_code, &infop->si_status);
+ if (!retval && infop)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval && infop)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = pid;
+ put_task_struct(p);
+
+ BUG_ON(!retval);
+ return retval;
+}
+
+/*
+ * Handle do_wait work for one task in a live, non-stopped state.
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_continued(struct task_struct *p, int options,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int retval;
+ pid_t pid;
+ uid_t uid;
+
+ if (!unlikely(options & WCONTINUED))
+ return 0;
+
+ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
+ return 0;
+
+ spin_lock_irq(&p->sighand->siglock);
+ /* Re-check with the lock held. */
+ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
+ spin_unlock_irq(&p->sighand->siglock);
+ return 0;
+ }
+ if (!unlikely(options & WNOWAIT))
+ p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+ spin_unlock_irq(&p->sighand->siglock);
+
+ pid = task_pid_vnr(p);
+ uid = p->uid;
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ if (!infop) {
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ put_task_struct(p);
+ if (!retval && stat_addr)
+ retval = put_user(0xffff, stat_addr);
+ if (!retval)
+ retval = pid;
+ } else {
+ retval = wait_noreap_copyout(p, pid, uid,
+ CLD_CONTINUED, SIGCONT,
+ infop, ru);
+ BUG_ON(retval == 0);
+ }
+
+ return retval;
+}
+
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then *@notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent, int ptrace,
+ struct task_struct *p, int *notask_error,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int ret = eligible_child(type, pid, options, p);
+ if (!ret)
+ return ret;
+
+ if (unlikely(ret < 0)) {
+ /*
+ * If we have not yet seen any eligible child,
+ * then let this error code replace -ECHILD.
+ * A permission error will give the user a clue
+ * to look for security policy problems, rather
+ * than for mysterious wait bugs.
+ */
+ if (*notask_error)
+ *notask_error = ret;
+ }
+
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
+ /*
+ * This child is hidden by ptrace.
+ * We aren't allowed to see it now, but eventually we will.
+ */
+ *notask_error = 0;
+ return 0;
+ }
+
+ if (p->exit_state == EXIT_DEAD)
+ return 0;
+
+ /*
+ * We don't reap group leaders with subthreads.
+ */
+ if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+ return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+ /*
+ * It's stopped or running now, so it might
+ * later continue, exit, or stop again.
+ */
+ *notask_error = 0;
+
+ if (task_is_stopped_or_traced(p))
+ return wait_task_stopped(ptrace, p, options,
+ infop, stat_addr, ru);
+
+ return wait_task_continued(p, options, infop, stat_addr, ru);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * *@notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *notask_error,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ struct task_struct *p;
+
+ list_for_each_entry(p, &tsk->children, sibling) {
+ /*
+ * Do not consider detached threads.
+ */
+ if (!task_detached(p)) {
+ int ret = wait_consider_task(tsk, 0, p, notask_error,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ struct task_struct *p;
+
+ /*
+ * Traditionally we see ptrace'd stopped tasks regardless of options.
+ */
+ options |= WUNTRACED;
+
+ list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+ int ret = wait_consider_task(tsk, 1, p, notask_error,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static long do_wait(enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct task_struct *tsk;
+ int retval;
+
+ trace_sched_process_wait(pid);
+
+ add_wait_queue(&current->signal->wait_chldexit,&wait);
+repeat:
+ /*
+	 * If there is nothing that can match our criteria, just get out.
+ * We will clear @retval to zero if we see any child that might later
+ * match our criteria, even if we are not able to reap it yet.
+ */
+ retval = -ECHILD;
+ if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
+ goto end;
+
+ current->state = TASK_INTERRUPTIBLE;
+ read_lock(&tasklist_lock);
+ tsk = current;
+ do {
+ int tsk_result = do_wait_thread(tsk, &retval,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (!tsk_result)
+ tsk_result = ptrace_do_wait(tsk, &retval,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (tsk_result) {
+ /*
+ * tasklist_lock is unlocked and we have a final result.
+ */
+ retval = tsk_result;
+ goto end;
+ }
+
+ if (options & __WNOTHREAD)
+ break;
+ tsk = next_thread(tsk);
+ BUG_ON(tsk->signal != current->signal);
+ } while (tsk != current);
+ read_unlock(&tasklist_lock);
+
+ if (!retval && !(options & WNOHANG)) {
+ retval = -ERESTARTSYS;
+ if (!signal_pending(current)) {
+ schedule();
+ goto repeat;
+ }
+ }
+
+end:
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&current->signal->wait_chldexit,&wait);
+ if (infop) {
+ if (retval > 0)
+ retval = 0;
+ else {
+ /*
+ * For a WNOHANG return, clear out all the fields
+ * we would set so the user can easily tell the
+ * difference.
+ */
+ if (!retval)
+ retval = put_user(0, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user(0, &infop->si_code);
+ if (!retval)
+ retval = put_user(0, &infop->si_pid);
+ if (!retval)
+ retval = put_user(0, &infop->si_uid);
+ if (!retval)
+ retval = put_user(0, &infop->si_status);
+ }
+ }
+ return retval;
+}
+
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ struct pid *pid = NULL;
+ enum pid_type type;
+ long ret;
+
+ if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+ return -EINVAL;
+ if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ return -EINVAL;
+
+ switch (which) {
+ case P_ALL:
+ type = PIDTYPE_MAX;
+ break;
+ case P_PID:
+ type = PIDTYPE_PID;
+ if (upid <= 0)
+ return -EINVAL;
+ break;
+ case P_PGID:
+ type = PIDTYPE_PGID;
+ if (upid <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (type < PIDTYPE_MAX)
+ pid = find_get_pid(upid);
+ ret = do_wait(type, pid, options, infop, NULL, ru);
+ put_pid(pid);
+
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(5, ret, which, upid, infop, options, ru);
+ return ret;
+}
+
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ struct pid *pid = NULL;
+ enum pid_type type;
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
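+ /*
+ * Traditional wait4() pid encoding: -1 waits for any child, < -1 for
+ * the process group -upid, 0 for our own process group, and > 0 for
+ * the one specific pid.
+ */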
+ if (upid == -1)
+ type = PIDTYPE_MAX;
+ else if (upid < 0) {
+ type = PIDTYPE_PGID;
+ pid = find_get_pid(-upid);
+ } else if (upid == 0) {
+ type = PIDTYPE_PGID;
+ pid = get_pid(task_pgrp(current));
+ } else /* upid > 0 */ {
+ type = PIDTYPE_PID;
+ pid = find_get_pid(upid);
+ }
+
+ ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
+ put_pid(pid);
+
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
+ return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
+{
+ return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
new file mode 100644
index 0000000..a26cb2e
--- /dev/null
+++ b/kernel/extable.c
@@ -0,0 +1,68 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <asm/sections.h>
+
+extern struct exception_table_entry __start___ex_table[];
+extern struct exception_table_entry __stop___ex_table[];
+
+/* Sort the kernel's built-in exception table */
+void __init sort_main_extable(void)
+{
+ sort_extable(__start___ex_table, __stop___ex_table);
+}
+
+/* Given an address, look for it in the exception tables. */
+const struct exception_table_entry *search_exception_tables(unsigned long addr)
+{
+ const struct exception_table_entry *e;
+
+ e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+ if (!e)
+ e = search_module_extables(addr);
+ return e;
+}
+
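+/*
+ * Note that init text only counts as core kernel text while the system
+ * is still booting, since the init sections are discarded once boot has
+ * completed.
+ */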
+int core_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext &&
+ addr <= (unsigned long)_etext)
+ return 1;
+
+ if (system_state == SYSTEM_BOOTING &&
+ addr >= (unsigned long)_sinittext &&
+ addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+int __kernel_text_address(unsigned long addr)
+{
+ if (core_kernel_text(addr))
+ return 1;
+ return __module_text_address(addr) != NULL;
+}
+
+int kernel_text_address(unsigned long addr)
+{
+ if (core_kernel_text(addr))
+ return 1;
+ return module_text_address(addr) != NULL;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
new file mode 100644
index 0000000..8f753e5
--- /dev/null
+++ b/kernel/fork.c
@@ -0,0 +1,1718 @@
+/*
+ * linux/kernel/fork.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * 'fork.c' contains the help-routines for the 'fork' system call
+ * (see also entry.S and others).
+ * Fork is rather simple, once you get the hang of it, but the memory
+ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
+ */
+
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/mnt_namespace.h>
+#include <linux/personality.h>
+#include <linux/mempolicy.h>
+#include <linux/sem.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/iocontext.h>
+#include <linux/key.h>
+#include <linux/binfmts.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/fs.h>
+#include <linux/nsproxy.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/cgroup.h>
+#include <linux/security.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/jiffies.h>
+#include <linux/tracehook.h>
+#include <linux/futex.h>
+#include <linux/compat.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/rcupdate.h>
+#include <linux/ptrace.h>
+#include <linux/mount.h>
+#include <linux/audit.h>
+#include <linux/memcontrol.h>
+#include <linux/profile.h>
+#include <linux/rmap.h>
+#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/freezer.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
+#include <linux/random.h>
+#include <linux/tty.h>
+#include <linux/proc_fs.h>
+#include <linux/blkdev.h>
+#include <trace/sched.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+unsigned long total_forks; /* Handle normal Linux uptimes. */
+int nr_threads; /* The idle threads do not count.. */
+
+int max_threads; /* tunable limit on nr_threads */
+
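+/* Per-CPU count of thread group leaders, summed up by nr_processes(). */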
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+
+int nr_processes(void)
+{
+ int cpu;
+ int total = 0;
+
+ for_each_online_cpu(cpu)
+ total += per_cpu(process_counts, cpu);
+
+ return total;
+}
+
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
+# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+static struct kmem_cache *task_struct_cachep;
+#endif
+
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ gfp_t mask = GFP_KERNEL | __GFP_ZERO;
+#else
+ gfp_t mask = GFP_KERNEL;
+#endif
+ return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+}
+
+static inline void free_thread_info(struct thread_info *ti)
+{
+ free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+#endif
+
+/* SLAB cache for signal_struct structures (tsk->signal) */
+static struct kmem_cache *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+struct kmem_cache *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+struct kmem_cache *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+struct kmem_cache *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+struct kmem_cache *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static struct kmem_cache *mm_cachep;
+
+void free_task(struct task_struct *tsk)
+{
+ prop_local_destroy_single(&tsk->dirties);
+ free_thread_info(tsk->stack);
+ rt_mutex_debug_task_free(tsk);
+ free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
+void __put_task_struct(struct task_struct *tsk)
+{
+ WARN_ON(!tsk->exit_state);
+ WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(tsk == current);
+
+ security_task_free(tsk);
+ free_uid(tsk->user);
+ put_group_info(tsk->group_info);
+ delayacct_tsk_free(tsk);
+
+ if (!profile_handoff_task(tsk))
+ free_task(tsk);
+}
+
+/*
+ * macro override instead of weak attribute alias, to workaround
+ * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
+ */
+#ifndef arch_task_cache_init
+#define arch_task_cache_init()
+#endif
+
+void __init fork_init(unsigned long mempages)
+{
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
+#endif
+ /* create a slab on which task_structs can be allocated */
+ task_struct_cachep =
+ kmem_cache_create("task_struct", sizeof(struct task_struct),
+ ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
+#endif
+
+ /* do the arch specific task caches init */
+ arch_task_cache_init();
+
+ /*
+ * The default maximum number of threads is set to a safe
+ * value: with THREAD_SIZE bytes of kernel stack per thread, the
+ * thread stacks can take up at most roughly one eighth of memory.
+ */
+ max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
+
+ /*
+ * we need to allow at least 20 threads to boot a system
+ */
+ if(max_threads < 20)
+ max_threads = 20;
+
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+ init_task.signal->rlim[RLIMIT_SIGPENDING] =
+ init_task.signal->rlim[RLIMIT_NPROC];
+}
+
+int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+ struct task_struct *src)
+{
+ *dst = *src;
+ return 0;
+}
+
+static struct task_struct *dup_task_struct(struct task_struct *orig)
+{
+ struct task_struct *tsk;
+ struct thread_info *ti;
+ int err;
+
+ prepare_to_copy(orig);
+
+ tsk = alloc_task_struct();
+ if (!tsk)
+ return NULL;
+
+ ti = alloc_thread_info(tsk);
+ if (!ti) {
+ free_task_struct(tsk);
+ return NULL;
+ }
+
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
+ goto out;
+
+ tsk->stack = ti;
+
+ err = prop_local_init_single(&tsk->dirties);
+ if (err)
+ goto out;
+
+ setup_thread_stack(tsk, orig);
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ tsk->stack_canary = get_random_int();
+#endif
+
+ /* One for us, one for whoever does the "release_task()" (usually parent) */
+ atomic_set(&tsk->usage,2);
+ atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ tsk->btrace_seq = 0;
+#endif
+ tsk->splice_pipe = NULL;
+ return tsk;
+
+out:
+ free_thread_info(ti);
+ free_task_struct(tsk);
+ return NULL;
+}
+
+#ifdef CONFIG_MMU
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct vm_area_struct *mpnt, *tmp, **pprev;
+ struct rb_node **rb_link, *rb_parent;
+ int retval;
+ unsigned long charge;
+ struct mempolicy *pol;
+
+ down_write(&oldmm->mmap_sem);
+ flush_cache_dup_mm(oldmm);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+ mm->locked_vm = 0;
+ mm->mmap = NULL;
+ mm->mmap_cache = NULL;
+ mm->free_area_cache = oldmm->mmap_base;
+ mm->cached_hole_size = ~0UL;
+ mm->map_count = 0;
+ cpus_clear(mm->cpu_vm_mask);
+ mm->mm_rb = RB_ROOT;
+ rb_link = &mm->mm_rb.rb_node;
+ rb_parent = NULL;
+ pprev = &mm->mmap;
+
+ for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ struct file *file;
+
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ long pages = vma_pages(mpnt);
+ mm->total_vm -= pages;
+ vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+ -pages);
+ continue;
+ }
+ charge = 0;
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+ if (security_vm_enough_memory(len))
+ goto fail_nomem;
+ charge = len;
+ }
+ tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!tmp)
+ goto fail_nomem;
+ *tmp = *mpnt;
+ pol = mpol_dup(vma_policy(mpnt));
+ retval = PTR_ERR(pol);
+ if (IS_ERR(pol))
+ goto fail_nomem_policy;
+ vma_set_policy(tmp, pol);
+ tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_mm = mm;
+ tmp->vm_next = NULL;
+ anon_vma_link(tmp);
+ file = tmp->vm_file;
+ if (file) {
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ if (tmp->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+ spin_lock(&mapping->i_mmap_lock);
+ if (tmp->vm_flags & VM_SHARED)
+ mapping->i_mmap_writable++;
+ tmp->vm_truncate_count = mpnt->vm_truncate_count;
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_prio_tree_add(tmp, mpnt);
+ flush_dcache_mmap_unlock(mapping);
+ spin_unlock(&mapping->i_mmap_lock);
+ }
+
+ /*
+ * Clear hugetlb-related page reserves for children. This only
+ * affects MAP_PRIVATE mappings. Faults generated by the child
+ * are not guaranteed to succeed, even if read-only
+ */
+ if (is_vm_hugetlb_page(tmp))
+ reset_vma_resv_huge_pages(tmp);
+
+ /*
+ * Link in the new vma and copy the page table entries.
+ */
+ *pprev = tmp;
+ pprev = &tmp->vm_next;
+
+ __vma_link_rb(mm, tmp, rb_link, rb_parent);
+ rb_link = &tmp->vm_rb.rb_right;
+ rb_parent = &tmp->vm_rb;
+
+ mm->map_count++;
+ retval = copy_page_range(mm, oldmm, mpnt);
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ if (retval)
+ goto out;
+ }
+ /* a new mm has just been created */
+ arch_dup_mmap(oldmm, mm);
+ retval = 0;
+out:
+ up_write(&mm->mmap_sem);
+ flush_tlb_mm(oldmm);
+ up_write(&oldmm->mmap_sem);
+ return retval;
+fail_nomem_policy:
+ kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct * mm)
+{
+ mm->pgd = pgd_alloc(mm);
+ if (unlikely(!mm->pgd))
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct * mm)
+{
+ pgd_free(mm, mm->pgd);
+}
+#else
+#define dup_mmap(mm, oldmm) (0)
+#define mm_alloc_pgd(mm) (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+#include <linux/init_task.h>
+
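+/*
+ * Set up a freshly allocated (or copied) mm_struct: reference counts,
+ * locks, RSS counters and the page directory.  Frees the mm and returns
+ * NULL if the page directory cannot be allocated.
+ */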
+static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+{
+ atomic_set(&mm->mm_users, 1);
+ atomic_set(&mm->mm_count, 1);
+ init_rwsem(&mm->mmap_sem);
+ INIT_LIST_HEAD(&mm->mmlist);
+ mm->flags = (current->mm) ? current->mm->flags
+ : MMF_DUMP_FILTER_DEFAULT;
+ mm->core_state = NULL;
+ mm->nr_ptes = 0;
+ set_mm_counter(mm, file_rss, 0);
+ set_mm_counter(mm, anon_rss, 0);
+ spin_lock_init(&mm->page_table_lock);
+ rwlock_init(&mm->ioctx_list_lock);
+ mm->ioctx_list = NULL;
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
+ mm_init_owner(mm, p);
+
+ if (likely(!mm_alloc_pgd(mm))) {
+ mm->def_flags = 0;
+ mmu_notifier_mm_init(mm);
+ return mm;
+ }
+
+ free_mm(mm);
+ return NULL;
+}
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct * mm_alloc(void)
+{
+ struct mm_struct * mm;
+
+ mm = allocate_mm();
+ if (mm) {
+ memset(mm, 0, sizeof(*mm));
+ mm = mm_init(mm, current);
+ }
+ return mm;
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void __mmdrop(struct mm_struct *mm)
+{
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
+ mmu_notifier_mm_destroy(mm);
+ free_mm(mm);
+}
+EXPORT_SYMBOL_GPL(__mmdrop);
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+ might_sleep();
+
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ exit_aio(mm);
+ exit_mmap(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ put_swap_token(mm);
+ mmdrop(mm);
+ }
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+/**
+ * get_task_mm - acquire a reference to the task's mm
+ *
+ * Returns %NULL if the task has no mm, or if PF_KTHREAD is set (meaning
+ * this kernel thread has only transiently adopted a user mm with use_mm,
+ * e.g. to do its AIO).  Otherwise returns a reference to the mm after
+ * bumping up its use count.  The caller must release the mm via mmput()
+ * after use.  Typically used by /proc and ptrace.
+ */
+struct mm_struct *get_task_mm(struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (task->flags & PF_KTHREAD)
+ mm = NULL;
+ else
+ atomic_inc(&mm->mm_users);
+ }
+ task_unlock(task);
+ return mm;
+}
+EXPORT_SYMBOL_GPL(get_task_mm);
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * whatever the reason, error or success.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one. Because we mmput the new mm_struct before
+ * restoring the old one...
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct completion *vfork_done = tsk->vfork_done;
+
+ /* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+ if (unlikely(tsk->robust_list))
+ exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list))
+ compat_exit_robust_list(tsk);
+#endif
+#endif
+
+ /* Get rid of any cached register state */
+ deactivate_mm(tsk, mm);
+
+ /* notify parent sleeping on vfork() */
+ if (vfork_done) {
+ tsk->vfork_done = NULL;
+ complete(vfork_done);
+ }
+
+ /*
+ * If we're exiting normally, clear a user-space tid field if
+ * requested. We leave this alone when dying by signal, to leave
+ * the value intact in a core dump, and to save the unnecessary
+ * trouble otherwise. Userland only wants this done for a sys_exit.
+ */
+ if (tsk->clear_child_tid
+ && !(tsk->flags & PF_SIGNALED)
+ && atomic_read(&mm->mm_users) > 1) {
+ u32 __user * tidptr = tsk->clear_child_tid;
+ tsk->clear_child_tid = NULL;
+
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tidptr);
+ sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
+ }
+}
+
+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+ struct mm_struct *mm, *oldmm = current->mm;
+ int err;
+
+ if (!oldmm)
+ return NULL;
+
+ mm = allocate_mm();
+ if (!mm)
+ goto fail_nomem;
+
+ memcpy(mm, oldmm, sizeof(*mm));
+
+ /* Initializing for Swap token stuff */
+ mm->token_priority = 0;
+ mm->last_interval = 0;
+
+ if (!mm_init(mm, tsk))
+ goto fail_nomem;
+
+ if (init_new_context(tsk, mm))
+ goto fail_nocontext;
+
+ dup_mm_exe_file(oldmm, mm);
+
+ err = dup_mmap(mm, oldmm);
+ if (err)
+ goto free_pt;
+
+ mm->hiwater_rss = get_mm_rss(mm);
+ mm->hiwater_vm = mm->total_vm;
+
+ return mm;
+
+free_pt:
+ mmput(mm);
+
+fail_nomem:
+ return NULL;
+
+fail_nocontext:
+ /*
+ * If init_new_context() failed, we cannot use mmput() to free the mm
+ * because it calls destroy_context()
+ */
+ mm_free_pgd(mm);
+ free_mm(mm);
+ return NULL;
+}
+
+static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct mm_struct * mm, *oldmm;
+ int retval;
+
+ tsk->min_flt = tsk->maj_flt = 0;
+ tsk->nvcsw = tsk->nivcsw = 0;
+
+ tsk->mm = NULL;
+ tsk->active_mm = NULL;
+
+ /*
+ * Are we cloning a kernel thread?
+ *
+ * We need to steal an active VM for that.
+ */
+ oldmm = current->mm;
+ if (!oldmm)
+ return 0;
+
+ if (clone_flags & CLONE_VM) {
+ atomic_inc(&oldmm->mm_users);
+ mm = oldmm;
+ goto good_mm;
+ }
+
+ retval = -ENOMEM;
+ mm = dup_mm(tsk);
+ if (!mm)
+ goto fail_nomem;
+
+good_mm:
+ /* Initializing for Swap token stuff */
+ mm->token_priority = 0;
+ mm->last_interval = 0;
+
+ tsk->mm = mm;
+ tsk->active_mm = mm;
+ return 0;
+
+fail_nomem:
+ return retval;
+}
+
+static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
+{
+ struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+ /* We don't need to lock fs - think why ;-) */
+ if (fs) {
+ atomic_set(&fs->count, 1);
+ rwlock_init(&fs->lock);
+ fs->umask = old->umask;
+ read_lock(&old->lock);
+ fs->root = old->root;
+ path_get(&old->root);
+ fs->pwd = old->pwd;
+ path_get(&old->pwd);
+ read_unlock(&old->lock);
+ }
+ return fs;
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+ return __copy_fs_struct(old);
+}
+
+EXPORT_SYMBOL_GPL(copy_fs_struct);
+
+static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+{
+ if (clone_flags & CLONE_FS) {
+ atomic_inc(&current->fs->count);
+ return 0;
+ }
+ tsk->fs = __copy_fs_struct(current->fs);
+ if (!tsk->fs)
+ return -ENOMEM;
+ return 0;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct files_struct *oldf, *newf;
+ int error = 0;
+
+ /*
+ * A background process may not have any files ...
+ */
+ oldf = current->files;
+ if (!oldf)
+ goto out;
+
+ if (clone_flags & CLONE_FILES) {
+ atomic_inc(&oldf->count);
+ goto out;
+ }
+
+ newf = dup_fd(oldf, &error);
+ if (!newf)
+ goto out;
+
+ tsk->files = newf;
+ error = 0;
+out:
+ return error;
+}
+
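+/*
+ * Set up the child's block I/O context: share the parent's context when
+ * CLONE_IO is set; otherwise, if the parent has a valid ioprio, give the
+ * child a fresh context that inherits just that priority.
+ */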
+static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+#ifdef CONFIG_BLOCK
+ struct io_context *ioc = current->io_context;
+
+ if (!ioc)
+ return 0;
+ /*
+ * Share io context with parent, if CLONE_IO is set
+ */
+ if (clone_flags & CLONE_IO) {
+ tsk->io_context = ioc_task_link(ioc);
+ if (unlikely(!tsk->io_context))
+ return -ENOMEM;
+ } else if (ioprio_valid(ioc->ioprio)) {
+ tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
+ if (unlikely(!tsk->io_context))
+ return -ENOMEM;
+
+ tsk->io_context->ioprio = ioc->ioprio;
+ }
+#endif
+ return 0;
+}
+
+static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct sighand_struct *sig;
+
+ if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+ atomic_inc(&current->sighand->count);
+ return 0;
+ }
+ sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+ rcu_assign_pointer(tsk->sighand, sig);
+ if (!sig)
+ return -ENOMEM;
+ atomic_set(&sig->count, 1);
+ memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ return 0;
+}
+
+void __cleanup_sighand(struct sighand_struct *sighand)
+{
+ if (atomic_dec_and_test(&sighand->count))
+ kmem_cache_free(sighand_cachep, sighand);
+}
+
+
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+ /* Thread group counters. */
+ thread_group_cputime_init(sig);
+
+ /* Expiration times and increments. */
+ sig->it_virt_expires = cputime_zero;
+ sig->it_virt_incr = cputime_zero;
+ sig->it_prof_expires = cputime_zero;
+ sig->it_prof_incr = cputime_zero;
+
+ /* Cached expiration times. */
+ sig->cputime_expires.prof_exp = cputime_zero;
+ sig->cputime_expires.virt_exp = cputime_zero;
+ sig->cputime_expires.sched_exp = 0;
+
+ /* The timer lists. */
+ INIT_LIST_HEAD(&sig->cpu_timers[0]);
+ INIT_LIST_HEAD(&sig->cpu_timers[1]);
+ INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
+
+static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct signal_struct *sig;
+ int ret;
+
+ if (clone_flags & CLONE_THREAD) {
+ ret = thread_group_cputime_clone_thread(current);
+ if (likely(!ret)) {
+ atomic_inc(&current->signal->count);
+ atomic_inc(&current->signal->live);
+ }
+ return ret;
+ }
+ sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+ tsk->signal = sig;
+ if (!sig)
+ return -ENOMEM;
+
+ ret = copy_thread_group_keys(tsk);
+ if (ret < 0) {
+ kmem_cache_free(signal_cachep, sig);
+ return ret;
+ }
+
+ atomic_set(&sig->count, 1);
+ atomic_set(&sig->live, 1);
+ init_waitqueue_head(&sig->wait_chldexit);
+ sig->flags = 0;
+ sig->group_exit_code = 0;
+ sig->group_exit_task = NULL;
+ sig->group_stop_count = 0;
+ sig->curr_target = tsk;
+ init_sigpending(&sig->shared_pending);
+ INIT_LIST_HEAD(&sig->posix_timers);
+
+ hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ sig->it_real_incr.tv64 = 0;
+ sig->real_timer.function = it_real_fn;
+
+ sig->leader = 0; /* session leadership doesn't inherit */
+ sig->tty_old_pgrp = NULL;
+ sig->tty = NULL;
+
+ sig->cutime = sig->cstime = cputime_zero;
+ sig->gtime = cputime_zero;
+ sig->cgtime = cputime_zero;
+ sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+ sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+ sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+ task_io_accounting_init(&sig->ioac);
+ taskstats_tgid_init(sig);
+
+ task_lock(current->group_leader);
+ memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+ task_unlock(current->group_leader);
+
+ posix_cpu_timers_init_group(sig);
+
+ acct_init_pacct(&sig->pacct);
+
+ tty_audit_fork(sig);
+
+ return 0;
+}
+
+void __cleanup_signal(struct signal_struct *sig)
+{
+ thread_group_cputime_free(sig);
+ exit_thread_group_keys(sig);
+ tty_kref_put(sig->tty);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static void cleanup_signal(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+
+ atomic_dec(&sig->live);
+
+ if (atomic_dec_and_test(&sig->count))
+ __cleanup_signal(sig);
+}
+
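+/*
+ * The child never inherits PF_SUPERPRIV, starts life with PF_FORKNOEXEC
+ * and PF_STARTING set, and has its freeze flag cleared.
+ */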
+static void copy_flags(unsigned long clone_flags, struct task_struct *p)
+{
+ unsigned long new_flags = p->flags;
+
+ new_flags &= ~PF_SUPERPRIV;
+ new_flags |= PF_FORKNOEXEC;
+ new_flags |= PF_STARTING;
+ p->flags = new_flags;
+ clear_freeze_flag(p);
+}
+
+SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
+{
+ current->clear_child_tid = tidptr;
+
+ return task_pid_vnr(current);
+}
+
+static void rt_mutex_init_task(struct task_struct *p)
+{
+ spin_lock_init(&p->pi_lock);
+#ifdef CONFIG_RT_MUTEXES
+ plist_head_init(&p->pi_waiters, &p->pi_lock);
+ p->pi_blocked_on = NULL;
+#endif
+}
+
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+ mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
+
+/*
+ * Initialize POSIX timer handling for a single task.
+ */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+ tsk->cputime_expires.prof_exp = cputime_zero;
+ tsk->cputime_expires.virt_exp = cputime_zero;
+ tsk->cputime_expires.sched_exp = 0;
+ INIT_LIST_HEAD(&tsk->cpu_timers[0]);
+ INIT_LIST_HEAD(&tsk->cpu_timers[1]);
+ INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+}
+
+/*
+ * This creates a new process as a copy of the old one,
+ * but does not actually start it yet.
+ *
+ * It copies the registers, and all the appropriate
+ * parts of the process environment (as per the clone
+ * flags). The actual kick-off is left to the caller.
+ */
+static struct task_struct *copy_process(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *child_tidptr,
+ struct pid *pid,
+ int trace)
+{
+ int retval;
+ struct task_struct *p;
+ int cgroup_callbacks_done = 0;
+
+ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Thread groups must share signals as well, and detached threads
+ * can only be started up within the thread group.
+ */
+ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Shared signal handlers imply shared VM. By way of the above,
+ * thread groups also imply shared VM. Blocking this case allows
+ * for various simplifications in other code.
+ */
+ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
+ return ERR_PTR(-EINVAL);
+
+ retval = security_task_create(clone_flags);
+ if (retval)
+ goto fork_out;
+
+ retval = -ENOMEM;
+ p = dup_task_struct(current);
+ if (!p)
+ goto fork_out;
+
+ rt_mutex_init_task(p);
+
+#ifdef CONFIG_PROVE_LOCKING
+ DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
+ DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
+#endif
+ retval = -EAGAIN;
+ if (atomic_read(&p->user->processes) >=
+ p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
+ p->user != current->nsproxy->user_ns->root_user)
+ goto bad_fork_free;
+ }
+
+ atomic_inc(&p->user->__count);
+ atomic_inc(&p->user->processes);
+ get_group_info(p->group_info);
+
+ /*
+ * If multiple threads are within copy_process(), then this check
+ * triggers too late. This doesn't hurt, the check is only there
+ * to stop root fork bombs.
+ */
+ if (nr_threads >= max_threads)
+ goto bad_fork_cleanup_count;
+
+ if (!try_module_get(task_thread_info(p)->exec_domain->module))
+ goto bad_fork_cleanup_count;
+
+ if (p->binfmt && !try_module_get(p->binfmt->module))
+ goto bad_fork_cleanup_put_domain;
+
+ p->did_exec = 0;
+ delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
+ copy_flags(clone_flags, p);
+ INIT_LIST_HEAD(&p->children);
+ INIT_LIST_HEAD(&p->sibling);
+#ifdef CONFIG_PREEMPT_RCU
+ p->rcu_read_lock_nesting = 0;
+ p->rcu_flipctr_idx = 0;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ p->vfork_done = NULL;
+ spin_lock_init(&p->alloc_lock);
+
+ clear_tsk_thread_flag(p, TIF_SIGPENDING);
+ init_sigpending(&p->pending);
+
+ p->utime = cputime_zero;
+ p->stime = cputime_zero;
+ p->gtime = cputime_zero;
+ p->utimescaled = cputime_zero;
+ p->stimescaled = cputime_zero;
+ p->prev_utime = cputime_zero;
+ p->prev_stime = cputime_zero;
+
+ p->default_timer_slack_ns = current->timer_slack_ns;
+
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ p->last_switch_count = 0;
+ p->last_switch_timestamp = 0;
+#endif
+
+ task_io_accounting_init(&p->ioac);
+ acct_clear_integrals(p);
+
+ posix_cpu_timers_init(p);
+
+ p->lock_depth = -1; /* -1 = no lock */
+ do_posix_clock_monotonic_gettime(&p->start_time);
+ p->real_start_time = p->start_time;
+ monotonic_to_bootbased(&p->real_start_time);
+#ifdef CONFIG_SECURITY
+ p->security = NULL;
+#endif
+ p->cap_bset = current->cap_bset;
+ p->io_context = NULL;
+ p->audit_context = NULL;
+ cgroup_fork(p);
+#ifdef CONFIG_NUMA
+ p->mempolicy = mpol_dup(p->mempolicy);
+ if (IS_ERR(p->mempolicy)) {
+ retval = PTR_ERR(p->mempolicy);
+ p->mempolicy = NULL;
+ goto bad_fork_cleanup_cgroup;
+ }
+ mpol_fix_fork_child_flag(p);
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ p->hardirqs_enabled = 1;
+#else
+ p->hardirqs_enabled = 0;
+#endif
+ p->hardirq_enable_ip = 0;
+ p->hardirq_enable_event = 0;
+ p->hardirq_disable_ip = _THIS_IP_;
+ p->hardirq_disable_event = 0;
+ p->softirqs_enabled = 1;
+ p->softirq_enable_ip = _THIS_IP_;
+ p->softirq_enable_event = 0;
+ p->softirq_disable_ip = 0;
+ p->softirq_disable_event = 0;
+ p->hardirq_context = 0;
+ p->softirq_context = 0;
+#endif
+#ifdef CONFIG_LOCKDEP
+ p->lockdep_depth = 0; /* no locks held yet */
+ p->curr_chain_key = 0;
+ p->lockdep_recursion = 0;
+#endif
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ p->blocked_on = NULL; /* not blocked yet */
+#endif
+
+ /* Perform scheduler related setup. Assign this task to a CPU. */
+ sched_fork(p, clone_flags);
+
+ if ((retval = security_task_alloc(p)))
+ goto bad_fork_cleanup_policy;
+ if ((retval = audit_alloc(p)))
+ goto bad_fork_cleanup_security;
+ /* copy all the process information */
+ if ((retval = copy_semundo(clone_flags, p)))
+ goto bad_fork_cleanup_audit;
+ if ((retval = copy_files(clone_flags, p)))
+ goto bad_fork_cleanup_semundo;
+ if ((retval = copy_fs(clone_flags, p)))
+ goto bad_fork_cleanup_files;
+ if ((retval = copy_sighand(clone_flags, p)))
+ goto bad_fork_cleanup_fs;
+ if ((retval = copy_signal(clone_flags, p)))
+ goto bad_fork_cleanup_sighand;
+ if ((retval = copy_mm(clone_flags, p)))
+ goto bad_fork_cleanup_signal;
+ if ((retval = copy_keys(clone_flags, p)))
+ goto bad_fork_cleanup_mm;
+ if ((retval = copy_namespaces(clone_flags, p)))
+ goto bad_fork_cleanup_keys;
+ if ((retval = copy_io(clone_flags, p)))
+ goto bad_fork_cleanup_namespaces;
+ retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+ if (retval)
+ goto bad_fork_cleanup_io;
+
+ if (pid != &init_struct_pid) {
+ retval = -ENOMEM;
+ pid = alloc_pid(task_active_pid_ns(p));
+ if (!pid)
+ goto bad_fork_cleanup_io;
+
+ if (clone_flags & CLONE_NEWPID) {
+ retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+ if (retval < 0)
+ goto bad_fork_free_pid;
+ }
+ }
+
+ p->pid = pid_nr(pid);
+ p->tgid = p->pid;
+ if (clone_flags & CLONE_THREAD)
+ p->tgid = current->tgid;
+
+ if (current->nsproxy != p->nsproxy) {
+ retval = ns_cgroup_clone(p, pid);
+ if (retval)
+ goto bad_fork_free_pid;
+ }
+
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ /*
+ * Clear TID on mm_release()?
+ */
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_FUTEX
+ p->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+ p->compat_robust_list = NULL;
+#endif
+ INIT_LIST_HEAD(&p->pi_state_list);
+ p->pi_state_cache = NULL;
+#endif
+ /*
+ * sigaltstack should be cleared when sharing the same VM
+ */
+ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
+ p->sas_ss_sp = p->sas_ss_size = 0;
+
+ /*
+ * Syscall tracing should be turned off in the child regardless
+ * of CLONE_PTRACE.
+ */
+ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
+ clear_all_latency_tracing(p);
+
+ /* ok, now we should be set up.. */
+ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+ p->pdeath_signal = 0;
+ p->exit_state = 0;
+
+ /*
+ * Ok, make it visible to the rest of the system.
+ * We don't wake it up yet.
+ */
+ p->group_leader = p;
+ INIT_LIST_HEAD(&p->thread_group);
+
+ /* Now that the task is set up, run cgroup callbacks if
+ * necessary. We need to run them before the task is visible
+ * on the tasklist. */
+ cgroup_fork_callbacks(p);
+ cgroup_callbacks_done = 1;
+
+ /* Need tasklist lock for parent etc handling! */
+ write_lock_irq(&tasklist_lock);
+
+ /*
+ * The task hasn't been attached yet, so its cpus_allowed mask will
+ * not be changed, nor will its assigned CPU.
+ *
+ * The cpus_allowed mask of the parent may have changed after it was
+ * copied first time - so re-copy it here, then check the child's CPU
+ * to ensure it is on a valid CPU (and if not, just force it back to
+ * parent's CPU). This avoids a lot of nasty races.
+ */
+ p->cpus_allowed = current->cpus_allowed;
+ p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
+ if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
+ !cpu_online(task_cpu(p))))
+ set_task_cpu(p, smp_processor_id());
+
+ /* CLONE_PARENT re-uses the old parent */
+ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
+ p->real_parent = current->real_parent;
+ p->parent_exec_id = current->parent_exec_id;
+ } else {
+ p->real_parent = current;
+ p->parent_exec_id = current->self_exec_id;
+ }
+
+ spin_lock(&current->sighand->siglock);
+
+ /*
+ * Process group and session signals need to be delivered to just the
+ * parent before the fork or both the parent and the child after the
+ * fork. Restart if a signal comes in before we add the new process to
+ * its process group.
+ * A fatal signal pending means that current will exit, so the new
+ * thread can't slip out of an OOM kill (or normal SIGKILL).
+ */
+ recalc_sigpending();
+ if (signal_pending(current)) {
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+ retval = -ERESTARTNOINTR;
+ goto bad_fork_free_pid;
+ }
+
+ if (clone_flags & CLONE_THREAD) {
+ p->group_leader = current->group_leader;
+ list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
+ }
+
+ if (likely(p->pid)) {
+ list_add_tail(&p->sibling, &p->real_parent->children);
+ tracehook_finish_clone(p, clone_flags, trace);
+
+ if (thread_group_leader(p)) {
+ if (clone_flags & CLONE_NEWPID)
+ p->nsproxy->pid_ns->child_reaper = p;
+
+ p->signal->leader_pid = pid;
+ tty_kref_put(p->signal->tty);
+ p->signal->tty = tty_kref_get(current->signal->tty);
+ set_task_pgrp(p, task_pgrp_nr(current));
+ set_task_session(p, task_session_nr(current));
+ attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ attach_pid(p, PIDTYPE_SID, task_session(current));
+ list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ __get_cpu_var(process_counts)++;
+ }
+ attach_pid(p, PIDTYPE_PID, pid);
+ nr_threads++;
+ }
+
+ total_forks++;
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+ proc_fork_connector(p);
+ cgroup_post_fork(p);
+ return p;
+
+bad_fork_free_pid:
+ if (pid != &init_struct_pid)
+ free_pid(pid);
+bad_fork_cleanup_io:
+ put_io_context(p->io_context);
+bad_fork_cleanup_namespaces:
+ exit_task_namespaces(p);
+bad_fork_cleanup_keys:
+ exit_keys(p);
+bad_fork_cleanup_mm:
+ if (p->mm)
+ mmput(p->mm);
+bad_fork_cleanup_signal:
+ cleanup_signal(p);
+bad_fork_cleanup_sighand:
+ __cleanup_sighand(p->sighand);
+bad_fork_cleanup_fs:
+ exit_fs(p); /* blocking */
+bad_fork_cleanup_files:
+ exit_files(p); /* blocking */
+bad_fork_cleanup_semundo:
+ exit_sem(p);
+bad_fork_cleanup_audit:
+ audit_free(p);
+bad_fork_cleanup_security:
+ security_task_free(p);
+bad_fork_cleanup_policy:
+#ifdef CONFIG_NUMA
+ mpol_put(p->mempolicy);
+bad_fork_cleanup_cgroup:
+#endif
+ cgroup_exit(p, cgroup_callbacks_done);
+ delayacct_tsk_free(p);
+ if (p->binfmt)
+ module_put(p->binfmt->module);
+bad_fork_cleanup_put_domain:
+ module_put(task_thread_info(p)->exec_domain->module);
+bad_fork_cleanup_count:
+ put_group_info(p->group_info);
+ atomic_dec(&p->user->processes);
+ free_uid(p->user);
+bad_fork_free:
+ free_task(p);
+fork_out:
+ return ERR_PTR(retval);
+}
+
+noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+{
+ memset(regs, 0, sizeof(struct pt_regs));
+ return regs;
+}
+
+struct task_struct * __cpuinit fork_idle(int cpu)
+{
+ struct task_struct *task;
+ struct pt_regs regs;
+
+ task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
+ &init_struct_pid, 0);
+ if (!IS_ERR(task))
+ init_idle(task, cpu);
+
+ return task;
+}
+
+/*
+ * Ok, this is the main fork-routine.
+ *
+ * It copies the process, and if successful kick-starts
+ * it and waits for it to finish using the VM if required.
+ */
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ struct task_struct *p;
+ int trace = 0;
+ long nr;
+
+ /*
+ * We hope to recycle these flags after 2.6.26
+ */
+ if (unlikely(clone_flags & CLONE_STOPPED)) {
+ static int __read_mostly count = 100;
+
+ if (count > 0 && printk_ratelimit()) {
+ char comm[TASK_COMM_LEN];
+
+ count--;
+ printk(KERN_INFO "fork(): process `%s' used deprecated "
+ "clone flags 0x%lx\n",
+ get_task_comm(comm, current),
+ clone_flags & CLONE_STOPPED);
+ }
+ }
+
+ /*
+ * When called from kernel_thread, don't do user tracing stuff.
+ */
+ if (likely(user_mode(regs)))
+ trace = tracehook_prepare_clone(clone_flags);
+
+ p = copy_process(clone_flags, stack_start, regs, stack_size,
+ child_tidptr, NULL, trace);
+ /*
+ * Do this prior waking up the new thread - the thread pointer
+ * might get invalid after that point, if the thread exits quickly.
+ */
+ if (!IS_ERR(p)) {
+ struct completion vfork;
+
+ trace_sched_process_fork(current, p);
+
+ nr = task_pid_vnr(p);
+
+ if (clone_flags & CLONE_PARENT_SETTID)
+ put_user(nr, parent_tidptr);
+
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+ }
+
+ audit_finish_fork(p);
+ tracehook_report_clone(trace, regs, clone_flags, nr, p);
+
+ /*
+ * We set PF_STARTING at creation in case tracing wants to
+ * use this to distinguish a fully live task from one that
+ * hasn't gotten to tracehook_report_clone() yet. Now we
+ * clear it and set the child going.
+ */
+ p->flags &= ~PF_STARTING;
+
+ if (unlikely(clone_flags & CLONE_STOPPED)) {
+ /*
+ * We'll start up with an immediate SIGSTOP.
+ */
+ sigaddset(&p->pending.signal, SIGSTOP);
+ set_tsk_thread_flag(p, TIF_SIGPENDING);
+ __set_task_state(p, TASK_STOPPED);
+ } else {
+ wake_up_new_task(p, clone_flags);
+ }
+
+ tracehook_report_clone_complete(trace, regs,
+ clone_flags, nr, p);
+
+ if (clone_flags & CLONE_VFORK) {
+ freezer_do_not_count();
+ wait_for_completion(&vfork);
+ freezer_count();
+ tracehook_report_vfork_done(p, nr);
+ }
+ } else {
+ nr = PTR_ERR(p);
+ }
+ return nr;
+}
+
+#ifndef ARCH_MIN_MMSTRUCT_ALIGN
+#define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+static void sighand_ctor(void *data)
+{
+ struct sighand_struct *sighand = data;
+
+ spin_lock_init(&sighand->siglock);
+ init_waitqueue_head(&sighand->signalfd_wqh);
+}
+
+void __init proc_caches_init(void)
+{
+ sighand_cachep = kmem_cache_create("sighand_cache",
+ sizeof(struct sighand_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+ sighand_ctor);
+ signal_cachep = kmem_cache_create("signal_cache",
+ sizeof(struct signal_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ files_cachep = kmem_cache_create("files_cache",
+ sizeof(struct files_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ fs_cachep = kmem_cache_create("fs_cache",
+ sizeof(struct fs_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), 0,
+ SLAB_PANIC, NULL);
+ mm_cachep = kmem_cache_create("mm_struct",
+ sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+}
+
+/*
+ * Check constraints on flags passed to the unshare system call and
+ * force unsharing of additional process context as appropriate.
+ */
+static void check_unshare_flags(unsigned long *flags_ptr)
+{
+ /*
+ * If unsharing a thread from a thread group, must also
+ * unshare vm.
+ */
+ if (*flags_ptr & CLONE_THREAD)
+ *flags_ptr |= CLONE_VM;
+
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (*flags_ptr & CLONE_VM)
+ *flags_ptr |= CLONE_SIGHAND;
+
+ /*
+ * If unsharing signal handlers and the task was created
+ * using CLONE_THREAD, then must unshare the thread
+ */
+ if ((*flags_ptr & CLONE_SIGHAND) &&
+ (atomic_read(&current->signal->count) > 1))
+ *flags_ptr |= CLONE_THREAD;
+
+ /*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (*flags_ptr & CLONE_NEWNS)
+ *flags_ptr |= CLONE_FS;
+}
+
+/*
+ * Unsharing of tasks created with CLONE_THREAD is not supported yet
+ */
+static int unshare_thread(unsigned long unshare_flags)
+{
+ if (unshare_flags & CLONE_THREAD)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Unshare the filesystem structure if it is being shared
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+ struct fs_struct *fs = current->fs;
+
+ if ((unshare_flags & CLONE_FS) &&
+ (fs && atomic_read(&fs->count) > 1)) {
+ *new_fsp = __copy_fs_struct(current->fs);
+ if (!*new_fsp)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Unsharing of sighand is not supported yet
+ */
+static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+{
+ struct sighand_struct *sigh = current->sighand;
+
+ if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
+ return -EINVAL;
+ else
+ return 0;
+}
+
+/*
+ * Unshare vm if it is being shared
+ */
+static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+{
+ struct mm_struct *mm = current->mm;
+
+ if ((unshare_flags & CLONE_VM) &&
+ (mm && atomic_read(&mm->mm_users) > 1)) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Unshare file descriptor table if it is being shared
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+ struct files_struct *fd = current->files;
+ int error = 0;
+
+ if ((unshare_flags & CLONE_FILES) &&
+ (fd && atomic_read(&fd->count) > 1)) {
+ *new_fdp = dup_fd(fd, &error);
+ if (!*new_fdp)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone. copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+{
+ int err = 0;
+ struct fs_struct *fs, *new_fs = NULL;
+ struct sighand_struct *new_sigh = NULL;
+ struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
+ struct files_struct *fd, *new_fd = NULL;
+ struct nsproxy *new_nsproxy = NULL;
+ int do_sysvsem = 0;
+
+ check_unshare_flags(&unshare_flags);
+
+ /* Return -EINVAL for all unsupported flags */
+ err = -EINVAL;
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
+ CLONE_NEWNET))
+ goto bad_unshare_out;
+
+ /*
+ * CLONE_NEWIPC must also detach from the undolist: after switching
+ * to a new ipc namespace, the semaphore arrays from the old
+ * namespace are unreachable.
+ */
+ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
+ do_sysvsem = 1;
+ if ((err = unshare_thread(unshare_flags)))
+ goto bad_unshare_out;
+ if ((err = unshare_fs(unshare_flags, &new_fs)))
+ goto bad_unshare_cleanup_thread;
+ if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+ goto bad_unshare_cleanup_fs;
+ if ((err = unshare_vm(unshare_flags, &new_mm)))
+ goto bad_unshare_cleanup_sigh;
+ if ((err = unshare_fd(unshare_flags, &new_fd)))
+ goto bad_unshare_cleanup_vm;
+ if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_fs)))
+ goto bad_unshare_cleanup_fd;
+
+ if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ if (do_sysvsem) {
+ /*
+ * CLONE_SYSVSEM is equivalent to sys_exit().
+ */
+ exit_sem(current);
+ }
+
+ if (new_nsproxy) {
+ switch_task_namespaces(current, new_nsproxy);
+ new_nsproxy = NULL;
+ }
+
+ task_lock(current);
+
+ if (new_fs) {
+ fs = current->fs;
+ current->fs = new_fs;
+ new_fs = fs;
+ }
+
+ if (new_mm) {
+ mm = current->mm;
+ active_mm = current->active_mm;
+ current->mm = new_mm;
+ current->active_mm = new_mm;
+ activate_mm(active_mm, new_mm);
+ new_mm = mm;
+ }
+
+ if (new_fd) {
+ fd = current->files;
+ current->files = new_fd;
+ new_fd = fd;
+ }
+
+ task_unlock(current);
+ }
+
+ if (new_nsproxy)
+ put_nsproxy(new_nsproxy);
+
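+ /*
+ * Note that the success path falls through into the cleanup labels as
+ * well: by now the new_* pointers hold whatever was displaced (or never
+ * used) and just needs to be dropped.
+ */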
+bad_unshare_cleanup_fd:
+ if (new_fd)
+ put_files_struct(new_fd);
+
+bad_unshare_cleanup_vm:
+ if (new_mm)
+ mmput(new_mm);
+
+bad_unshare_cleanup_sigh:
+ if (new_sigh)
+ if (atomic_dec_and_test(&new_sigh->count))
+ kmem_cache_free(sighand_cachep, new_sigh);
+
+bad_unshare_cleanup_fs:
+ if (new_fs)
+ put_fs_struct(new_fs);
+
+bad_unshare_cleanup_thread:
+bad_unshare_out:
+ return err;
+}
+
+/*
+ * Helper to unshare the files of the current task.
+ * We don't want to expose copy_files internals to
+ * the exec layer of the kernel.
+ */
+
+int unshare_files(struct files_struct **displaced)
+{
+ struct task_struct *task = current;
+ struct files_struct *copy = NULL;
+ int error;
+
+ error = unshare_fd(CLONE_FILES, &copy);
+ if (error || !copy) {
+ *displaced = NULL;
+ return error;
+ }
+ *displaced = task->files;
+ task_lock(task);
+ task->files = copy;
+ task_unlock(task);
+ return 0;
+}
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 0000000..2f4936c
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,154 @@
+/*
+ * kernel/freezer.c - Function to freeze a process
+ *
+ * Originally from kernel/power/process.c
+ */
+
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/freezer.h>
+
+/*
+ * freezing is complete, mark current process as frozen
+ */
+static inline void frozen_process(void)
+{
+ if (!unlikely(current->flags & PF_NOFREEZE)) {
+ current->flags |= PF_FROZEN;
+ wmb();
+ }
+ clear_freeze_flag(current);
+}
+
+/* The refrigerator is the place where frozen processes are stored :-). */
+void refrigerator(void)
+{
+ /* Hmm, should we be allowed to suspend when there are realtime
+ processes around? */
+ long save;
+
+ task_lock(current);
+ if (freezing(current)) {
+ frozen_process();
+ task_unlock(current);
+ } else {
+ task_unlock(current);
+ return;
+ }
+ save = current->state;
+ pr_debug("%s entered refrigerator\n", current->comm);
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* We sent fake signal, clean it up */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!frozen(current))
+ break;
+ schedule();
+ }
+ pr_debug("%s left refrigerator\n", current->comm);
+ __set_current_state(save);
+}
+EXPORT_SYMBOL(refrigerator);
+
+static void fake_signal_wake_up(struct task_struct *p)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ signal_wake_up(p, 0);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+}
+
+/**
+ * freeze_task - send a freeze request to given task
+ * @p: task to send the request to
+ * @sig_only: if set, the request will only be sent if the task has the
+ * PF_FREEZER_NOSIG flag unset
+ * Return value: 'false' if @sig_only is set and the task has
+ * PF_FREEZER_NOSIG set or the task is frozen, 'true' otherwise
+ *
+ * The freeze request is sent by setting the task's TIF_FREEZE flag and
+ * either sending a fake signal to it or waking it up, depending on whether
+ * or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
+ * has PF_FREEZER_NOSIG set (i.e. it is a typical kernel thread), its
+ * TIF_FREEZE flag will not be set.
+ */
+bool freeze_task(struct task_struct *p, bool sig_only)
+{
+ /*
+ * We first check if the task is freezing and next if it has already
+ * been frozen to avoid the race with frozen_process() which first marks
+ * the task as frozen and next clears its TIF_FREEZE.
+ */
+ if (!freezing(p)) {
+ rmb();
+ if (frozen(p))
+ return false;
+
+ if (!sig_only || should_send_signal(p))
+ set_freeze_flag(p);
+ else
+ return false;
+ }
+
+ if (should_send_signal(p)) {
+ if (!signal_pending(p))
+ fake_signal_wake_up(p);
+ } else if (sig_only) {
+ return false;
+ } else {
+ wake_up_state(p, TASK_INTERRUPTIBLE);
+ }
+
+ return true;
+}
+
+void cancel_freezing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (freezing(p)) {
+ pr_debug(" clean up: %s\n", p->comm);
+ clear_freeze_flag(p);
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ recalc_sigpending_and_wake(p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+}
+
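+/*
+ * If @p is frozen, clear PF_FROZEN and return 1 so that the caller wakes
+ * it up; otherwise just cancel any pending freeze request and return 0.
+ */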
+static int __thaw_process(struct task_struct *p)
+{
+ if (frozen(p)) {
+ p->flags &= ~PF_FROZEN;
+ return 1;
+ }
+ clear_freeze_flag(p);
+ return 0;
+}
+
+/*
+ * Wake up a frozen process
+ *
+ * task_lock() is needed to prevent the race with refrigerator() which may
+ * occur if the freezing of tasks fails. Namely, without the lock, if the
+ * freezing of tasks failed, thaw_tasks() might have run before a task in
+ * refrigerator() could call frozen_process(), in which case the task would be
+ * frozen and no one would thaw it.
+ */
+int thaw_process(struct task_struct *p)
+{
+ task_lock(p);
+ if (__thaw_process(p) == 1) {
+ task_unlock(p);
+ wake_up_process(p);
+ return 1;
+ }
+ task_unlock(p);
+ return 0;
+}
+EXPORT_SYMBOL(thaw_process);
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 0000000..c14b159
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,2099 @@
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/futex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+
+#include <asm/futex.h>
+
+#include "rtmutex_common.h"
+
+int __read_mostly futex_cmpxchg_enabled;
+
+#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex pi_mutex;
+
+ struct task_struct *owner;
+ atomic_t refcount;
+
+ union futex_key key;
+};
+
+/*
+ * We use this hashed waitqueue instead of a normal wait_queue_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * wake up q->waiters, then make the second condition true.
+ */
+struct futex_q {
+ struct plist_node list;
+ wait_queue_head_t waiters;
+
+ /* Which hash list lock to use: */
+ spinlock_t *lock_ptr;
+
+ /* Key which the futex is hashed on: */
+ union futex_key key;
+
+ /* Optional priority inheritance state: */
+ struct futex_pi_state *pi_state;
+ struct task_struct *task;
+
+ /* Bitset for the optional bitmasked wakeup */
+ u32 bitset;
+};
+
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+ spinlock_t lock;
+ struct plist_head chain;
+};
+
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+
+/*
+ * Take mm->mmap_sem, when futex is shared
+ */
+static inline void futex_lock_mm(struct rw_semaphore *fshared)
+{
+ if (fshared)
+ down_read(fshared);
+}
+
+/*
+ * Release mm->mmap_sem, when the futex is shared
+ */
+static inline void futex_unlock_mm(struct rw_semaphore *fshared)
+{
+ if (fshared)
+ up_read(fshared);
+}
+
+/*
+ * We hash on the keys returned from get_futex_key (see below).
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+ u32 hash = jhash2((u32*)&key->both.word,
+ (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ key->both.offset);
+ return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+}
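+
+/*
+ * Note: with the default FUTEX_HASHBITS of 8 this gives 256 buckets
+ * (16 with CONFIG_BASE_SMALL). jhash2() mixes the key's word/ptr pair,
+ * seeded with the in-page offset, so unrelated futexes may well collide
+ * and share a bucket lock; correctness only requires that equal keys
+ * always hash to the same bucket.
+ */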
+
+/*
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int match_futex(union futex_key *key1, union futex_key *key2)
+{
+ return (key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ * &current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0.
+ * The key words are stored in *key on success.
+ *
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * offset_within_page). For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
+ *
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to &current->mm->mmap_sem and
+ * caller must have taken the reader lock, but NOT any spinlocks.
+ */
+static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+ union futex_key *key)
+{
+ unsigned long address = (unsigned long)uaddr;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ int err;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = address % PAGE_SIZE;
+ if (unlikely((address % sizeof(u32)) != 0))
+ return -EINVAL;
+ address -= key->both.offset;
+
+ /*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we don't even have to find the underlying vma.
+ * Note: We do have to check that 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma().
+ */
+ if (!fshared) {
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ return -EFAULT;
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+ /*
+ * The futex is hashed differently depending on whether
+ * it's in a shared or private mapping. So check vma first.
+ */
+ vma = find_extend_vma(mm, address);
+ if (unlikely(!vma))
+ return -EFAULT;
+
+ /*
+ * Permissions.
+ */
+ if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+ return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process. Therefore we use
+ * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
+ * mappings of _writable_ handles.
+ */
+ if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+
+ /*
+ * Linear file mappings are also simple.
+ */
+ key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
+ if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+ key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff);
+ return 0;
+ }
+
+ /*
+ * We could walk the page table to read the non-linear
+ * pte, and get the page index without fetching the page
+ * from swap. But that's a lot of code to duplicate here
+ * for a rare case, so we simply fetch the page.
+ */
+ err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+ if (err >= 0) {
+ key->shared.pgoff =
+ page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ put_page(page);
+ return 0;
+ }
+ return err;
+}
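+
+/*
+ * Illustration of the resulting key layouts (a sketch only, with made-up
+ * addresses and 4 KiB pages):
+ *
+ *	PROCESS_PRIVATE futex at uaddr 0x7f0000001004:
+ *		key.private.mm      = current->mm
+ *		key.private.address = 0x7f0000001000
+ *		key.both.offset     = 0x4
+ *
+ *	futex in a MAP_SHARED file mapping:
+ *		key.shared.inode    = inode of the backing file
+ *		key.shared.pgoff    = page index within the file
+ *		key.both.offset     = offset within the page | FUT_OFF_INODE
+ */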
+
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+ if (key->both.ptr == NULL)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ atomic_inc(&key->shared.inode->i_count);
+ break;
+ case FUT_OFF_MMSHARED:
+ atomic_inc(&key->private.mm->mm_count);
+ break;
+ }
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+ if (!key->both.ptr)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ iput(key->shared.inode);
+ break;
+ case FUT_OFF_MMSHARED:
+ mmdrop(key->private.mm);
+ break;
+ }
+}
+
+static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ u32 curval;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ pagefault_enable();
+
+ return curval;
+}
+
+static int get_futex_value_locked(u32 *dest, u32 __user *from)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+/*
+ * Fault handling.
+ * If fshared is non-NULL, current->mm->mmap_sem is already held.
+ */
+static int futex_handle_fault(unsigned long address,
+ struct rw_semaphore *fshared, int attempt)
+{
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+ int ret = -EFAULT;
+
+ if (attempt > 2)
+ return ret;
+
+ if (!fshared)
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (vma && address >= vma->vm_start &&
+ (vma->vm_flags & VM_WRITE)) {
+ int fault;
+ fault = handle_mm_fault(mm, vma, address, 1);
+ if (unlikely((fault & VM_FAULT_ERROR))) {
+#if 0
+ /* XXX: let's do this when we verify it is OK */
+ if (fault & VM_FAULT_OOM)
+ ret = -ENOMEM;
+#endif
+ } else {
+ ret = 0;
+ if (fault & VM_FAULT_MAJOR)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+ }
+ }
+ if (!fshared)
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!atomic_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ }
+
+ if (current->pi_state_cache)
+ kfree(pi_state);
+ else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We don't trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+ p = ERR_PTR(-ESRCH);
+ else
+ get_task_struct(p);
+
+ rcu_read_unlock();
+
+ return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
+ union futex_key key;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ hb = hash_futex(&key);
+ spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+
+ spin_lock_irq(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ spin_unlock(&hb->lock);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+ spin_unlock_irq(&curr->pi_lock);
+
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ spin_unlock(&hb->lock);
+
+ spin_lock_irq(&curr->pi_lock);
+ }
+ spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+{
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_q *this, *next;
+ struct plist_head *head;
+ struct task_struct *p;
+ pid_t pid = uval & FUTEX_TID_MASK;
+
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex(&this->key, key)) {
+ /*
+ * Another waiter already exists - bump up
+ * the refcount and return its pi_state:
+ */
+ pi_state = this->pi_state;
+ /*
+ * Userspace might have messed up non-PI and PI futexes
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(pid && pi_state->owner &&
+ pi_state->owner->pid != pid);
+
+ atomic_inc(&pi_state->refcount);
+ *ps = pi_state;
+
+ return 0;
+ }
+ }
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when TID = 0
+ */
+ if (!pid)
+ return -ESRCH;
+ p = futex_find_get_task(pid);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ /*
+ * We need to look at the task state flags to figure out
+ * whether the task is exiting. To protect against the do_exit
+ * change of the task flags, we do this protected by
+ * p->pi_lock:
+ */
+ spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->flags & PF_EXITING)) {
+ /*
+ * The task is on the way out. When PF_EXITPIDONE is
+ * set, we know that the task has finished the
+ * cleanup:
+ */
+ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+ spin_unlock_irq(&p->pi_lock);
+ put_task_struct(p);
+ return ret;
+ }
+
+ pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make 'p'
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+ spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ *ps = pi_state;
+
+ return 0;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed.
+ */
+static void wake_futex(struct futex_q *q)
+{
+ plist_del(&q->list, &q->list.plist);
+ /*
+ * The lock in wake_up_all() is a crucial memory barrier after the
+ * plist_del() and also before assigning to q->lock_ptr.
+ */
+ wake_up_all(&q->waiters);
+ /*
+ * The waiting task can free the futex_q as soon as this is written,
+ * without taking any locks. This must come last.
+ *
+ * A memory barrier is required here to prevent the following store
+ * to lock_ptr from getting ahead of the wakeup. Clearing the lock
+ * at the end of wake_up_all() does not prevent this store from
+ * moving.
+ */
+ smp_wmb();
+ q->lock_ptr = NULL;
+}
+
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+ struct task_struct *new_owner;
+ struct futex_pi_state *pi_state = this->pi_state;
+ u32 curval, newval;
+
+ if (!pi_state)
+ return -EINVAL;
+
+ spin_lock(&pi_state->pi_mutex.wait_lock);
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+ /*
+ * This happens when we have stolen the lock and the original
+ * pending owner did not enqueue itself back on the rt_mutex.
+ * That's not a tragedy. That way we know that a lock waiter
+ * is on the fly. We make the futex_q waiter the pending owner.
+ */
+ if (!new_owner)
+ new_owner = this->task;
+
+ /*
+ * We pass it to the next owner. (The WAITERS bit is always
+ * kept enabled while there is PI state around. We must also
+ * preserve the owner died bit.)
+ */
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ int ret = 0;
+
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
+ }
+ }
+
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ spin_lock_irq(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ spin_unlock_irq(&new_owner->pi_lock);
+
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+ u32 oldval;
+
+ /*
+ * There is no waiter, so we unlock the futex. The owner died
+ * bit does not have to be preserved here. We are the owner:
+ */
+ oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
+
+ if (oldval == -EFAULT)
+ return oldval;
+ if (oldval != uval)
+ return -EAGAIN;
+
+ return 0;
+}
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 <= hb2) {
+ spin_lock(&hb1->lock);
+ if (hb1 < hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ } else { /* hb1 > hb2 */
+ spin_lock(&hb2->lock);
+ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
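+
+/*
+ * Ordering by bucket address gives every path that locks two buckets
+ * (requeue, wake_op) the same global lock order, so they cannot ABBA
+ * deadlock against each other; if both keys hash to the same bucket the
+ * lock is taken only once, and spin_lock_nested() tells lockdep that the
+ * second acquisition is intentional.
+ */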
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int nr_wake, u32 bitset)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ struct plist_head *head;
+ union futex_key key;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ futex_lock_mm(fshared);
+
+ ret = get_futex_key(uaddr, fshared, &key);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key)) {
+ if (this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&hb->lock);
+out:
+ futex_unlock_mm(fshared);
+ return ret;
+}
+
+/*
+ * Perform the operation encoded in 'op' on uaddr2, wake waiters on
+ * uaddr1 and, depending on the operation's result, on uaddr2 as well:
+ */
+static int
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head;
+ struct futex_q *this, *next;
+ int ret, op_ret, attempt = 0;
+
+retryfull:
+ futex_lock_mm(fshared);
+
+ ret = get_futex_key(uaddr1, fshared, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, fshared, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+retry:
+ double_lock_hb(hb1, hb2);
+
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ u32 dummy;
+
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+#ifndef CONFIG_MMU
+ /*
+ * we don't get EFAULT from MMU faults if we don't have an MMU,
+ * but we might get them from range checking
+ */
+ ret = op_ret;
+ goto out;
+#endif
+
+ if (unlikely(op_ret != -EFAULT)) {
+ ret = op_ret;
+ goto out;
+ }
+
+ /*
+ * futex_atomic_op_inuser needs to both read and write
+ * *(int __user *)uaddr2, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ */
+ if (attempt++) {
+ ret = futex_handle_fault((unsigned long)uaddr2,
+ fshared, attempt);
+ if (ret)
+ goto out;
+ goto retry;
+ }
+
+ /*
+ * If we would have faulted, release mmap_sem,
+ * fault it in and start all over again.
+ */
+ futex_unlock_mm(fshared);
+
+ ret = get_user(dummy, uaddr2);
+ if (ret)
+ return ret;
+
+ goto retryfull;
+ }
+
+ head = &hb1->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key1)) {
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ head = &hb2->chain;
+
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key2)) {
+ wake_futex(this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+out:
+ futex_unlock_mm(fshared);
+
+ return ret;
+}
+
+/*
+ * Requeue all waiters hashed on one physical page to another
+ * physical page.
+ */
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ int ret, drop_count = 0;
+
+ retry:
+ futex_lock_mm(fshared);
+
+ ret = get_futex_key(uaddr1, fshared, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, fshared, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ futex_unlock_mm(fshared);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
+ this->key = key2;
+ get_futex_key_refs(&key2);
+ drop_count++;
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_futex_key_refs(&key1);
+
+out:
+ futex_unlock_mm(fshared);
+ return ret;
+}
+
+/* The key must be already stored in q->key. */
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ init_waitqueue_head(&q->waiters);
+
+ get_futex_key_refs(&q->key);
+ hb = hash_futex(&q->key);
+ q->lock_ptr = &hb->lock;
+
+ spin_lock(&hb->lock);
+ return hb;
+}
+
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
+ spin_unlock(&hb->lock);
+}
+
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+ spin_unlock(&hb->lock);
+ drop_futex_key_refs(&q->key);
+}
+
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once. They are called with the hashed spinlock held.
+ */
+
+/* Return 1 if we were still queued (i.e. 0 means we were woken) */
+static int unqueue_me(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+ int ret = 0;
+
+ /* In the common case we don't take the spinlock, which is nice. */
+ retry:
+ lock_ptr = q->lock_ptr;
+ barrier();
+ if (lock_ptr != NULL) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
+
+ BUG_ON(q->pi_state);
+
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ drop_futex_key_refs(&q->key);
+ return ret;
+}
+
+/*
+ * PI futexes cannot be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q)
+{
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
+
+ BUG_ON(!q->pi_state);
+ free_pi_state(q->pi_state);
+ q->pi_state = NULL;
+
+ spin_unlock(q->lock_ptr);
+
+ drop_futex_key_refs(&q->key);
+}
+
+/*
+ * Fix up the pi_state owner with the new owner.
+ *
+ * Must be called with the hash bucket lock held and mm->mmap_sem held
+ * for non-private futexes.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *newowner,
+ struct rw_semaphore *fshared)
+{
+ u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner = pi_state->owner;
+ u32 uval, curval, newval;
+ int ret, attempt = 0;
+
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ /*
+ * We are here either because we stole the rtmutex from the
+ * pending owner or we are the pending owner which failed to
+ * get the rtmutex. We have to replace the pending owner TID
+ * in the user space variable. This must be atomic as we have
+ * to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the
+ * pi_state because we can fault here. Imagine swapped-out
+ * pages or a fork that was running right before we acquired
+ * mmap_sem and marked all the anonymous memory read-only for
+ * COW.
+ *
+ * Modifying pi_state _before_ the user space value would
+ * leave the pi_state in an inconsistent state when we fault
+ * here, because we need to drop the hash bucket lock to
+ * handle the fault. This might be observed in the PID check
+ * in lookup_pi_state.
+ */
+retry:
+ if (get_futex_value_locked(&uval, uaddr))
+ goto handle_fault;
+
+ while (1) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (curval == -EFAULT)
+ goto handle_fault;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ }
+
+ pi_state->owner = newowner;
+
+ spin_lock_irq(&newowner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &newowner->pi_state_list);
+ spin_unlock_irq(&newowner->pi_lock);
+ return 0;
+
+ /*
+ * To handle the page fault we need to drop the hash bucket
+ * lock here. That gives the other task (either the pending
+ * owner itself or the task which stole the rtmutex) the
+ * chance to try the fixup of the pi_state. So once we are
+ * back from handling the fault we need to check the pi_state
+ * after reacquiring the hash bucket lock and before trying to
+ * do another fixup. When the fixup has been done already we
+ * simply return.
+ */
+handle_fault:
+ spin_unlock(q->lock_ptr);
+
+ ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+
+ spin_lock(q->lock_ptr);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return 0;
+
+ if (ret)
+ return ret;
+
+ goto retry;
+}
+
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode the shared capability in 'flags'
+ */
+#define FLAGS_SHARED 1
+
+static long futex_wait_restart(struct restart_block *restart);
+
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct task_struct *curr = current;
+ DECLARE_WAITQUEUE(wait, curr);
+ struct futex_hash_bucket *hb;
+ struct futex_q q;
+ u32 uval;
+ int ret;
+ struct hrtimer_sleeper t;
+ int rem = 0;
+
+ if (!bitset)
+ return -EINVAL;
+
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ retry:
+ futex_lock_mm(fshared);
+
+ ret = get_futex_key(uaddr, fshared, &q.key);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ hb = queue_lock(&q);
+
+ /*
+ * Access the page AFTER the futex is queued.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we queued after testing *uaddr, that would open
+ * a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * A consequence is that futex_wait() can return zero and absorb
+ * a wakeup when *uaddr != val on entry to the syscall. This is
+ * rare, but normal.
+ *
+ * for shared futexes, we hold the mmap semaphore, so the mapping
+ * cannot have changed since we looked it up in get_futex_key.
+ */
+ ret = get_futex_value_locked(&uval, uaddr);
+
+ if (unlikely(ret)) {
+ queue_unlock(&q, hb);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault it in and
+ * start all over again.
+ */
+ futex_unlock_mm(fshared);
+
+ ret = get_user(uval, uaddr);
+
+ if (!ret)
+ goto retry;
+ return ret;
+ }
+ ret = -EWOULDBLOCK;
+ if (uval != val)
+ goto out_unlock_release_sem;
+
+ /* Only actually queue if *uaddr contained val. */
+ queue_me(&q, hb);
+
+ /*
+ * Now that the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ futex_unlock_mm(fshared);
+
+ /*
+ * There might have been scheduling since the queue_me(), as we
+ * cannot hold a spinlock across the get_user() in case it
+ * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+ * queueing ourselves into the futex hash. This code thus has to
+ * rely on the futex_wake() code removing us from the hash when it
+ * wakes us up.
+ */
+
+ /* add_wait_queue is the barrier after __set_current_state. */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&q.waiters, &wait);
+ /*
+ * !plist_node_empty() is safe here without any lock.
+ * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+ */
+ if (likely(!plist_node_empty(&q.list))) {
+ if (!abs_time)
+ schedule();
+ else {
+ unsigned long slack;
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(&t, current);
+ hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
+
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ /*
+ * the timer could have already expired, in which
+ * case current would be flagged for rescheduling.
+ * Don't bother calling schedule.
+ */
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+
+ /* Flag if a timeout occurred */
+ rem = (t.task == NULL);
+
+ destroy_hrtimer_on_stack(&t.timer);
+ }
+ }
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * NOTE: we don't remove ourselves from the waitqueue because
+ * we are the only user of it.
+ */
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ if (!unqueue_me(&q))
+ return 0;
+ if (rem)
+ return -ETIMEDOUT;
+
+ /*
+ * We expect signal_pending(current), but another thread may
+ * have handled it for us already.
+ */
+ if (!abs_time)
+ return -ERESTARTSYS;
+ else {
+ struct restart_block *restart;
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_wait_restart;
+ restart->futex.uaddr = (u32 *)uaddr;
+ restart->futex.val = val;
+ restart->futex.time = abs_time->tv64;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = 0;
+
+ if (fshared)
+ restart->futex.flags |= FLAGS_SHARED;
+ return -ERESTART_RESTARTBLOCK;
+ }
+
+ out_unlock_release_sem:
+ queue_unlock(&q, hb);
+
+ out_release_sem:
+ futex_unlock_mm(fshared);
+ return ret;
+}
+
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+ struct rw_semaphore *fshared = NULL;
+ ktime_t t;
+
+ t.tv64 = restart->futex.time;
+ restart->fn = do_no_restart_syscall;
+ if (restart->futex.flags & FLAGS_SHARED)
+ fshared = &current->mm->mmap_sem;
+ return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+ restart->futex.bitset);
+}
+
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int detect, ktime_t *time, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct task_struct *curr = current;
+ struct futex_hash_bucket *hb;
+ u32 uval, newval, curval;
+ struct futex_q q;
+ int ret, lock_taken, ownerdied = 0, attempt = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ if (time) {
+ to = &timeout;
+ hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires(&to->timer, *time);
+ }
+
+ q.pi_state = NULL;
+ retry:
+ futex_lock_mm(fshared);
+
+ ret = get_futex_key(uaddr, fshared, &q.key);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ retry_unlocked:
+ hb = queue_lock(&q);
+
+ retry_locked:
+ ret = lock_taken = 0;
+
+ /*
+ * To avoid races, we attempt to take the lock here again
+ * (by doing a 0 -> TID atomic cmpxchg), while holding all
+ * the locks. It will most likely not succeed.
+ */
+ newval = task_pid_vnr(current);
+
+ curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+
+ /*
+ * Detect deadlocks: the caller already holds the futex, so
+ * blocking on it again could never succeed.
+ */
+ if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
+ ret = -EDEADLK;
+ goto out_unlock_release_sem;
+ }
+
+ /*
+ * Surprise - we got the lock. Just return to userspace:
+ */
+ if (unlikely(!curval))
+ goto out_unlock_release_sem;
+
+ uval = curval;
+
+ /*
+ * Set the WAITERS flag, so the owner will know it has someone
+ * to wake at next unlock
+ */
+ newval = curval | FUTEX_WAITERS;
+
+ /*
+ * There are two cases where we take the futex over unconditionally:
+ * the owner TID is 0, i.e. the futex currently has no owner, or the
+ * previous owner died (ownerdied is set).
+ *
+ * This is safe as we are protected by the hash bucket lock !
+ */
+ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+ /* Keep the OWNER_DIED bit */
+ newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
+ ownerdied = 0;
+ lock_taken = 1;
+ }
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+
+ /*
+ * We took the lock due to an owner-died takeover.
+ */
+ if (unlikely(lock_taken))
+ goto out_unlock_release_sem;
+
+ /*
+ * We don't have the lock. Look up the PI state (or create it if
+ * we are the first waiter):
+ */
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
+
+ if (unlikely(ret)) {
+ switch (ret) {
+
+ case -EAGAIN:
+ /*
+ * Task is exiting and we just wait for the
+ * exit to complete.
+ */
+ queue_unlock(&q, hb);
+ futex_unlock_mm(fshared);
+ cond_resched();
+ goto retry;
+
+ case -ESRCH:
+ /*
+ * No owner found for this futex. Check if the
+ * OWNER_DIED bit is set to figure out whether
+ * this is a robust futex or not.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ goto uaddr_faulted;
+
+ /*
+ * We simply start over in case of a robust
+ * futex. The code above will take the futex
+ * and return happy.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ ownerdied = 1;
+ goto retry_locked;
+ }
+ default:
+ goto out_unlock_release_sem;
+ }
+ }
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ queue_me(&q, hb);
+
+ /*
+ * Now that the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ futex_unlock_mm(fshared);
+
+ WARN_ON(!q.pi_state);
+ /*
+ * Block on the PI mutex:
+ */
+ if (!trylock)
+ ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+ else {
+ ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ }
+
+ futex_lock_mm(fshared);
+ spin_lock(q.lock_ptr);
+
+ if (!ret) {
+ /*
+ * Got the lock. We might not be the anticipated owner
+ * if we did a lock-steal - fix up the PI-state in
+ * that case:
+ */
+ if (q.pi_state->owner != curr)
+ ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
+ } else {
+ /*
+ * Catch the rare case where the lock was released
+ * while we were on the way back, before we locked the
+ * hash bucket.
+ */
+ if (q.pi_state->owner == curr) {
+ /*
+ * Try to get the rt_mutex now. This might
+ * fail as some other task acquired the
+ * rt_mutex after we removed ourselves from the
+ * rt_mutex waiters list.
+ */
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ else {
+ /*
+ * pi_state is incorrect, some other
+ * task did a lock steal and we
+ * returned due to timeout or signal
+ * without taking the rt_mutex. Too
+ * late. We can access the
+ * rt_mutex_owner without locking, as
+ * the other task is now blocked on
+ * the hash bucket lock. Fix the state
+ * up.
+ */
+ struct task_struct *owner;
+ int res;
+
+ owner = rt_mutex_owner(&q.pi_state->pi_mutex);
+ res = fixup_pi_state_owner(uaddr, &q, owner,
+ fshared);
+
+ /* propagate -EFAULT, if the fixup failed */
+ if (res)
+ ret = res;
+ }
+ } else {
+ /*
+ * Paranoia check. If we did not take the lock
+ * in the trylock above, then we should not be
+ * the owner of the rtmutex, neither the real
+ * nor the pending one:
+ */
+ if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+ printk(KERN_ERR "futex_lock_pi: ret = %d "
+ "pi-mutex: %p pi-state %p\n", ret,
+ q.pi_state->pi_mutex.owner,
+ q.pi_state->owner);
+ }
+ }
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ futex_unlock_mm(fshared);
+
+ if (to)
+ destroy_hrtimer_on_stack(&to->timer);
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+ out_unlock_release_sem:
+ queue_unlock(&q, hb);
+
+ out_release_sem:
+ futex_unlock_mm(fshared);
+ if (to)
+ destroy_hrtimer_on_stack(&to->timer);
+ return ret;
+
+ uaddr_faulted:
+ /*
+ * We have to r/w *(int __user *)uaddr, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ *
+ * ... and hb->lock. :-) --ANK
+ */
+ queue_unlock(&q, hb);
+
+ if (attempt++) {
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
+ goto out_release_sem;
+ goto retry_unlocked;
+ }
+
+ futex_unlock_mm(fshared);
+
+ ret = get_user(uval, uaddr);
+ if (!ret && (uval != -EFAULT))
+ goto retry;
+
+ if (to)
+ destroy_hrtimer_on_stack(&to->timer);
+ return ret;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ u32 uval;
+ struct plist_head *head;
+ union futex_key key;
+ int ret, attempt = 0;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ return -EPERM;
+ /*
+ * First take all the futex related locks:
+ */
+ futex_lock_mm(fshared);
+
+ ret = get_futex_key(uaddr, fshared, &key);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb = hash_futex(&key);
+retry_unlocked:
+ spin_lock(&hb->lock);
+
+ /*
+ * To avoid races, try to do the TID -> 0 atomic transition
+ * again. If it succeeds then we can return without waking
+ * anyone else up:
+ */
+ if (!(uval & FUTEX_OWNER_DIED))
+ uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
+
+ if (unlikely(uval == -EFAULT))
+ goto pi_faulted;
+ /*
+ * Rare case: we managed to release the lock atomically,
+ * no need to wake anyone else up:
+ */
+ if (unlikely(uval == task_pid_vnr(current)))
+ goto out_unlock;
+
+ /*
+ * Ok, other tasks may need to be woken up - check waiters
+ * and do the wakeup if necessary:
+ */
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (!match_futex (&this->key, &key))
+ continue;
+ ret = wake_futex_pi(uaddr, uval, this);
+ /*
+ * The atomic access to the futex value
+ * generated a pagefault, so retry the
+ * user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ goto out_unlock;
+ }
+ /*
+ * No waiters - kernel unlocks the futex:
+ */
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ }
+
+out_unlock:
+ spin_unlock(&hb->lock);
+out:
+ futex_unlock_mm(fshared);
+
+ return ret;
+
+pi_faulted:
+ /*
+ * We have to r/w *(int __user *)uaddr, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ *
+ * ... and hb->lock. --ANK
+ */
+ spin_unlock(&hb->lock);
+
+ if (attempt++) {
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
+ goto out;
+ uval = 0;
+ goto retry_unlocked;
+ }
+
+ futex_unlock_mm(fshared);
+
+ ret = get_user(uval, uaddr);
+ if (!ret && (uval != -EFAULT))
+ goto retry;
+
+ return ret;
+}
+
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
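+
+/*
+ * Illustration: a rough sketch of the user-space side, as a threading
+ * library might implement it (hypothetical names, no error handling,
+ * lock fast paths omitted):
+ *
+ *	struct my_lock { unsigned int futex_word; struct robust_list entry; };
+ *
+ *	static __thread struct robust_list_head head = {
+ *		.list		 = { .next = &head.list },
+ *		.futex_offset	 = offsetof(struct my_lock, futex_word) -
+ *				   offsetof(struct my_lock, entry),
+ *		.list_op_pending = NULL,
+ *	};
+ *
+ *	syscall(SYS_set_robust_list, &head, sizeof(head));  (once per thread)
+ *
+ * Before acquiring a lock the thread stores &lock->entry in
+ * head.list_op_pending; once the lock word holds its TID it links the
+ * entry into head.list and clears list_op_pending again.
+ */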
+
+/**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+ size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+/**
+ * sys_get_robust_list - get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+ struct robust_list_head __user * __user *, head_ptr,
+ size_t __user *, len_ptr)
+{
+ struct robust_list_head __user *head;
+ unsigned long ret;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (!pid)
+ head = current->robust_list;
+ else {
+ struct task_struct *p;
+
+ ret = -ESRCH;
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ ret = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_PTRACE))
+ goto err_unlock;
+ head = p->robust_list;
+ rcu_read_unlock();
+ }
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+{
+ u32 uval, nval, mval;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
+ if (nval == -EFAULT)
+ return -1;
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS))
+ futex_wake(uaddr, &curr->mm->mmap_sem, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
+ return 0;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
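+
+/*
+ * Example (made-up address): a raw user value of 0x601081 decodes to the
+ * robust_list entry at 0x601080 with *pi = 1 (a PI futex); 0x601080
+ * itself decodes to the same entry with *pi = 0.
+ */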
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+ unsigned long futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending)
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi))
+ return;
+ if (rc)
+ return;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+
+ if (pending)
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
+}
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+{
+ int ret = -ENOSYS;
+ int cmd = op & FUTEX_CMD_MASK;
+ struct rw_semaphore *fshared = NULL;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ fshared = &current->mm->mmap_sem;
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ val3 = FUTEX_BITSET_MATCH_ANY;
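+ /* fall through */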
+ case FUTEX_WAIT_BITSET:
+ ret = futex_wait(uaddr, fshared, val, timeout, val3);
+ break;
+ case FUTEX_WAKE:
+ val3 = FUTEX_BITSET_MATCH_ANY;
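+ /* fall through */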
+ case FUTEX_WAKE_BITSET:
+ ret = futex_wake(uaddr, fshared, val, val3);
+ break;
+ case FUTEX_REQUEUE:
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+ break;
+ case FUTEX_CMP_REQUEUE:
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+ break;
+ case FUTEX_WAKE_OP:
+ ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+ break;
+ case FUTEX_LOCK_PI:
+ if (futex_cmpxchg_enabled)
+ ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+ break;
+ case FUTEX_UNLOCK_PI:
+ if (futex_cmpxchg_enabled)
+ ret = futex_unlock_pi(uaddr, fshared);
+ break;
+ case FUTEX_TRYLOCK_PI:
+ if (futex_cmpxchg_enabled)
+ ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+ return ret;
+}
+
+
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
+ u32 val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET)) {
+ if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
+ return -EFAULT;
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ /*
+ * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE or
+ * FUTEX_CMP_REQUEUE. Number of waiters to wake in 'utime' if
+ * cmd == FUTEX_WAKE_OP.
+ */
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_WAKE_OP)
+ val2 = (u32) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
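+
+/*
+ * Illustration: the minimal user-space wait/wake pattern this syscall
+ * serves (a sketch only; real code would use atomic accesses and, where
+ * possible, FUTEX_PRIVATE_FLAG):
+ *
+ *	waiter:	while (flag == 0)
+ *			syscall(SYS_futex, &flag, FUTEX_WAIT, 0, NULL, NULL, 0);
+ *
+ *	waker:	flag = 1;
+ *		syscall(SYS_futex, &flag, FUTEX_WAKE, 1, NULL, NULL, 0);
+ *
+ * FUTEX_WAIT only sleeps if 'flag' still reads 0 inside futex_wait(),
+ * otherwise it returns -EWOULDBLOCK, which is what makes the unlocked
+ * test in the waiter race-free.
+ */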
+
+static int __init futex_init(void)
+{
+ u32 curval;
+ int i;
+
+ /*
+ * This will fail and we want it. Some arch implementations do
+ * runtime detection of the futex_atomic_cmpxchg_inatomic()
+ * functionality. We want to know that before we call in any
+ * of the complex code paths. Also we want to prevent
+ * registration of robust lists in that case. NULL is
+ * guaranteed to fault and we get -EFAULT on a functional
+ * implementation; the non-functional ones will return
+ * -ENOSYS.
+ */
+ curval = cmpxchg_futex_value_locked(NULL, 0, 0);
+ if (curval == -EFAULT)
+ futex_cmpxchg_enabled = 1;
+
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+ return 0;
+}
+__initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 0000000..04ac3a9
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,195 @@
+/*
+ * linux/kernel/futex_compat.c
+ *
+ * Futex compatibility routines.
+ *
+ * Copyright 2006, Red Hat, Inc., Ingo Molnar
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/nsproxy.h>
+#include <linux/futex.h>
+
+#include <asm/uaccess.h>
+
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
+}
+
+asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+ compat_size_t len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+asmlinkage long
+compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
+ compat_size_t __user *len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (!pid)
+ head = current->compat_robust_list;
+ else {
+ struct task_struct *p;
+
+ ret = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ ret = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_PTRACE))
+ goto err_unlock;
+ head = p->compat_robust_list;
+ read_unlock(&tasklist_lock);
+ }
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
+ struct compat_timespec __user *utime, u32 __user *uaddr2,
+ u32 val3)
+{
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
+ int val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET)) {
+ if (get_compat_timespec(&ts, utime))
+ return -EFAULT;
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
new file mode 100644
index 0000000..3016911
--- /dev/null
+++ b/kernel/hrtimer.c
@@ -0,0 +1,1924 @@
+/*
+ * linux/kernel/hrtimer.c
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
+ *
+ * High-resolution kernel timers
+ *
+ * In contrast to the low-resolution timeout API implemented in
+ * kernel/timer.c, hrtimers provide finer resolution and accuracy
+ * depending on system configuration and capabilities.
+ *
+ * These timers are currently used for:
+ * - itimers
+ * - POSIX timers
+ * - nanosleep
+ * - precise in-kernel timing
+ *
+ * Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * Credits:
+ * based on kernel/timer.c
+ *
+ * Help, testing, suggestions, bugfixes, improvements were
+ * provided by:
+ *
+ * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
+ * et al.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/cpu.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/tick.h>
+#include <linux/seq_file.h>
+#include <linux/err.h>
+#include <linux/debugobjects.h>
+
+#include <asm/uaccess.h>
+
+/**
+ * ktime_get - get the monotonic time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get(void)
+{
+ struct timespec now;
+
+ ktime_get_ts(&now);
+
+ return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get_real(void)
+{
+ struct timespec now;
+
+ getnstimeofday(&now);
+
+ return timespec_to_ktime(now);
+}
+
+EXPORT_SYMBOL_GPL(ktime_get_real);
+
+/*
+ * The timer bases:
+ *
+ * Note: If we want to add new timer bases, we have to skip the two
+ * clock ids captured by the cpu-timers. We do this by holding empty
+ * entries rather than doing math adjustment of the clock ids.
+ * This ensures that we capture erroneous accesses to these clock ids
+ * rather than moving them into the range of valid clock ids.
+ */
+DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
+{
+
+ .clock_base =
+ {
+ {
+ .index = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ .resolution = KTIME_LOW_RES,
+ },
+ }
+};
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+ struct timespec tomono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ getnstimeofday(ts);
+ tomono = wall_to_monotonic;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+
+/*
+ * Get the coarse grained time at the softirq based on xtime and
+ * wall_to_monotonic.
+ */
+static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
+{
+ ktime_t xtim, tomono;
+ struct timespec xts, tom;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ xts = current_kernel_time();
+ tom = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ xtim = timespec_to_ktime(xts);
+ tomono = timespec_to_ktime(tom);
+ base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
+ base->clock_base[CLOCK_MONOTONIC].softirq_time =
+ ktime_add(xtim, tomono);
+}
+
+/*
+ * Functions and macros which are different for UP/SMP systems are kept in a
+ * single place
+ */
+#ifdef CONFIG_SMP
+
+/*
+ * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on the lists/queues.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static
+struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
+{
+ struct hrtimer_clock_base *base;
+
+ for (;;) {
+ base = timer->base;
+ if (likely(base != NULL)) {
+ spin_lock_irqsave(&base->cpu_base->lock, *flags);
+ if (likely(base == timer->base))
+ return base;
+ /* The timer has migrated to another CPU: */
+ spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
+ }
+ cpu_relax();
+ }
+}
+
+/*
+ * Switch the timer base to the current CPU when possible.
+ */
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
+{
+ struct hrtimer_clock_base *new_base;
+ struct hrtimer_cpu_base *new_cpu_base;
+
+ new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ new_base = &new_cpu_base->clock_base[base->index];
+
+ if (base != new_base) {
+ /*
+ * We are trying to schedule the timer on the local CPU.
+ * However we can't change timer's base while it is running,
+ * so we keep it on the same CPU. No hassle vs. reprogramming
+ * the event source in the high resolution case. The softirq
+ * code will take care of this when the timer function has
+ * completed. There is no conflict as we hold the lock until
+ * the timer is enqueued.
+ */
+ if (unlikely(hrtimer_callback_running(timer)))
+ return base;
+
+ /* See the comment in lock_timer_base() */
+ timer->base = NULL;
+ spin_unlock(&base->cpu_base->lock);
+ spin_lock(&new_base->cpu_base->lock);
+ timer->base = new_base;
+ }
+ return new_base;
+}
+
+#else /* CONFIG_SMP */
+
+static inline struct hrtimer_clock_base *
+lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+{
+ struct hrtimer_clock_base *base = timer->base;
+
+ spin_lock_irqsave(&base->cpu_base->lock, *flags);
+
+ return base;
+}
+
+# define switch_hrtimer_base(t, b) (b)
+
+#endif /* !CONFIG_SMP */
+
+/*
+ * Functions for the union type storage format of ktime_t which are
+ * too large for inlining:
+ */
+#if BITS_PER_LONG < 64
+# ifndef CONFIG_KTIME_SCALAR
+/**
+ * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
+ * @kt: addend
+ * @nsec: the scalar nsec value to add
+ *
+ * Returns the sum of kt and nsec in ktime_t format
+ */
+ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
+{
+ ktime_t tmp;
+
+ if (likely(nsec < NSEC_PER_SEC)) {
+ tmp.tv64 = nsec;
+ } else {
+ unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+
+ tmp = ktime_set((long)nsec, rem);
+ }
+
+ return ktime_add(kt, tmp);
+}
+
+EXPORT_SYMBOL_GPL(ktime_add_ns);
+
+/**
+ * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
+ * @kt: minuend
+ * @nsec: the scalar nsec value to subtract
+ *
+ * Returns the subtraction of @nsec from @kt in ktime_t format
+ */
+ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
+{
+ ktime_t tmp;
+
+ if (likely(nsec < NSEC_PER_SEC)) {
+ tmp.tv64 = nsec;
+ } else {
+ unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+
+ tmp = ktime_set((long)nsec, rem);
+ }
+
+ return ktime_sub(kt, tmp);
+}
+
+EXPORT_SYMBOL_GPL(ktime_sub_ns);
+# endif /* !CONFIG_KTIME_SCALAR */
+
+/*
+ * Divide a ktime value by a nanosecond value
+ */
+u64 ktime_divns(const ktime_t kt, s64 div)
+{
+ u64 dclc;
+ int sft = 0;
+
+ dclc = ktime_to_ns(kt);
+ /* Make sure the divisor is less than 2^32: */
+ while (div >> 32) {
+ sft++;
+ div >>= 1;
+ }
+ dclc >>= sft;
+ do_div(dclc, (unsigned long) div);
+
+ return dclc;
+}
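+
+/*
+ * Note on precision: shifting dividend and divisor by the same amount
+ * keeps the quotient approximately unchanged; the bits dropped from the
+ * dividend introduce only a small error, which is acceptable for the
+ * interval/overrun arithmetic this helper is used for.
+ */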
+#endif /* BITS_PER_LONG < 64 */
+
+/*
+ * Add two ktime values and do a safety check for overflow:
+ */
+ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
+{
+ ktime_t res = ktime_add(lhs, rhs);
+
+ /*
+ * We use KTIME_SEC_MAX here, the maximum timeout which we can
+ * return to user space in a timespec:
+ */
+ if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
+ res = ktime_set(KTIME_SEC_MAX, 0);
+
+ return res;
+}
+
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+
+static struct debug_obj_descr hrtimer_debug_descr;
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ hrtimer_cancel(timer);
+ debug_object_init(timer, &hrtimer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ WARN_ON_ONCE(1);
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ WARN_ON(1);
+
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ hrtimer_cancel(timer);
+ debug_object_free(timer, &hrtimer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static struct debug_obj_descr hrtimer_debug_descr = {
+ .name = "hrtimer",
+ .fixup_init = hrtimer_fixup_init,
+ .fixup_activate = hrtimer_fixup_activate,
+ .fixup_free = hrtimer_fixup_free,
+};
+
+static inline void debug_hrtimer_init(struct hrtimer *timer)
+{
+ debug_object_init(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_activate(struct hrtimer *timer)
+{
+ debug_object_activate(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
+{
+ debug_object_deactivate(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_free(struct hrtimer *timer)
+{
+ debug_object_free(timer, &hrtimer_debug_descr);
+}
+
+static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode);
+
+void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ debug_object_init_on_stack(timer, &hrtimer_debug_descr);
+ __hrtimer_init(timer, clock_id, mode);
+}
+
+void destroy_hrtimer_on_stack(struct hrtimer *timer)
+{
+ debug_object_free(timer, &hrtimer_debug_descr);
+}
+
+#else
+static inline void debug_hrtimer_init(struct hrtimer *timer) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+#endif
+
+/*
+ * Check, whether the timer is on the callback pending list
+ */
+static inline int hrtimer_cb_pending(const struct hrtimer *timer)
+{
+ return timer->state & HRTIMER_STATE_PENDING;
+}
+
+/*
+ * Remove a timer from the callback pending list
+ */
+static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
+{
+ list_del_init(&timer->cb_entry);
+}
+
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer enabled ?
+ */
+static int hrtimer_hres_enabled __read_mostly = 1;
+
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ if (!strcmp(str, "off"))
+ hrtimer_hres_enabled = 0;
+ else if (!strcmp(str, "on"))
+ hrtimer_hres_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("highres=", setup_hrtimer_hres);
+
+/*
+ * hrtimer_is_hres_enabled - query whether high resolution mode is enabled
+ */
+static inline int hrtimer_is_hres_enabled(void)
+{
+ return hrtimer_hres_enabled;
+}
+
+/*
+ * Is the high resolution mode active ?
+ */
+static inline int hrtimer_hres_active(void)
+{
+ return __get_cpu_var(hrtimer_bases).hres_active;
+}
+
+/*
+ * Reprogram the event source with checking both queues for the
+ * next event
+ * Called with interrupts disabled and base->lock held
+ */
+static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
+{
+ int i;
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t expires;
+
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct hrtimer *timer;
+
+ if (!base->first)
+ continue;
+ timer = rb_entry(base->first, struct hrtimer, node);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ if (expires.tv64 < cpu_base->expires_next.tv64)
+ cpu_base->expires_next = expires;
+ }
+
+ if (cpu_base->expires_next.tv64 != KTIME_MAX)
+ tick_program_event(cpu_base->expires_next, 1);
+}
+
+/*
+ * Shared reprogramming for clock_realtime and clock_monotonic
+ *
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ int res;
+
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+
+ /*
+ * When the callback is running, we do not reprogram the clock event
+ * device. The timer callback is either running on a different CPU or
+ * the callback is executed in the hrtimer_interrupt context. The
+ * reprogramming is handled either by the softirq, which called the
+ * callback or at the end of the hrtimer_interrupt.
+ */
+ if (hrtimer_callback_running(timer))
+ return 0;
+
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Nothing wrong
+ * with that; just avoid calling into the tick code, which
+ * now has objections against negative expiry values.
+ */
+ if (expires.tv64 < 0)
+ return -ETIME;
+
+ if (expires.tv64 >= expires_next->tv64)
+ return 0;
+
+ /*
+ * Clockevents returns -ETIME, when the event was in the past.
+ */
+ res = tick_program_event(expires, 0);
+ if (!IS_ERR_VALUE(res))
+ *expires_next = expires;
+ return res;
+}
+
+
+/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+ struct hrtimer_cpu_base *base;
+ struct timespec realtime_offset;
+ unsigned long seq;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ set_normalized_timespec(&realtime_offset,
+ -wall_to_monotonic.tv_sec,
+ -wall_to_monotonic.tv_nsec);
+ } while (read_seqretry(&xtime_lock, seq));
+
+ base = &__get_cpu_var(hrtimer_bases);
+
+ /* Adjust CLOCK_REALTIME offset */
+ spin_lock(&base->lock);
+ base->clock_base[CLOCK_REALTIME].offset =
+ timespec_to_ktime(realtime_offset);
+
+ hrtimer_force_reprogram(base);
+ spin_unlock(&base->lock);
+}
+
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock.
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+ /* Retrigger the CPU local events everywhere */
+ on_each_cpu(retrigger_next_event, NULL, 1);
+}
+
+/*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt (on the local CPU):
+ */
+void hres_timers_resume(void)
+{
+ /* Retrigger the CPU local events: */
+ retrigger_next_event(NULL);
+}
+
+/*
+ * Initialize the high resolution related parts of cpu_base
+ */
+static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
+{
+ base->expires_next.tv64 = KTIME_MAX;
+ base->hres_active = 0;
+}
+
+/*
+ * Initialize the high resolution related parts of a hrtimer
+ */
+static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
+{
+}
+
+/*
+ * When High resolution timers are active, try to reprogram. Note, that in case
+ * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
+ * check happens. The timer gets enqueued into the rbtree. The reprogramming
+ * and expiry check is done in the hrtimer_interrupt or in the softirq.
+ */
+static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
+
+ /* Timer is expired, act upon the callback mode */
+ switch(timer->cb_mode) {
+ case HRTIMER_CB_IRQSAFE_PERCPU:
+ case HRTIMER_CB_IRQSAFE_UNLOCKED:
+ /*
+ * This is solely for the sched tick emulation with
+ * dynamic tick support to ensure that we do not
+ * restart the tick right on the edge and end up with
+ * the tick timer in the softirq ! The calling site
+ * takes care of this. Also used for hrtimer sleeper !
+ */
+ debug_hrtimer_deactivate(timer);
+ return 1;
+ case HRTIMER_CB_SOFTIRQ:
+ /*
+ * Move everything else into the softirq pending list !
+ */
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ timer->state = HRTIMER_STATE_PENDING;
+ return 1;
+ default:
+ BUG();
+ }
+ }
+ return 0;
+}
+
+/*
+ * Switch to high resolution mode
+ */
+static int hrtimer_switch_to_hres(void)
+{
+ int cpu = smp_processor_id();
+ struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
+ unsigned long flags;
+
+ if (base->hres_active)
+ return 1;
+
+ local_irq_save(flags);
+
+ if (tick_init_highres()) {
+ local_irq_restore(flags);
+ printk(KERN_WARNING "Could not switch to high resolution "
+ "mode on CPU %d\n", cpu);
+ return 0;
+ }
+ base->hres_active = 1;
+ base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
+ base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
+
+ tick_setup_sched_timer();
+
+ /* "Retrigger" the interrupt to get things going */
+ retrigger_next_event(NULL);
+ local_irq_restore(flags);
+ printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
+ smp_processor_id());
+ return 1;
+}
+
+static inline void hrtimer_raise_softirq(void)
+{
+ raise_softirq(HRTIMER_SOFTIRQ);
+}
+
+#else
+
+static inline int hrtimer_hres_active(void) { return 0; }
+static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
+static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ return 0;
+}
+static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ return 0;
+}
+static inline void hrtimer_raise_softirq(void) { }
+
+#endif /* CONFIG_HIGH_RES_TIMERS */
+
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
+{
+ if (timer->start_site)
+ return;
+
+ timer->start_site = addr;
+ memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+ timer->start_pid = current->pid;
+}
+#endif
+
+/*
+ * Counterpart to lock_hrtimer_base above:
+ */
+static inline
+void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+{
+ spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
+}
+
+/**
+ * hrtimer_forward - forward the timer expiry
+ * @timer: hrtimer to forward
+ * @now: forward past this time
+ * @interval: the interval to forward
+ *
+ * Forward the timer expiry so it will expire in the future.
+ * Returns the number of overruns.
+ */
+u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
+{
+ u64 orun = 1;
+ ktime_t delta;
+
+ delta = ktime_sub(now, hrtimer_get_expires(timer));
+
+ if (delta.tv64 < 0)
+ return 0;
+
+ if (interval.tv64 < timer->base->resolution.tv64)
+ interval.tv64 = timer->base->resolution.tv64;
+
+ if (unlikely(delta.tv64 >= interval.tv64)) {
+ s64 incr = ktime_to_ns(interval);
+
+ orun = ktime_divns(delta, incr);
+ hrtimer_add_expires_ns(timer, incr * orun);
+ if (hrtimer_get_expires_tv64(timer) > now.tv64)
+ return orun;
+ /*
+ * This (and the ktime_add() below) is the
+ * correction for exact:
+ */
+ orun++;
+ }
+ hrtimer_add_expires(timer, interval);
+
+ return orun;
+}
+EXPORT_SYMBOL_GPL(hrtimer_forward);
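+
+/*
+ * Illustrative sketch (hypothetical callback, not defined in this file):
+ * a periodic CLOCK_MONOTONIC timer typically forwards its own expiry
+ * past "now" from the callback and asks to be restarted:
+ *
+ *	static enum hrtimer_restart my_periodic_cb(struct hrtimer *t)
+ *	{
+ *		hrtimer_forward(t, ktime_get(), ktime_set(1, 0));
+ *		return HRTIMER_RESTART;
+ *	}
+ *
+ * The returned overrun count tells the caller how many periods were
+ * skipped, which is useful for detecting missed expirations.
+ */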
+
+/*
+ * enqueue_hrtimer - internal function to (re)start a timer
+ *
+ * The timer is inserted in expiry order. Insertion into the
+ * red black tree is O(log(n)). Must hold the base lock.
+ */
+static void enqueue_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base, int reprogram)
+{
+ struct rb_node **link = &base->active.rb_node;
+ struct rb_node *parent = NULL;
+ struct hrtimer *entry;
+ int leftmost = 1;
+
+ debug_hrtimer_activate(timer);
+
+ /*
+ * Find the right place in the rbtree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct hrtimer, node);
+ /*
+ * We don't care about collisions. Nodes with
+ * the same expiry time stay together.
+ */
+ if (hrtimer_get_expires_tv64(timer) <
+ hrtimer_get_expires_tv64(entry)) {
+ link = &(*link)->rb_left;
+ } else {
+ link = &(*link)->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Insert the timer to the rbtree and check whether it
+ * replaces the first pending timer
+ */
+ if (leftmost) {
+ /*
+ * Reprogram the clock event device. When the timer is already
+ * expired hrtimer_enqueue_reprogram has either called the
+ * callback or added it to the pending list and raised the
+ * softirq.
+ *
+ * This is a NOP for !HIGHRES
+ */
+ if (reprogram && hrtimer_enqueue_reprogram(timer, base))
+ return;
+
+ base->first = &timer->node;
+ }
+
+ rb_link_node(&timer->node, parent, link);
+ rb_insert_color(&timer->node, &base->active);
+ /*
+ * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
+ * state of a possibly running callback.
+ */
+ timer->state |= HRTIMER_STATE_ENQUEUED;
+}
+
+/*
+ * __remove_hrtimer - internal function to remove a timer
+ *
+ * Caller must hold the base lock.
+ *
+ * High resolution timer mode reprograms the clock event device when the
+ * timer is the one which expires next. The caller can disable this by setting
+ * reprogram to zero. This is useful when the context does a reprogramming
+ * anyway (e.g. the timer interrupt).
+ */
+static void __remove_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base,
+ unsigned long newstate, int reprogram)
+{
+ /* High res. callback list. NOP for !HIGHRES */
+ if (hrtimer_cb_pending(timer))
+ hrtimer_remove_cb_pending(timer);
+ else {
+ /*
+ * Remove the timer from the rbtree and replace the
+ * first entry pointer if necessary.
+ */
+ if (base->first == &timer->node) {
+ base->first = rb_next(&timer->node);
+ /* Reprogram the clock event device, if enabled */
+ if (reprogram && hrtimer_hres_active())
+ hrtimer_force_reprogram(base->cpu_base);
+ }
+ rb_erase(&timer->node, &base->active);
+ }
+ timer->state = newstate;
+}
+
+/*
+ * remove hrtimer, called with base lock held
+ */
+static inline int
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+{
+ if (hrtimer_is_queued(timer)) {
+ int reprogram;
+
+ /*
+ * Remove the timer and force reprogramming when high
+ * resolution mode is active and the timer is on the current
+ * CPU. If we remove a timer on another CPU, reprogramming is
+ * skipped. The interrupt event on this CPU is fired and
+ * reprogramming happens in the interrupt handler. This is a
+ * rare case and less expensive than a smp call.
+ */
+ debug_hrtimer_deactivate(timer);
+ timer_stats_hrtimer_clear_start_info(timer);
+ reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
+ reprogram);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+ const enum hrtimer_mode mode)
+{
+ struct hrtimer_clock_base *base, *new_base;
+ unsigned long flags;
+ int ret, raise;
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ /* Remove an active timer from the queue: */
+ ret = remove_hrtimer(timer, base);
+
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base);
+
+ if (mode == HRTIMER_MODE_REL) {
+ tim = ktime_add_safe(tim, new_base->get_time());
+ /*
+ * CONFIG_TIME_LOW_RES is a temporary way for architectures
+ * to signal that they simply return xtime in
+ * do_gettimeoffset(). In this case we want to round up by
+ * resolution when starting a relative timer, to avoid short
+ * timeouts. This will go away with the GTOD framework.
+ */
+#ifdef CONFIG_TIME_LOW_RES
+ tim = ktime_add_safe(tim, base->resolution);
+#endif
+ }
+
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+
+ timer_stats_hrtimer_set_start_info(timer);
+
+ /*
+ * Only allow reprogramming if the new base is on this CPU.
+ * (it might still be on another CPU if the timer was pending)
+ */
+ enqueue_hrtimer(timer, new_base,
+ new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
+
+ /*
+ * The timer may be expired and moved to the cb_pending
+ * list. We can not raise the softirq with base lock held due
+ * to a possible deadlock with runqueue lock.
+ */
+ raise = timer->state == HRTIMER_STATE_PENDING;
+
+ /*
+ * We use preempt_disable to prevent this task from migrating after
+ * setting up the softirq and raising it. Otherwise, if we migrate
+ * we will raise the softirq on the wrong CPU.
+ */
+ preempt_disable();
+
+ unlock_hrtimer_base(timer, &flags);
+
+ if (raise)
+ hrtimer_raise_softirq();
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+ return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start);
+
+
+/**
+ * hrtimer_try_to_cancel - try to deactivate a timer
+ * @timer: hrtimer to stop
+ *
+ * Returns:
+ * 0 when the timer was not active
+ * 1 when the timer was active
+ * -1 when the timer is currently executing the callback function and
+ * cannot be stopped
+ */
+int hrtimer_try_to_cancel(struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+ int ret = -1;
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ if (!hrtimer_callback_running(timer))
+ ret = remove_hrtimer(timer, base);
+
+ unlock_hrtimer_base(timer, &flags);
+
+ return ret;
+
+}
+EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
+
+/**
+ * hrtimer_cancel - cancel a timer and wait for the handler to finish.
+ * @timer: the timer to be cancelled
+ *
+ * Returns:
+ * 0 when the timer was not active
+ * 1 when the timer was active
+ */
+int hrtimer_cancel(struct hrtimer *timer)
+{
+ for (;;) {
+ int ret = hrtimer_try_to_cancel(timer);
+
+ if (ret >= 0)
+ return ret;
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(hrtimer_cancel);
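+
+/*
+ * Note: hrtimer_cancel() busy-waits (see the cpu_relax() loop above)
+ * until a running callback has finished, so it must not be called from
+ * that timer's own callback path or while holding a lock the callback
+ * needs. hrtimer_try_to_cancel() is the non-blocking variant.
+ */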
+
+/**
+ * hrtimer_get_remaining - get remaining time for the timer
+ * @timer: the timer to read
+ */
+ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+ ktime_t rem;
+
+ base = lock_hrtimer_base(timer, &flags);
+ rem = hrtimer_expires_remaining(timer);
+ unlock_hrtimer_base(timer, &flags);
+
+ return rem;
+}
+EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+
+#ifdef CONFIG_NO_HZ
+/**
+ * hrtimer_get_next_event - get the time until next expiry event
+ *
+ * Returns the delta to the next expiry event or KTIME_MAX if no timer
+ * is pending.
+ */
+ktime_t hrtimer_get_next_event(void)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (!hrtimer_hres_active()) {
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct hrtimer *timer;
+
+ if (!base->first)
+ continue;
+
+ timer = rb_entry(base->first, struct hrtimer, node);
+ delta.tv64 = hrtimer_get_expires_tv64(timer);
+ delta = ktime_sub(delta, base->get_time());
+ if (delta.tv64 < mindelta.tv64)
+ mindelta.tv64 = delta.tv64;
+ }
+ }
+
+ spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+ if (mindelta.tv64 < 0)
+ mindelta.tv64 = 0;
+ return mindelta;
+}
+#endif
+
+static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ struct hrtimer_cpu_base *cpu_base;
+
+ memset(timer, 0, sizeof(struct hrtimer));
+
+ cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+
+ if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ clock_id = CLOCK_MONOTONIC;
+
+ timer->base = &cpu_base->clock_base[clock_id];
+ INIT_LIST_HEAD(&timer->cb_entry);
+ hrtimer_init_timer_hres(timer);
+
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+ timer->start_pid = -1;
+ memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
+}
+
+/**
+ * hrtimer_init - initialize a timer to the given clock
+ * @timer: the timer to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ */
+void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ debug_hrtimer_init(timer);
+ __hrtimer_init(timer, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init);
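+
+/*
+ * Minimal usage sketch (hypothetical caller; my_timer and my_periodic_cb
+ * are placeholders): set up a relative monotonic timer that fires in
+ * 100 milliseconds:
+ *
+ *	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ *	my_timer.function = my_periodic_cb;
+ *	hrtimer_start(&my_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
+ *		      HRTIMER_MODE_REL);
+ */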
+
+/**
+ * hrtimer_get_res - get the timer resolution for a clock
+ * @which_clock: which clock to query
+ * @tp: pointer to timespec variable to store the resolution
+ *
+ * Store the resolution of the clock selected by @which_clock in the
+ * variable pointed to by @tp.
+ */
+int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+{
+ struct hrtimer_cpu_base *cpu_base;
+
+ cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+ *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hrtimer_get_res);
+
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+ spin_lock_irq(&cpu_base->lock);
+
+ while (!list_empty(&cpu_base->cb_pending)) {
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ struct hrtimer *timer;
+ int restart;
+ int emulate_hardirq_ctx = 0;
+
+ timer = list_entry(cpu_base->cb_pending.next,
+ struct hrtimer, cb_entry);
+
+ debug_hrtimer_deactivate(timer);
+ timer_stats_account_hrtimer(timer);
+
+ fn = timer->function;
+ /*
+ * A timer might have been added to the cb_pending list
+ * when it was migrated during a cpu-offline operation.
+ * Emulate hardirq context for such timers.
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
+ emulate_hardirq_ctx = 1;
+
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+ spin_unlock_irq(&cpu_base->lock);
+
+ if (unlikely(emulate_hardirq_ctx)) {
+ local_irq_disable();
+ restart = fn(timer);
+ local_irq_enable();
+ } else
+ restart = fn(timer);
+
+ spin_lock_irq(&cpu_base->lock);
+
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+ if (restart == HRTIMER_RESTART) {
+ BUG_ON(hrtimer_active(timer));
+ /*
+ * Enqueue the timer, allow reprogramming of the event
+ * device
+ */
+ enqueue_hrtimer(timer, timer->base, 1);
+ } else if (hrtimer_active(timer)) {
+ /*
+ * If the timer was rearmed on another CPU, reprogram
+ * the event device.
+ */
+ struct hrtimer_clock_base *base = timer->base;
+
+ if (base->first == &timer->node &&
+ hrtimer_reprogram(timer, base)) {
+ /*
+ * Timer is expired. Thus move it from tree to
+ * pending list again.
+ */
+ __remove_hrtimer(timer, base,
+ HRTIMER_STATE_PENDING, 0);
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ }
+ }
+ }
+ spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base = timer->base;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ int restart;
+
+ debug_hrtimer_deactivate(timer);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ timer_stats_account_hrtimer(timer);
+
+ fn = timer->function;
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
+ /*
+ * Used for scheduler timers, avoid lock inversion with
+ * rq->lock and tasklist_lock.
+ *
+ * These timers are required to deal with enqueue expiry
+ * themselves and are not allowed to migrate.
+ */
+ spin_unlock(&cpu_base->lock);
+ restart = fn(timer);
+ spin_lock(&cpu_base->lock);
+ } else
+ restart = fn(timer);
+
+ /*
+ * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+ * reprogramming of the event hardware. This happens at the end of this
+ * function anyway.
+ */
+ if (restart != HRTIMER_NORESTART) {
+ BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ enqueue_hrtimer(timer, base, 0);
+ }
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_clock_base *base;
+ ktime_t expires_next, now;
+ int i, raise = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+ retry:
+ now = ktime_get();
+
+ expires_next.tv64 = KTIME_MAX;
+
+ base = cpu_base->clock_base;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ ktime_t basenow;
+ struct rb_node *node;
+
+ spin_lock(&cpu_base->lock);
+
+ basenow = ktime_add(now, base->offset);
+
+ while ((node = base->first)) {
+ struct hrtimer *timer;
+
+ timer = rb_entry(node, struct hrtimer, node);
+
+ /*
+ * The immediate goal for using the softexpires is
+ * minimizing wakeups, not running timers at the
+ * earliest interrupt after their soft expiration.
+ * This allows us to avoid using a Priority Search
+ * Tree, which can answer a stabbing query for
+ * overlapping intervals and instead use the simple
+ * BST we already have.
+ * We don't add extra wakeups by delaying timers that
+ * are right-of a not yet expired timer, because that
+ * timer will have to trigger a wakeup anyway.
+ */
+
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
+ ktime_t expires;
+
+ expires = ktime_sub(hrtimer_get_expires(timer),
+ base->offset);
+ if (expires.tv64 < expires_next.tv64)
+ expires_next = expires;
+ break;
+ }
+
+ /* Move softirq callbacks to the pending list */
+ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+ __remove_hrtimer(timer, base,
+ HRTIMER_STATE_PENDING, 0);
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ raise = 1;
+ continue;
+ }
+
+ __run_hrtimer(timer);
+ }
+ spin_unlock(&cpu_base->lock);
+ base++;
+ }
+
+ cpu_base->expires_next = expires_next;
+
+ /* Reprogramming necessary ? */
+ if (expires_next.tv64 != KTIME_MAX) {
+ if (tick_program_event(expires_next, 0))
+ goto retry;
+ }
+
+ /* Raise softirq ? */
+ if (raise)
+ raise_softirq(HRTIMER_SOFTIRQ);
+}
+
+/**
+ * hrtimer_peek_ahead_timers - run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+ struct tick_device *td;
+ unsigned long flags;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ local_irq_save(flags);
+ td = &__get_cpu_var(tick_cpu_device);
+ if (td && td->evtdev)
+ hrtimer_interrupt(td->evtdev);
+ local_irq_restore(flags);
+}
+
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+ run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
+
+#endif /* CONFIG_HIGH_RES_TIMERS */
+
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ if (hrtimer_hres_active())
+ return;
+
+ /*
+ * This _is_ ugly: We have to check in the softirq context,
+ * whether we can switch to highres and / or nohz mode. The
+ * clocksource switch happens in the timer interrupt with
+ * xtime_lock held. Notification from there only sets the
+ * check bit in the tick_oneshot code, otherwise we might
+ * deadlock vs. xtime_lock.
+ */
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ hrtimer_switch_to_hres();
+
+ run_hrtimer_pending(cpu_base);
+}
+
+/*
+ * Called from hardirq context every jiffy
+ */
+void hrtimer_run_queues(void)
+{
+ struct rb_node *node;
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_clock_base *base;
+ int index, gettime = 1;
+
+ if (hrtimer_hres_active())
+ return;
+
+ for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
+ base = &cpu_base->clock_base[index];
+
+ if (!base->first)
+ continue;
+
+ if (gettime) {
+ hrtimer_get_softirq_time(cpu_base);
+ gettime = 0;
+ }
+
+ spin_lock(&cpu_base->lock);
+
+ while ((node = base->first)) {
+ struct hrtimer *timer;
+
+ timer = rb_entry(node, struct hrtimer, node);
+ if (base->softirq_time.tv64 <=
+ hrtimer_get_expires_tv64(timer))
+ break;
+
+ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+ __remove_hrtimer(timer, base,
+ HRTIMER_STATE_PENDING, 0);
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ continue;
+ }
+
+ __run_hrtimer(timer);
+ }
+ spin_unlock(&cpu_base->lock);
+ }
+}
+
+/*
+ * Sleep related functions:
+ */
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
+{
+ struct hrtimer_sleeper *t =
+ container_of(timer, struct hrtimer_sleeper, timer);
+ struct task_struct *task = t->task;
+
+ t->task = NULL;
+ if (task)
+ wake_up_process(task);
+
+ return HRTIMER_NORESTART;
+}
+
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+{
+ sl->timer.function = hrtimer_wakeup;
+ sl->task = task;
+#ifdef CONFIG_HIGH_RES_TIMERS
+ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+#endif
+}
+
+static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
+{
+ hrtimer_init_sleeper(t, current);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_start_expires(&t->timer, mode);
+ if (!hrtimer_active(&t->timer))
+ t->task = NULL;
+
+ if (likely(t->task))
+ schedule();
+
+ hrtimer_cancel(&t->timer);
+ mode = HRTIMER_MODE_ABS;
+
+ } while (t->task && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+
+ return t->task == NULL;
+}
+
+static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
+{
+ struct timespec rmt;
+ ktime_t rem;
+
+ rem = hrtimer_expires_remaining(timer);
+ if (rem.tv64 <= 0)
+ return 0;
+ rmt = ktime_to_timespec(rem);
+
+ if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+ return -EFAULT;
+
+ return 1;
+}
+
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
+{
+ struct hrtimer_sleeper t;
+ struct timespec __user *rmtp;
+ int ret = 0;
+
+ hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
+ HRTIMER_MODE_ABS);
+ hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+
+ if (do_nanosleep(&t, HRTIMER_MODE_ABS))
+ goto out;
+
+ rmtp = restart->nanosleep.rmtp;
+ if (rmtp) {
+ ret = update_rmtp(&t.timer, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* The other values in restart are already filled in */
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ destroy_hrtimer_on_stack(&t.timer);
+ return ret;
+}
+
+long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+ const enum hrtimer_mode mode, const clockid_t clockid)
+{
+ struct restart_block *restart;
+ struct hrtimer_sleeper t;
+ int ret = 0;
+ unsigned long slack;
+
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
+
+ hrtimer_init_on_stack(&t.timer, clockid, mode);
+ hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
+ if (do_nanosleep(&t, mode))
+ goto out;
+
+ /* Absolute timers do not update the rmtp value and restart: */
+ if (mode == HRTIMER_MODE_ABS) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+
+ if (rmtp) {
+ ret = update_rmtp(&t.timer, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ restart = &current_thread_info()->restart_block;
+ restart->fn = hrtimer_nanosleep_restart;
+ restart->nanosleep.index = t.timer.base->index;
+ restart->nanosleep.rmtp = rmtp;
+ restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ destroy_hrtimer_on_stack(&t.timer);
+ return ret;
+}
+
+SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
+ struct timespec __user *, rmtp)
+{
+ struct timespec tu;
+
+ if (copy_from_user(&tu, rqtp, sizeof(tu)))
+ return -EFAULT;
+
+ if (!timespec_valid(&tu))
+ return -EINVAL;
+
+ return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+}
+
+/*
+ * Functions related to boot-time initialization:
+ */
+static void __cpuinit init_hrtimers_cpu(int cpu)
+{
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+ int i;
+
+ spin_lock_init(&cpu_base->lock);
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ cpu_base->clock_base[i].cpu_base = cpu_base;
+
+ INIT_LIST_HEAD(&cpu_base->cb_pending);
+ hrtimer_init_hres(cpu_base);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+ struct hrtimer_clock_base *new_base, int dcpu)
+{
+ struct hrtimer *timer;
+ struct rb_node *node;
+ int raise = 0;
+
+ while ((node = rb_first(&old_base->active))) {
+ timer = rb_entry(node, struct hrtimer, node);
+ BUG_ON(hrtimer_callback_running(timer));
+ debug_hrtimer_deactivate(timer);
+
+ /*
+ * Should not happen. Per CPU timers should be
+ * canceled _before_ the migration code is called
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+ __remove_hrtimer(timer, old_base,
+ HRTIMER_STATE_INACTIVE, 0);
+ WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+ timer, timer->function, dcpu);
+ continue;
+ }
+
+ /*
+ * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * timer could be seen as !active and just vanish away
+ * under us on another CPU
+ */
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ timer->base = new_base;
+ /*
+ * Enqueue the timer. Allow reprogramming of the event device
+ */
+ enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /*
+ * Happens with high res enabled when the timer was
+ * already expired and the callback mode is
+ * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+ * enqueue code does not move them to the soft irq
+ * pending list for performance/latency reasons, but
+ * in the migration state, we need to do that
+ * otherwise we end up with a stale timer.
+ */
+ if (timer->state == HRTIMER_STATE_MIGRATE) {
+ timer->state = HRTIMER_STATE_PENDING;
+ list_add_tail(&timer->cb_entry,
+ &new_base->cpu_base->cb_pending);
+ raise = 1;
+ }
+#endif
+ /* Clear the migration state bit */
+ timer->state &= ~HRTIMER_STATE_MIGRATE;
+ }
+ return raise;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+ struct hrtimer_cpu_base *new_base)
+{
+ struct hrtimer *timer;
+ int raise = 0;
+
+ while (!list_empty(&old_base->cb_pending)) {
+ timer = list_entry(old_base->cb_pending.next,
+ struct hrtimer, cb_entry);
+
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+ timer->base = &new_base->clock_base[timer->base->index];
+ list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+ raise = 1;
+ }
+ return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+ struct hrtimer_cpu_base *new_base)
+{
+ return 0;
+}
+#endif
+
+static void migrate_hrtimers(int cpu)
+{
+ struct hrtimer_cpu_base *old_base, *new_base;
+ int i, raise = 0;
+
+ BUG_ON(cpu_online(cpu));
+ old_base = &per_cpu(hrtimer_bases, cpu);
+ new_base = &get_cpu_var(hrtimer_bases);
+
+ tick_cancel_sched_timer(cpu);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
+ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ if (migrate_hrtimer_list(&old_base->clock_base[i],
+ &new_base->clock_base[i], cpu))
+ raise = 1;
+ }
+
+ if (migrate_hrtimer_pending(old_base, new_base))
+ raise = 1;
+
+ spin_unlock(&old_base->lock);
+ spin_unlock_irq(&new_base->lock);
+ put_cpu_var(hrtimer_bases);
+
+ if (raise)
+ hrtimer_raise_softirq();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action) {
+
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ init_hrtimers_cpu(cpu);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
+ migrate_hrtimers(cpu);
+ break;
+#endif
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata hrtimers_nb = {
+ .notifier_call = hrtimer_cpu_notify,
+};
+
+void __init hrtimers_init(void)
+{
+ hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+ register_cpu_notifier(&hrtimers_nb);
+#ifdef CONFIG_HIGH_RES_TIMERS
+ open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
+#endif
+}
+
+/**
+ * schedule_hrtimeout_range - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel gives the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, though no earlier than @expires.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least the @expires time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode)
+{
+ struct hrtimer_sleeper t;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && !expires->tv64) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ return -EINTR;
+ }
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, mode);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least the @expires time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range(expires, 0, mode);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
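+
+/*
+ * Usage sketch (hypothetical caller): sleep for roughly 10ms, letting
+ * the wakeup happen anywhere in [expires, expires + delta]:
+ *
+ *	ktime_t t = ktime_set(0, 10 * NSEC_PER_MSEC);
+ *
+ *	set_current_state(TASK_INTERRUPTIBLE);
+ *	schedule_hrtimeout_range(&t, 100 * NSEC_PER_USEC, HRTIMER_MODE_REL);
+ *
+ * The task state must be set before the call, as noted in the kernel-doc
+ * above.
+ */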
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 0000000..681c52d
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,5 @@
+
+obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
+obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 0000000..cc0f732
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,191 @@
+/*
+ * linux/kernel/irq/autoprobe.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the interrupt probing code and driver APIs.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+
+#include "internals.h"
+
+/*
+ * Autodetection depends on the fact that any interrupt that
+ * comes in on to an unassigned handler will get stuck with
+ * "IRQ_WAITING" cleared and the interrupt disabled.
+ */
+static DEFINE_MUTEX(probing_active);
+
+/**
+ * probe_irq_on - begin an interrupt autodetect
+ *
+ * Commence probing for an interrupt. The interrupts are scanned
+ * and a mask of potential interrupt lines is returned.
+ *
+ */
+unsigned long probe_irq_on(void)
+{
+ struct irq_desc *desc;
+ unsigned long mask = 0;
+ unsigned int status;
+ int i;
+
+ mutex_lock(&probing_active);
+ /*
+ * something may have generated an irq long ago and we want to
+ * flush such a longstanding irq before considering it as spurious.
+ */
+ for_each_irq_desc_reverse(i, desc) {
+ spin_lock_irq(&desc->lock);
+ if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
+ /*
+ * An old-style architecture might still have
+ * the handle_bad_irq handler there:
+ */
+ compat_irq_chip_set_default_handler(desc);
+
+ /*
+ * Some chips need to know about probing in
+ * progress:
+ */
+ if (desc->chip->set_type)
+ desc->chip->set_type(i, IRQ_TYPE_PROBE);
+ desc->chip->startup(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /* Wait for longstanding interrupts to trigger. */
+ msleep(20);
+
+ /*
+ * enable any unassigned irqs
+ * (we must startup again here because if a longstanding irq
+ * happened in the previous stage, it may have masked itself)
+ */
+ for_each_irq_desc_reverse(i, desc) {
+ spin_lock_irq(&desc->lock);
+ if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
+ desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
+ if (desc->chip->startup(i))
+ desc->status |= IRQ_PENDING;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /*
+ * Wait for spurious interrupts to trigger
+ */
+ msleep(100);
+
+ /*
+ * Now filter out any obviously spurious interrupts
+ */
+ for_each_irq_desc(i, desc) {
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ /* It triggered already - consider it spurious. */
+ if (!(status & IRQ_WAITING)) {
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->chip->shutdown(i);
+ } else
+ if (i < 32)
+ mask |= 1 << i;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ return mask;
+}
+EXPORT_SYMBOL(probe_irq_on);
+
+/**
+ * probe_irq_mask - scan a bitmap of interrupt lines
+ * @val: mask of interrupts to consider
+ *
+ * Scan the interrupt lines and return a bitmap of active
+ * autodetect interrupts. The interrupt probe logic state
+ * is then returned to its previous value.
+ *
+ * Note: we need to scan all the irqs even though we will
+ * only return autodetect irq numbers - just so that we reset
+ * them all to a known state.
+ */
+unsigned int probe_irq_mask(unsigned long val)
+{
+ unsigned int status, mask = 0;
+ struct irq_desc *desc;
+ int i;
+
+ for_each_irq_desc(i, desc) {
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (i < 16 && !(status & IRQ_WAITING))
+ mask |= 1 << i;
+
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->chip->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ mutex_unlock(&probing_active);
+
+ return mask & val;
+}
+EXPORT_SYMBOL(probe_irq_mask);
+
+/**
+ * probe_irq_off - end an interrupt autodetect
+ * @val: mask of potential interrupts (unused)
+ *
+ * Scans the unused interrupt lines and returns the line which
+ * appears to have triggered the interrupt. If no interrupt was
+ * found then zero is returned. If more than one interrupt is
+ * found then minus the first candidate is returned to indicate
+ * there is doubt.
+ *
+ * The interrupt probe logic state is returned to its previous
+ * value.
+ *
+ * BUGS: When used in a module (which arguably shouldn't happen)
+ * nothing prevents two IRQ probe callers from overlapping. The
+ * results of this are non-optimal.
+ */
+int probe_irq_off(unsigned long val)
+{
+ int i, irq_found = 0, nr_of_irqs = 0;
+ struct irq_desc *desc;
+ unsigned int status;
+
+ for_each_irq_desc(i, desc) {
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (!(status & IRQ_WAITING)) {
+ if (!nr_of_irqs)
+ irq_found = i;
+ nr_of_irqs++;
+ }
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->chip->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ mutex_unlock(&probing_active);
+
+ if (nr_of_irqs > 1)
+ irq_found = -irq_found;
+
+ return irq_found;
+}
+EXPORT_SYMBOL(probe_irq_off);
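+
+/*
+ * Typical autoprobe sequence (hypothetical driver, shown only for
+ * illustration):
+ *
+ *	unsigned long mask = probe_irq_on();
+ *	... make the device raise its interrupt ...
+ *	int irq = probe_irq_off(mask);
+ *
+ * A positive result is the detected irq line, zero means no line
+ * triggered, and a negative value means several lines triggered.
+ */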
+
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 0000000..10b5092
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,631 @@
+/*
+ * linux/kernel/irq/chip.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code, for irq-chip
+ * based architectures.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ */
+
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/**
+ * dynamic_irq_init - initialize a dynamically allocated irq
+ * @irq: irq number to initialize
+ */
+void dynamic_irq_init(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
+ return;
+ }
+
+ /* Ensure we don't have left over values from a previous use of this irq */
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->status = IRQ_DISABLED;
+ desc->chip = &no_irq_chip;
+ desc->handle_irq = handle_bad_irq;
+ desc->depth = 1;
+ desc->msi_desc = NULL;
+ desc->handler_data = NULL;
+ desc->chip_data = NULL;
+ desc->action = NULL;
+ desc->irq_count = 0;
+ desc->irqs_unhandled = 0;
+#ifdef CONFIG_SMP
+ cpus_setall(desc->affinity);
+#endif
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq: irq number to initialize
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
+ return;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (desc->action) {
+ spin_unlock_irqrestore(&desc->lock, flags);
+ WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+ irq);
+ return;
+ }
+ desc->msi_desc = NULL;
+ desc->handler_data = NULL;
+ desc->chip_data = NULL;
+ desc->handle_irq = handle_bad_irq;
+ desc->chip = &no_irq_chip;
+ desc->name = NULL;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+
+/**
+ * set_irq_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
+ */
+int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+ return -EINVAL;
+ }
+
+ if (!chip)
+ chip = &no_irq_chip;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ irq_chip_set_defaults(chip);
+ desc->chip = chip;
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_irq_chip);
+
+/**
+ * set_irq_type - set the irq trigger type for an irq
+ * @irq: irq number
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ */
+int set_irq_type(unsigned int irq, unsigned int type)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret = -ENXIO;
+
+ if (!desc) {
+ printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
+ return -ENODEV;
+ }
+
+ if (type == IRQ_TYPE_NONE)
+ return 0;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ ret = __irq_set_trigger(desc, irq, type);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(set_irq_type);
+
+/**
+ * set_irq_data - set irq type data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
+ *
+ * Set the hardware irq controller data for an irq
+ */
+int set_irq_data(unsigned int irq, void *data)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ printk(KERN_ERR
+ "Trying to install controller data for IRQ%d\n", irq);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->handler_data = data;
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(set_irq_data);
+
+/**
+ * set_irq_msi - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq
+ */
+int set_irq_msi(unsigned int irq, struct msi_desc *entry)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ printk(KERN_ERR
+ "Trying to install msi data for IRQ%d\n", irq);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->msi_desc = entry;
+ if (entry)
+ entry->irq = irq;
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+
+/**
+ * set_irq_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
+ *
+ * Set the hardware irq chip data for an irq
+ */
+int set_irq_chip_data(unsigned int irq, void *data)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ printk(KERN_ERR
+ "Trying to install chip data for IRQ%d\n", irq);
+ return -EINVAL;
+ }
+
+ if (!desc->chip) {
+ printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->chip_data = data;
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_irq_chip_data);
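+
+/*
+ * Illustrative setup sequence (hypothetical interrupt controller driver;
+ * my_irq_chip and my_chip_data are placeholders):
+ *
+ *	set_irq_chip(irq, &my_irq_chip);
+ *	set_irq_chip_data(irq, my_chip_data);
+ *	set_irq_type(irq, IRQ_TYPE_LEVEL_HIGH);
+ *
+ * Each helper takes the descriptor lock itself, so the caller needs no
+ * extra locking.
+ */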
+
+/*
+ * default enable function
+ */
+static void default_enable(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ desc->chip->unmask(irq);
+ desc->status &= ~IRQ_MASKED;
+}
+
+/*
+ * default disable function
+ */
+static void default_disable(unsigned int irq)
+{
+}
+
+/*
+ * default startup function
+ */
+static unsigned int default_startup(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ desc->chip->enable(irq);
+ return 0;
+}
+
+/*
+ * default shutdown function
+ */
+static void default_shutdown(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ desc->chip->mask(irq);
+ desc->status |= IRQ_MASKED;
+}
+
+/*
+ * Fixup enable/disable function pointers
+ */
+void irq_chip_set_defaults(struct irq_chip *chip)
+{
+ if (!chip->enable)
+ chip->enable = default_enable;
+ if (!chip->disable)
+ chip->disable = default_disable;
+ if (!chip->startup)
+ chip->startup = default_startup;
+ /*
+ * We use chip->disable when the user provided their own. When
+ * we have default_disable set for chip->disable, then we need
+ * to use default_shutdown, otherwise the irq line is not
+ * disabled on free_irq():
+ */
+ if (!chip->shutdown)
+ chip->shutdown = chip->disable != default_disable ?
+ chip->disable : default_shutdown;
+ if (!chip->name)
+ chip->name = chip->typename;
+ if (!chip->end)
+ chip->end = dummy_irq_chip.end;
+}
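/*
 * What the defaults above buy a chip driver (sketch with hypothetical
 * callbacks): only ack/mask/unmask need to be supplied; enable,
 * disable, startup and shutdown are filled in by irq_chip_set_defaults()
 * when the first handler is set up.
 */
#include <linux/irq.h>

static void my_pic_ack(unsigned int irq)    { /* clear the latch */ }
static void my_pic_mask(unsigned int irq)   { /* set the mask bit */ }
static void my_pic_unmask(unsigned int irq) { /* clear the mask bit */ }

static struct irq_chip my_pic_chip = {
	.name	= "my-pic",
	.ack	= my_pic_ack,
	.mask	= my_pic_mask,
	.unmask	= my_pic_unmask,
	/* enable/disable/startup/shutdown left NULL on purpose */
};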
+
+static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+{
+ if (desc->chip->mask_ack)
+ desc->chip->mask_ack(irq);
+ else {
+ desc->chip->mask(irq);
+ desc->chip->ack(irq);
+ }
+}
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control
+ * is necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and
+ * unmask issues if necessary.
+ */
+void
+handle_simple_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ spin_lock(&desc->lock);
+
+ if (unlikely(desc->status & IRQ_INPROGRESS))
+ goto out_unlock;
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ goto out_unlock;
+
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+out_unlock:
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_level_irq - Level type irq handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Level type interrupts are active as long as the hardware line has
+ * the active level. This may require masking the interrupt and
+ * unmasking it after the associated handler has acknowledged the
+ * device, so the interrupt line is back to inactive.
+ */
+void
+handle_level_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ spin_lock(&desc->lock);
+ mask_ack_irq(desc, irq);
+
+ if (unlikely(desc->status & IRQ_INPROGRESS))
+ goto out_unlock;
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ /*
+ * If it's disabled or no action is available,
+ * keep it masked and get out of here
+ */
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ goto out_unlock;
+
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+ desc->chip->unmask(irq);
+out_unlock:
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ spin_lock(&desc->lock);
+
+ if (unlikely(desc->status & IRQ_INPROGRESS))
+ goto out;
+
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ /*
+ * If it's disabled or no action is available,
+ * then mask it and get out of here:
+ */
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+ desc->status |= IRQ_PENDING;
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
+ goto out;
+ }
+
+ desc->status |= IRQ_INPROGRESS;
+ desc->status &= ~IRQ_PENDING;
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+out:
+ desc->chip->eoi(irq);
+
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_edge_irq - edge type IRQ handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * The interrupt occurs on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware
+ * and must be acked in order to be re-enabled. After the ack another
+ * interrupt can happen on the same source even before the first one
+ * is handled by the associated event handler. If this happens it
+ * might be necessary to disable (mask) the interrupt depending on the
+ * controller hardware. This requires re-enabling the interrupt inside
+ * the loop which handles the interrupts that arrived while the
+ * handler was running. Once all pending interrupts are handled, the
+ * loop is left.
+ */
+void
+handle_edge_irq(unsigned int irq, struct irq_desc *desc)
+{
+ spin_lock(&desc->lock);
+
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+ /*
+ * If we're currently running this IRQ, or it's disabled,
+ * we shouldn't process the IRQ. Mark it pending, handle
+ * the necessary masking and get out
+ */
+ if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
+ !desc->action)) {
+ desc->status |= (IRQ_PENDING | IRQ_MASKED);
+ mask_ack_irq(desc, irq);
+ goto out_unlock;
+ }
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ /* Start handling the irq */
+ desc->chip->ack(irq);
+
+ /* Mark the IRQ currently in progress.*/
+ desc->status |= IRQ_INPROGRESS;
+
+ do {
+ struct irqaction *action = desc->action;
+ irqreturn_t action_ret;
+
+ if (unlikely(!action)) {
+ desc->chip->mask(irq);
+ goto out_unlock;
+ }
+
+ /*
+ * When another irq arrived while we were handling
+ * one, we could have masked the irq.
+ * Re-enable it, if it was not disabled in the meantime.
+ */
+ if (unlikely((desc->status &
+ (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
+ (IRQ_PENDING | IRQ_MASKED))) {
+ desc->chip->unmask(irq);
+ desc->status &= ~IRQ_MASKED;
+ }
+
+ desc->status &= ~IRQ_PENDING;
+ spin_unlock(&desc->lock);
+ action_ret = handle_IRQ_event(irq, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ spin_lock(&desc->lock);
+
+ } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+
+ desc->status &= ~IRQ_INPROGRESS;
+out_unlock:
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_percpu_irq - Per CPU local irq handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Per CPU interrupts on SMP machines without locking requirements
+ */
+void
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
+{
+ irqreturn_t action_ret;
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
+
+ action_ret = handle_IRQ_event(irq, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ if (desc->chip->eoi)
+ desc->chip->eoi(irq);
+}
+
+void
+__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ printk(KERN_ERR
+ "Trying to install type control for IRQ%d\n", irq);
+ return;
+ }
+
+ if (!handle)
+ handle = handle_bad_irq;
+ else if (desc->chip == &no_irq_chip) {
+ printk(KERN_WARNING "Trying to install %sinterrupt handler "
+ "for IRQ%d\n", is_chained ? "chained " : "", irq);
+ /*
+ * Some ARM implementations install a handler for really dumb
+ * interrupt hardware without setting an irq_chip. This worked
+ * with the ARM no_irq_chip but the check in setup_irq would
+ * prevent us from setting up the interrupt at all. Switch it to
+ * dummy_irq_chip for easy transition.
+ */
+ desc->chip = &dummy_irq_chip;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+
+ /* Uninstall? */
+ if (handle == handle_bad_irq) {
+ if (desc->chip != &no_irq_chip)
+ mask_ack_irq(desc, irq);
+ desc->status |= IRQ_DISABLED;
+ desc->depth = 1;
+ }
+ desc->handle_irq = handle;
+ desc->name = name;
+
+ if (handle != handle_bad_irq && is_chained) {
+ desc->status &= ~IRQ_DISABLED;
+ desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
+ desc->depth = 0;
+ desc->chip->startup(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void
+set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
+ irq_flow_handler_t handle)
+{
+ set_irq_chip(irq, chip);
+ __set_irq_handler(irq, handle, 0, NULL);
+}
+
+void
+set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+ irq_flow_handler_t handle, const char *name)
+{
+ set_irq_chip(irq, chip);
+ __set_irq_handler(irq, handle, 0, name);
+}
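/*
 * Putting the pieces together (sketch): platform code binds a chip and
 * a flow handler per interrupt. MYBOARD_IRQ_BASE and myboard_pic_chip
 * are hypothetical; handle_level_irq is the flow handler defined above.
 */
#include <linux/init.h>
#include <linux/irq.h>

#define MYBOARD_IRQ_BASE	32	/* hypothetical first line */

static struct irq_chip myboard_pic_chip;	/* filled in elsewhere */

static void __init myboard_init_pic(void)
{
	int irq;

	for (irq = MYBOARD_IRQ_BASE; irq < MYBOARD_IRQ_BASE + 32; irq++)
		set_irq_chip_and_handler_name(irq, &myboard_pic_chip,
					      handle_level_irq, "level");
}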
+
+void __init set_irq_noprobe(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
+ return;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->status |= IRQ_NOPROBE;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void __init set_irq_probe(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc) {
+ printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
+ return;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->status &= ~IRQ_NOPROBE;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
new file mode 100644
index 0000000..38a25b8
--- /dev/null
+++ b/kernel/irq/devres.c
@@ -0,0 +1,90 @@
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+
+/*
+ * Device resource management aware IRQ request/free implementation.
+ */
+struct irq_devres {
+ unsigned int irq;
+ void *dev_id;
+};
+
+static void devm_irq_release(struct device *dev, void *res)
+{
+ struct irq_devres *this = res;
+
+ free_irq(this->irq, this->dev_id);
+}
+
+static int devm_irq_match(struct device *dev, void *res, void *data)
+{
+ struct irq_devres *this = res, *match = data;
+
+ return this->irq == match->irq && this->dev_id == match->dev_id;
+}
+
+/**
+ * devm_request_irq - allocate an interrupt line for a managed device
+ * @dev: device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the
+ * same arguments and performs the same function as
+ * request_irq(). IRQs requested with this function will be
+ * automatically freed on driver detach.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ */
+int devm_request_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ struct irq_devres *dr;
+ int rc;
+
+ dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
+ GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rc = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (rc) {
+ devres_free(dr);
+ return rc;
+ }
+
+ dr->irq = irq;
+ dr->dev_id = dev_id;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_irq);
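/*
 * Usage sketch for a managed driver (the device, handler and "mydrv"
 * names are assumptions). The line is freed automatically on driver
 * detach, so the remove path needs no matching free call.
 */
#include <linux/interrupt.h>
#include <linux/platform_device.h>

static irqreturn_t mydrv_isr(int irq, void *dev_id)
{
	/* acknowledge the device here */
	return IRQ_HANDLED;
}

static int __devinit mydrv_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;

	return devm_request_irq(&pdev->dev, irq, mydrv_isr, 0,
				"mydrv", pdev);
}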
+
+/**
+ * devm_free_irq - free an interrupt
+ * @dev: device to free interrupt for
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Except for the extra @dev argument, this function takes the
+ * same arguments and performs the same function as free_irq().
+ * This function should be used instead of free_irq() to manually
+ * free IRQs allocated with devm_request_irq().
+ */
+void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
+{
+ struct irq_devres match_data = { irq, dev_id };
+
+ free_irq(irq, dev_id);
+ WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
+ &match_data));
+}
+EXPORT_SYMBOL(devm_free_irq);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 0000000..c815b42
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,277 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/**
+ * handle_bad_irq - handle spurious and unhandled irqs
+ * @irq: the interrupt number
+ * @desc: description of the interrupt
+ *
+ * Handles spurious and unhandled IRQs. It also prints a debug message.
+ */
+void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+{
+ print_irq_desc(irq, desc);
+ kstat_incr_irqs_this_cpu(irq, desc);
+ ack_bad_irq(irq);
+}
+
+/*
+ * Linux has a controller-independent interrupt architecture.
+ * Every controller has a 'controller-template', that is used
+ * by the main code to do the right thing. Each driver-visible
+ * interrupt source is transparently wired to the appropriate
+ * controller. Thus drivers need not be aware of the
+ * interrupt-controller.
+ *
+ * The code is designed to be easily extended with new/different
+ * interrupt controllers, without having to do assembly magic or
+ * having to touch the generic code.
+ *
+ * Controller mappings for all interrupt sources:
+ */
+int nr_irqs = NR_IRQS;
+EXPORT_SYMBOL_GPL(nr_irqs);
+
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
+ [0 ... NR_IRQS-1] = {
+ .status = IRQ_DISABLED,
+ .chip = &no_irq_chip,
+ .handle_irq = handle_bad_irq,
+ .depth = 1,
+ .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
+#ifdef CONFIG_SMP
+ .affinity = CPU_MASK_ALL
+#endif
+ }
+};
+
+/*
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this for itself.
+ */
+static void ack_bad(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ print_irq_desc(irq, desc);
+ ack_bad_irq(irq);
+}
+
+/*
+ * NOP functions
+ */
+static void noop(unsigned int irq)
+{
+}
+
+static unsigned int noop_ret(unsigned int irq)
+{
+ return 0;
+}
+
+/*
+ * Generic no controller implementation
+ */
+struct irq_chip no_irq_chip = {
+ .name = "none",
+ .startup = noop_ret,
+ .shutdown = noop,
+ .enable = noop,
+ .disable = noop,
+ .ack = ack_bad,
+ .end = noop,
+};
+
+/*
+ * Generic dummy implementation which can be used for
+ * real dumb interrupt sources
+ */
+struct irq_chip dummy_irq_chip = {
+ .name = "dummy",
+ .startup = noop_ret,
+ .shutdown = noop,
+ .enable = noop,
+ .disable = noop,
+ .ack = noop,
+ .mask = noop,
+ .unmask = noop,
+ .end = noop,
+};
+
+/*
+ * Special, empty irq handler:
+ */
+irqreturn_t no_action(int cpl, void *dev_id)
+{
+ return IRQ_NONE;
+}
+
+/**
+ * handle_IRQ_event - irq action chain handler
+ * @irq: the interrupt number
+ * @action: the interrupt action chain for this irq
+ *
+ * Handles the action chain of an irq event
+ */
+irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
+{
+ irqreturn_t ret, retval = IRQ_NONE;
+ unsigned int status = 0;
+
+ if (!(action->flags & IRQF_DISABLED))
+ local_irq_enable_in_hardirq();
+
+ do {
+ ret = action->handler(irq, action->dev_id);
+ if (ret == IRQ_HANDLED)
+ status |= action->flags;
+ retval |= ret;
+ action = action->next;
+ } while (action);
+
+ if (status & IRQF_SAMPLE_RANDOM)
+ add_interrupt_randomness(irq);
+ local_irq_disable();
+
+ return retval;
+}
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+/**
+ * __do_IRQ - original all in one highlevel IRQ handler
+ * @irq: the interrupt number
+ *
+ * __do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ *
+ * This is the original x86 implementation which is used for every
+ * interrupt type.
+ */
+unsigned int __do_IRQ(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned int status;
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ if (CHECK_IRQ_PER_CPU(desc->status)) {
+ irqreturn_t action_ret;
+
+ /*
+ * No locking required for CPU-local interrupts:
+ */
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
+ if (likely(!(desc->status & IRQ_DISABLED))) {
+ action_ret = handle_IRQ_event(irq, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ }
+ desc->chip->end(irq);
+ return 1;
+ }
+
+ spin_lock(&desc->lock);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
+ /*
+ * REPLAY is when Linux resends an IRQ that was dropped earlier
+ * WAITING is used by probe to mark irqs that are being tested
+ */
+ status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+ status |= IRQ_PENDING; /* we _want_ to handle it */
+
+ /*
+ * If the IRQ is disabled for whatever reason, we cannot
+ * use the action we have.
+ */
+ action = NULL;
+ if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
+ action = desc->action;
+ status &= ~IRQ_PENDING; /* we commit to handling */
+ status |= IRQ_INPROGRESS; /* we are handling it */
+ }
+ desc->status = status;
+
+ /*
+ * If there is no IRQ handler or it was disabled, exit early.
+ * Since we set PENDING, if another processor is handling
+ * a different instance of this same irq, the other processor
+ * will take care of it.
+ */
+ if (unlikely(!action))
+ goto out;
+
+ /*
+ * Edge triggered interrupts need to remember
+ * pending events.
+ * This applies to any hw interrupts that allow a second
+ * instance of the same irq to arrive while we are in do_IRQ
+ * or in the handler. But the code here only handles the _second_
+ * instance of the irq, not the third or fourth. So it is mostly
+ * useful for irq hardware that does not mask cleanly in an
+ * SMP environment.
+ */
+ for (;;) {
+ irqreturn_t action_ret;
+
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock(&desc->lock);
+ if (likely(!(desc->status & IRQ_PENDING)))
+ break;
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+
+out:
+ /*
+ * The ->end() handler has to deal with interrupts which got
+ * disabled while the handler was running.
+ */
+ desc->chip->end(irq);
+ spin_unlock(&desc->lock);
+
+ return 1;
+}
+#endif
+
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
+void early_init_irq_lock_class(void)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for_each_irq_desc(i, desc)
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+}
+#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 0000000..64c1c72
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,69 @@
+/*
+ * IRQ subsystem internal functions and variables:
+ */
+
+extern int noirqdebug;
+
+/* Set default functions for irq_chip structures: */
+extern void irq_chip_set_defaults(struct irq_chip *chip);
+
+/* Set default handler: */
+extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
+
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
+ unsigned long flags);
+
+#ifdef CONFIG_PROC_FS
+extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
+extern void register_handler_proc(unsigned int irq, struct irqaction *action);
+extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+#else
+static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
+static inline void register_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+static inline void unregister_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+#endif
+
+extern int irq_select_affinity_usr(unsigned int irq);
+
+/*
+ * Debugging printout:
+ */
+
+#include <linux/kallsyms.h>
+
+#define P(f) if (desc->status & f) printk("%14s set\n", #f)
+
+static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+ printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
+ irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
+ printk("->handle_irq(): %p, ", desc->handle_irq);
+ print_symbol("%s\n", (unsigned long)desc->handle_irq);
+ printk("->chip(): %p, ", desc->chip);
+ print_symbol("%s\n", (unsigned long)desc->chip);
+ printk("->action(): %p\n", desc->action);
+ if (desc->action) {
+ printk("->action->handler(): %p, ", desc->action->handler);
+ print_symbol("%s\n", (unsigned long)desc->action->handler);
+ }
+
+ P(IRQ_INPROGRESS);
+ P(IRQ_DISABLED);
+ P(IRQ_PENDING);
+ P(IRQ_REPLAY);
+ P(IRQ_AUTODETECT);
+ P(IRQ_WAITING);
+ P(IRQ_LEVEL);
+ P(IRQ_MASKED);
+#ifdef CONFIG_IRQ_PER_CPU
+ P(IRQ_PER_CPU);
+#endif
+ P(IRQ_NOPROBE);
+ P(IRQ_NOREQUEST);
+ P(IRQ_NOAUTOEN);
+}
+
+#undef P
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 0000000..801addd
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,736 @@
+/*
+ * linux/kernel/irq/manage.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006 Thomas Gleixner
+ *
+ * This file contains driver APIs to the irq subsystem.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_SMP
+
+cpumask_t irq_default_affinity = CPU_MASK_ALL;
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned int status;
+
+ if (!desc)
+ return;
+
+ do {
+ unsigned long flags;
+
+ /*
+ * Wait until we're out of the critical section. This might
+ * give the wrong answer due to the lack of memory barriers.
+ */
+ while (desc->status & IRQ_INPROGRESS)
+ cpu_relax();
+
+ /* Ok, that indicated we're done: double-check carefully. */
+ spin_lock_irqsave(&desc->lock, flags);
+ status = desc->status;
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ /* Oops, that failed? */
+ } while (status & IRQ_INPROGRESS);
+}
+EXPORT_SYMBOL(synchronize_irq);
+
+/**
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
+ *
+ */
+int irq_can_set_affinity(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
+ !desc->chip->set_affinity)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ */
+int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc->chip->set_affinity)
+ return -EINVAL;
+
+ spin_lock_irqsave(&desc->lock, flags);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
+ desc->affinity = cpumask;
+ desc->chip->set_affinity(irq, cpumask);
+ } else {
+ desc->status |= IRQ_MOVE_PENDING;
+ desc->pending_mask = cpumask;
+ }
+#else
+ desc->affinity = cpumask;
+ desc->chip->set_affinity(irq, cpumask);
+#endif
+ desc->status |= IRQ_AFFINITY_SET;
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
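/*
 * Sketch: built-in code may pin a line to CPU 0 like this; modules and
 * userspace go through /proc/irq/<n>/smp_affinity instead, since
 * irq_set_affinity() is not exported. MY_IRQ is a hypothetical number.
 */
#include <linux/init.h>
#include <linux/irq.h>

#define MY_IRQ	16	/* hypothetical line number */

static void __init myboard_pin_irq(void)
{
	if (irq_can_set_affinity(MY_IRQ))
		irq_set_affinity(MY_IRQ, cpumask_of_cpu(0));
}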
+
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+/*
+ * Generic version of the affinity autoselector.
+ */
+int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
+{
+ cpumask_t mask;
+
+ if (!irq_can_set_affinity(irq))
+ return 0;
+
+ cpus_and(mask, cpu_online_map, irq_default_affinity);
+
+ /*
+ * Preserve a userspace affinity setup, but make sure that
+ * one of the targets is online.
+ */
+ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
+ if (cpus_intersects(desc->affinity, cpu_online_map))
+ mask = desc->affinity;
+ else
+ desc->status &= ~IRQ_AFFINITY_SET;
+ }
+
+ desc->affinity = mask;
+ desc->chip->set_affinity(irq, mask);
+
+ return 0;
+}
+#else
+static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+{
+ return irq_select_affinity(irq);
+}
+#endif
+
+/*
+ * Called when affinity is set via /proc/irq
+ */
+int irq_select_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ ret = do_irq_select_affinity(irq, desc);
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return ret;
+}
+
+#else
+static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+{
+ return 0;
+}
+#endif
+
+/**
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ *
+ * This function may be called from IRQ context.
+ */
+void disable_irq_nosync(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc)
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (!desc->depth++) {
+ desc->status |= IRQ_DISABLED;
+ desc->chip->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL(disable_irq_nosync);
+
+/**
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void disable_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return;
+
+ disable_irq_nosync(irq);
+ if (desc->action)
+ synchronize_irq(irq);
+}
+EXPORT_SYMBOL(disable_irq);
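/*
 * Sketch of the nested disable/enable discipline described above: every
 * disable_irq() must be balanced by exactly one enable_irq(). The
 * "mydev" structure is made up for illustration.
 */
#include <linux/interrupt.h>

struct mydev {
	unsigned int irq;
};

static void mydev_reconfigure(struct mydev *dev)
{
	disable_irq(dev->irq);	/* waits for running handlers to finish */
	/* ... reprogram the hardware while the line is quiesced ... */
	enable_irq(dev->irq);
}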
+
+static void __enable_irq(struct irq_desc *desc, unsigned int irq)
+{
+ switch (desc->depth) {
+ case 0:
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ break;
+ case 1: {
+ unsigned int status = desc->status & ~IRQ_DISABLED;
+
+ /* Prevent probing on this irq: */
+ desc->status = status | IRQ_NOPROBE;
+ check_irq_resend(desc, irq);
+ /* fall-through */
+ }
+ default:
+ desc->depth--;
+ }
+}
+
+/**
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
+ *
+ * Undoes the effect of one call to disable_irq(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ *
+ * This function may be called from IRQ context.
+ */
+void enable_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc)
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ __enable_irq(desc, irq);
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL(enable_irq);
+
+static int set_irq_wake_real(unsigned int irq, unsigned int on)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int ret = -ENXIO;
+
+ if (desc->chip->set_wake)
+ ret = desc->chip->set_wake(irq, on);
+
+ return ret;
+}
+
+/**
+ * set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is
+ * disabled by default. Enables and disables must match,
+ * just as they match for non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep
+ * states like "suspend to RAM".
+ */
+int set_irq_wake(unsigned int irq, unsigned int on)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret = 0;
+
+ /* wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ spin_lock_irqsave(&desc->lock, flags);
+ if (on) {
+ if (desc->wake_depth++ == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 0;
+ else
+ desc->status |= IRQ_WAKEUP;
+ }
+ } else {
+ if (desc->wake_depth == 0) {
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
+ } else if (--desc->wake_depth == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 1;
+ else
+ desc->status &= ~IRQ_WAKEUP;
+ }
+ }
+
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(set_irq_wake);
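/*
 * Sketch: a driver's suspend/resume hooks can arm its line as a wakeup
 * source. enable_irq_wake()/disable_irq_wake() in <linux/interrupt.h>
 * are thin wrappers around set_irq_wake(). The "mydev" structure is the
 * hypothetical one from the earlier disable_irq() sketch.
 */
#include <linux/interrupt.h>
#include <linux/platform_device.h>

static int mydev_suspend(struct platform_device *pdev, pm_message_t state)
{
	struct mydev *dev = platform_get_drvdata(pdev);

	if (device_may_wakeup(&pdev->dev))
		enable_irq_wake(dev->irq);
	return 0;
}

static int mydev_resume(struct platform_device *pdev)
{
	struct mydev *dev = platform_get_drvdata(pdev);

	if (device_may_wakeup(&pdev->dev))
		disable_irq_wake(dev->irq);
	return 0;
}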
+
+/*
+ * Internal function that tells the architecture code whether a
+ * particular irq has been exclusively allocated or is available
+ * for driver use.
+ */
+int can_request_irq(unsigned int irq, unsigned long irqflags)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+
+ if (!desc)
+ return 0;
+
+ if (desc->status & IRQ_NOREQUEST)
+ return 0;
+
+ action = desc->action;
+ if (action)
+ if (irqflags & action->flags & IRQF_SHARED)
+ action = NULL;
+
+ return !action;
+}
+
+void compat_irq_chip_set_default_handler(struct irq_desc *desc)
+{
+ /*
+ * If the architecture still has not overridden
+ * the flow handler then zap the default. This
+ * should catch incorrect flow-type setting.
+ */
+ if (desc->handle_irq == &handle_bad_irq)
+ desc->handle_irq = NULL;
+}
+
+int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
+ unsigned long flags)
+{
+ int ret;
+ struct irq_chip *chip = desc->chip;
+
+ if (!chip || !chip->set_type) {
+ /*
+ * IRQF_TRIGGER_* but the PIC does not support multiple
+ * flow-types?
+ */
+ pr_debug("No set_type function for IRQ %d (%s)\n", irq,
+ chip ? (chip->name ? : "unknown") : "unknown");
+ return 0;
+ }
+
+ ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
+
+ if (ret)
+ pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
+ (int)(flags & IRQF_TRIGGER_MASK),
+ irq, chip->set_type);
+ else {
+ /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
+ desc->status &= ~IRQ_TYPE_SENSE_MASK;
+ desc->status |= flags & IRQ_TYPE_SENSE_MASK;
+ }
+
+ return ret;
+}
+
+/*
+ * Internal function to register an irqaction - typically used to
+ * allocate special interrupts that are part of the architecture.
+ */
+static int
+__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
+{
+ struct irqaction *old, **p;
+ const char *old_name = NULL;
+ unsigned long flags;
+ int shared = 0;
+ int ret;
+
+ if (!desc)
+ return -EINVAL;
+
+ if (desc->chip == &no_irq_chip)
+ return -ENOSYS;
+ /*
+ * Some drivers like serial.c use request_irq() heavily,
+ * so we have to be careful not to interfere with a
+ * running system.
+ */
+ if (new->flags & IRQF_SAMPLE_RANDOM) {
+ /*
+ * This function might sleep, we want to call it first,
+ * outside of the atomic block.
+ * Yes, this might clear the entropy pool if the wrong
+ * driver is attempted to be loaded without actually
+ * installing a new handler, but is that really a problem,
+ * given that only the sysadmin is able to do this?
+ */
+ rand_initialize_irq(irq);
+ }
+
+ /*
+ * The following block of code has to be executed atomically
+ */
+ spin_lock_irqsave(&desc->lock, flags);
+ p = &desc->action;
+ old = *p;
+ if (old) {
+ /*
+ * Can't share interrupts unless both agree to and are
+ * the same type (level, edge, polarity). So both flag
+ * fields must have IRQF_SHARED set and the bits which
+ * set the trigger type must match.
+ */
+ if (!((old->flags & new->flags) & IRQF_SHARED) ||
+ ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+ old_name = old->name;
+ goto mismatch;
+ }
+
+#if defined(CONFIG_IRQ_PER_CPU)
+ /* All handlers must agree on per-cpuness */
+ if ((old->flags & IRQF_PERCPU) !=
+ (new->flags & IRQF_PERCPU))
+ goto mismatch;
+#endif
+
+ /* add new interrupt at end of irq queue */
+ do {
+ p = &old->next;
+ old = *p;
+ } while (old);
+ shared = 1;
+ }
+
+ if (!shared) {
+ irq_chip_set_defaults(desc->chip);
+
+ /* Setup the type (level, edge polarity) if configured: */
+ if (new->flags & IRQF_TRIGGER_MASK) {
+ ret = __irq_set_trigger(desc, irq, new->flags);
+
+ if (ret) {
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+ }
+ } else
+ compat_irq_chip_set_default_handler(desc);
+#if defined(CONFIG_IRQ_PER_CPU)
+ if (new->flags & IRQF_PERCPU)
+ desc->status |= IRQ_PER_CPU;
+#endif
+
+ desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+ IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+
+ if (!(desc->status & IRQ_NOAUTOEN)) {
+ desc->depth = 0;
+ desc->status &= ~IRQ_DISABLED;
+ desc->chip->startup(irq);
+ } else
+ /* Undo nested disables: */
+ desc->depth = 1;
+
+ /* Exclude IRQ from balancing if requested */
+ if (new->flags & IRQF_NOBALANCING)
+ desc->status |= IRQ_NO_BALANCING;
+
+ /* Set default affinity mask once everything is setup */
+ do_irq_select_affinity(irq, desc);
+
+ } else if ((new->flags & IRQF_TRIGGER_MASK)
+ && (new->flags & IRQF_TRIGGER_MASK)
+ != (desc->status & IRQ_TYPE_SENSE_MASK)) {
+ /* hope the handler works with the actual trigger mode... */
+ pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
+ irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
+ (int)(new->flags & IRQF_TRIGGER_MASK));
+ }
+
+ *p = new;
+
+ /* Reset broken irq detection when installing new handler */
+ desc->irq_count = 0;
+ desc->irqs_unhandled = 0;
+
+ /*
+ * Check whether we disabled the irq via the spurious handler
+ * before. Reenable it and give it another chance.
+ */
+ if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
+ desc->status &= ~IRQ_SPURIOUS_DISABLED;
+ __enable_irq(desc, irq);
+ }
+
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ new->irq = irq;
+ register_irq_proc(irq, desc);
+ new->dir = NULL;
+ register_handler_proc(irq, new);
+
+ return 0;
+
+mismatch:
+#ifdef CONFIG_DEBUG_SHIRQ
+ if (!(new->flags & IRQF_PROBE_SHARED)) {
+ printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
+ if (old_name)
+ printk(KERN_ERR "current handler: %s\n", old_name);
+ dump_stack();
+ }
+#endif
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return -EBUSY;
+}
+
+/**
+ * setup_irq - setup an interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
+ *
+ * Used to statically setup interrupts in the early boot process.
+ */
+int setup_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return __setup_irq(irq, desc, act);
+}
+
+/**
+ * free_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction **p;
+ unsigned long flags;
+
+ WARN_ON(in_interrupt());
+
+ if (!desc)
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ p = &desc->action;
+ for (;;) {
+ struct irqaction *action = *p;
+
+ if (action) {
+ struct irqaction **pp = p;
+
+ p = &action->next;
+ if (action->dev_id != dev_id)
+ continue;
+
+ /* Found it - now remove it from the list of entries */
+ *pp = action->next;
+
+ /* Currently used only by UML, might disappear one day. */
+#ifdef CONFIG_IRQ_RELEASE_METHOD
+ if (desc->chip->release)
+ desc->chip->release(irq, dev_id);
+#endif
+
+ if (!desc->action) {
+ desc->status |= IRQ_DISABLED;
+ if (desc->chip->shutdown)
+ desc->chip->shutdown(irq);
+ else
+ desc->chip->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU */
+ synchronize_irq(irq);
+#ifdef CONFIG_DEBUG_SHIRQ
+ /*
+ * It's a shared IRQ -- the driver ought to be
+ * prepared for it to happen even now that it's
+ * being freed, so let's make sure.... We do
+ * this after actually deregistering it, to
+ * make sure that a 'real' IRQ doesn't run in
+ * parallel with our fake
+ */
+ if (action->flags & IRQF_SHARED) {
+ local_irq_save(flags);
+ action->handler(irq, dev_id);
+ local_irq_restore(flags);
+ }
+#endif
+ kfree(action);
+ return;
+ }
+ printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
+#ifdef CONFIG_DEBUG_SHIRQ
+ dump_stack();
+#endif
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return;
+ }
+}
+EXPORT_SYMBOL(free_irq);
+
+/**
+ * request_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. From the point this
+ * call is made your handler function may be invoked. Since
+ * your handler function must clear any interrupt the board
+ * raises, you must take care both to initialise your hardware
+ * and to set up the interrupt handler in the right order.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id
+ * as this is required when freeing the interrupt.
+ *
+ * Flags:
+ *
+ * IRQF_SHARED Interrupt is shared
+ * IRQF_DISABLED Disable local interrupts while processing
+ * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
+ * IRQF_TRIGGER_* Specify active edge(s) or level
+ *
+ */
+int request_irq(unsigned int irq, irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ int retval;
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * Lockdep wants atomic interrupt handlers:
+ */
+ irqflags |= IRQF_DISABLED;
+#endif
+ /*
+ * Sanity-check: shared interrupts must pass in a real dev-ID,
+ * otherwise we'll have trouble later trying to figure out
+ * which interrupt is which (messes up the interrupt freeing
+ * logic etc).
+ */
+ if ((irqflags & IRQF_SHARED) && !dev_id)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ return -EINVAL;
+
+ if (desc->status & IRQ_NOREQUEST)
+ return -EINVAL;
+ if (!handler)
+ return -EINVAL;
+
+ action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags;
+ cpus_clear(action->mask);
+ action->name = devname;
+ action->next = NULL;
+ action->dev_id = dev_id;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ kfree(action);
+
+#ifdef CONFIG_DEBUG_SHIRQ
+ if (irqflags & IRQF_SHARED) {
+ /*
+ * It's a shared IRQ -- the driver ought to be prepared for it
+ * to happen immediately, so let's make sure....
+ * We disable the irq to make sure that a 'real' IRQ doesn't
+ * run in parallel with our fake.
+ */
+ unsigned long flags;
+
+ disable_irq(irq);
+ local_irq_save(flags);
+
+ handler(irq, dev_id);
+
+ local_irq_restore(flags);
+ enable_irq(irq);
+ }
+#endif
+ return retval;
+}
+EXPORT_SYMBOL(request_irq);
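/*
 * End-to-end sketch of the API documented above (all "mycard" names are
 * made up). A shared handler must check whether its own device raised
 * the interrupt and return IRQ_NONE otherwise, so the spurious-IRQ
 * accounting in note_interrupt() stays meaningful.
 */
#include <linux/interrupt.h>

struct mycard {
	unsigned int irq;
};

static int mycard_irq_pending(struct mycard *card)
{
	return 1;	/* hypothetical status-register check */
}

static irqreturn_t mycard_isr(int irq, void *dev_id)
{
	struct mycard *card = dev_id;

	if (!mycard_irq_pending(card))
		return IRQ_NONE;
	/* ... service the device ... */
	return IRQ_HANDLED;
}

static int mycard_open(struct mycard *card)
{
	return request_irq(card->irq, mycard_isr, IRQF_SHARED,
			   "mycard", card);
}

static void mycard_close(struct mycard *card)
{
	free_irq(card->irq, card);	/* same dev_id as request_irq() */
}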
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 0000000..9db681d
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,64 @@
+
+#include <linux/irq.h>
+
+void move_masked_irq(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ cpumask_t tmp;
+
+ if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ return;
+
+ /*
+ * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
+ */
+ if (CHECK_IRQ_PER_CPU(desc->status)) {
+ WARN_ON(1);
+ return;
+ }
+
+ desc->status &= ~IRQ_MOVE_PENDING;
+
+ if (unlikely(cpus_empty(desc->pending_mask)))
+ return;
+
+ if (!desc->chip->set_affinity)
+ return;
+
+ assert_spin_locked(&desc->lock);
+
+ cpus_and(tmp, desc->pending_mask, cpu_online_map);
+
+ /*
+ * If there was a valid mask to work with, please
+ * do the disable, re-program, enable sequence.
+ * This is *not* particularly important for level triggered
+ * interrupts, but in an edge trigger case we might be setting
+ * the RTE when an active trigger is coming in. This could
+ * cause some ioapics to malfunction.
+ * Being paranoid, I guess!
+ *
+ * For correct operation this depends on the caller
+ * masking the irqs.
+ */
+ if (likely(!cpus_empty(tmp))) {
+ desc->chip->set_affinity(irq, tmp);
+ }
+ cpus_clear(desc->pending_mask);
+}
+
+void move_native_irq(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ return;
+
+ if (unlikely(desc->status & IRQ_DISABLED))
+ return;
+
+ desc->chip->mask(irq);
+ move_masked_irq(irq);
+ desc->chip->unmask(irq);
+}
+
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 0000000..d257e7d
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,249 @@
+/*
+ * linux/kernel/irq/proc.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the /proc/irq/ handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+static struct proc_dir_entry *root_irq_dir;
+
+#ifdef CONFIG_SMP
+
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ cpumask_t *mask = &desc->affinity;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (desc->status & IRQ_MOVE_PENDING)
+ mask = &desc->pending_mask;
+#endif
+ seq_cpumask(m, mask);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+#ifndef is_affinity_mask_valid
+#define is_affinity_mask_valid(val) 1
+#endif
+
+int no_irq_affinity;
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ cpumask_t new_value;
+ int err;
+
+ if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
+ irq_balancing_disabled(irq))
+ return -EIO;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ return err;
+
+ if (!is_affinity_mask_valid(new_value))
+ return -EINVAL;
+
+ /*
+ * Do not allow disabling IRQs completely - it's all too easy a
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ if (!cpus_intersects(new_value, cpu_online_map))
+ /* Special case for empty set - allow the architecture
+ code to set default SMP affinity. */
+ return irq_select_affinity_usr(irq) ? -EINVAL : count;
+
+ irq_set_affinity(irq, new_value);
+
+ return count;
+}
+
+static int irq_affinity_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations irq_affinity_proc_fops = {
+ .open = irq_affinity_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_proc_write,
+};
+
+static int default_affinity_show(struct seq_file *m, void *v)
+{
+ seq_cpumask(m, &irq_default_affinity);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static ssize_t default_affinity_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ cpumask_t new_value;
+ int err;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ return err;
+
+ if (!is_affinity_mask_valid(new_value))
+ return -EINVAL;
+
+ /*
+ * Do not allow disabling IRQs completely - it's all too easy a
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ if (!cpus_intersects(new_value, cpu_online_map))
+ return -EINVAL;
+
+ irq_default_affinity = new_value;
+
+ return count;
+}
+
+static int default_affinity_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, default_affinity_show, NULL);
+}
+
+static const struct file_operations default_affinity_proc_fops = {
+ .open = default_affinity_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = default_affinity_write,
+};
+#endif
+
+static int irq_spurious_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct irq_desc *desc = irq_to_desc((long) data);
+ return sprintf(page, "count %u\n"
+ "unhandled %u\n"
+ "last_unhandled %u ms\n",
+ desc->irq_count,
+ desc->irqs_unhandled,
+ jiffies_to_msecs(desc->last_unhandled));
+}
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action ; action; action = action->next) {
+ if ((action != new_action) && action->name &&
+ !strcmp(new_action->name, action->name)) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ char name[MAX_NAMELEN];
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc->dir || action->dir || !action->name ||
+ !name_unique(irq, action))
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+ /* create /proc/irq/1234/handler/ */
+ action->dir = proc_mkdir(name, desc->dir);
+}
+
+#undef MAX_NAMELEN
+
+#define MAX_NAMELEN 10
+
+void register_irq_proc(unsigned int irq, struct irq_desc *desc)
+{
+ char name[MAX_NAMELEN];
+ struct proc_dir_entry *entry;
+
+ if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%d", irq);
+
+ /* create /proc/irq/1234 */
+ desc->dir = proc_mkdir(name, root_irq_dir);
+
+#ifdef CONFIG_SMP
+ /* create /proc/irq/<irq>/smp_affinity */
+ proc_create_data("smp_affinity", 0600, desc->dir,
+ &irq_affinity_proc_fops, (void *)(long)irq);
+#endif
+
+ entry = create_proc_entry("spurious", 0444, desc->dir);
+ if (entry) {
+ entry->data = (void *)(long)irq;
+ entry->read_proc = irq_spurious_read;
+ }
+}
+
+#undef MAX_NAMELEN
+
+void unregister_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ if (action->dir) {
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ remove_proc_entry(action->dir->name, desc->dir);
+ }
+}
+
+static void register_default_affinity_proc(void)
+{
+#ifdef CONFIG_SMP
+ proc_create("irq/default_smp_affinity", 0600, NULL,
+ &default_affinity_proc_fops);
+#endif
+}
+
+void init_irq_proc(void)
+{
+ unsigned int irq;
+ struct irq_desc *desc;
+
+ /* create /proc/irq */
+ root_irq_dir = proc_mkdir("irq", NULL);
+ if (!root_irq_dir)
+ return;
+
+ register_default_affinity_proc();
+
+ /*
+ * Create entries for all existing IRQs.
+ */
+ for_each_irq_desc(irq, desc)
+ register_irq_proc(irq, desc);
+}
+
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 0000000..89c7117
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,82 @@
+/*
+ * linux/kernel/irq/resend.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner
+ *
+ * This file contains the IRQ-resend code
+ *
+ * If the interrupt is waiting to be processed, we try to re-run it.
+ * We can't directly run it from here since the caller might be in an
+ * interrupt-protected region. Not all irq controller chips can
+ * retrigger interrupts at the hardware level, so in those cases
+ * we allow the resending of IRQs via a tasklet.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+
+/* Bitmap to handle software resend of interrupts: */
+static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+
+/*
+ * Run software resends of IRQ's
+ */
+static void resend_irqs(unsigned long arg)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ while (!bitmap_empty(irqs_resend, nr_irqs)) {
+ irq = find_first_bit(irqs_resend, nr_irqs);
+ clear_bit(irq, irqs_resend);
+ desc = irq_to_desc(irq);
+ local_irq_disable();
+ desc->handle_irq(irq, desc);
+ local_irq_enable();
+ }
+}
+
+/* Tasklet to handle resend: */
+static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+
+#endif
+
+/*
+ * IRQ resend
+ *
+ * Is called with interrupts disabled and desc->lock held.
+ */
+void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+{
+ unsigned int status = desc->status;
+
+ /*
+ * Make sure the interrupt is enabled, before resending it:
+ */
+ desc->chip->enable(irq);
+
+ /*
+ * We do not resend level type interrupts. Level type
+ * interrupts are resent by hardware when they are still
+ * active.
+ */
+ if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+ desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
+
+ if (!desc->chip || !desc->chip->retrigger ||
+ !desc->chip->retrigger(irq)) {
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+ /* Set it pending and activate the softirq: */
+ set_bit(irq, irqs_resend);
+ tasklet_schedule(&resend_tasklet);
+#endif
+ }
+ }
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 0000000..dd364c1
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,300 @@
+/*
+ * linux/kernel/irq/spurious.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains spurious interrupt handling.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/moduleparam.h>
+#include <linux/timer.h>
+
+static int irqfixup __read_mostly;
+
+#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
+static void poll_spurious_irqs(unsigned long dummy);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+
+/*
+ * Recovery handler for misrouted interrupts.
+ */
+static int try_one_irq(int irq, struct irq_desc *desc)
+{
+ struct irqaction *action;
+ int ok = 0, work = 0;
+
+ spin_lock(&desc->lock);
+ /* Already running on another processor */
+ if (desc->status & IRQ_INPROGRESS) {
+ /*
+ * Already running: If it is shared get the other
+ * CPU to go looking for our mystery interrupt too
+ */
+ if (desc->action && (desc->action->flags & IRQF_SHARED))
+ desc->status |= IRQ_PENDING;
+ spin_unlock(&desc->lock);
+ return ok;
+ }
+ /* Honour the normal IRQ locking */
+ desc->status |= IRQ_INPROGRESS;
+ action = desc->action;
+ spin_unlock(&desc->lock);
+
+ while (action) {
+ /* Only shared IRQ handlers are safe to call */
+ if (action->flags & IRQF_SHARED) {
+ if (action->handler(irq, action->dev_id) ==
+ IRQ_HANDLED)
+ ok = 1;
+ }
+ action = action->next;
+ }
+ local_irq_disable();
+ /* Now clean up the flags */
+ spin_lock(&desc->lock);
+ action = desc->action;
+
+ /*
+ * While we were looking for a fixup someone queued a real
+ * IRQ clashing with our walk:
+ */
+ while ((desc->status & IRQ_PENDING) && action) {
+ /*
+ * Perform real IRQ processing for the IRQ we deferred
+ */
+ work = 1;
+ spin_unlock(&desc->lock);
+ handle_IRQ_event(irq, action);
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+ /*
+ * If we did actual work for the real IRQ line we must let the
+ * IRQ controller clean up too
+ */
+ if (work && desc->chip && desc->chip->end)
+ desc->chip->end(irq);
+ spin_unlock(&desc->lock);
+
+ return ok;
+}
+
+static int misrouted_irq(int irq)
+{
+ struct irq_desc *desc;
+ int i, ok = 0;
+
+ for_each_irq_desc(i, desc) {
+ if (!i)
+ continue;
+
+ if (i == irq) /* Already tried */
+ continue;
+
+ if (try_one_irq(i, desc))
+ ok = 1;
+ }
+ /* So the caller can adjust the irq error counts */
+ return ok;
+}
+
+static void poll_spurious_irqs(unsigned long dummy)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for_each_irq_desc(i, desc) {
+ unsigned int status;
+
+ if (!i)
+ continue;
+
+ /* Racy but it doesn't matter */
+ status = desc->status;
+ barrier();
+ if (!(status & IRQ_SPURIOUS_DISABLED))
+ continue;
+
+ try_one_irq(i, desc);
+ }
+
+ mod_timer(&poll_spurious_irq_timer,
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+}
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled
+ * then assume that the IRQ is stuck in some manner. Drop a diagnostic
+ * and try to turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly
+ * functioning device sharing an IRQ with the failing one)
+ *
+ * Called under desc->lock
+ */
+
+static void
+__report_bad_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
+{
+ struct irqaction *action;
+
+ if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+ printk(KERN_ERR "irq event %d: bogus return value %x\n",
+ irq, action_ret);
+ } else {
+ printk(KERN_ERR "irq %d: nobody cared (try booting with "
+ "the \"irqpoll\" option)\n", irq);
+ }
+ dump_stack();
+ printk(KERN_ERR "handlers:\n");
+
+ action = desc->action;
+ while (action) {
+ printk(KERN_ERR "[<%p>]", action->handler);
+ print_symbol(" (%s)",
+ (unsigned long)action->handler);
+ printk("\n");
+ action = action->next;
+ }
+}
+
+static void
+report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+{
+ static int count = 100;
+
+ if (count > 0) {
+ count--;
+ __report_bad_irq(irq, desc, action_ret);
+ }
+}
+
+static inline int
+try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
+{
+ struct irqaction *action;
+
+ if (!irqfixup)
+ return 0;
+
+ /* We didn't actually handle the IRQ - see if it was misrouted? */
+ if (action_ret == IRQ_NONE)
+ return 1;
+
+ /*
+ * But for 'irqfixup == 2' we also do it for handled interrupts if
+ * they are marked as IRQF_IRQPOLL (or for irq zero, which is the
+ * traditional PC timer interrupt - a legacy case).
+ */
+ if (irqfixup < 2)
+ return 0;
+
+ if (!irq)
+ return 1;
+
+ /*
+ * Since we don't get the descriptor lock, "action" can
+ * change under us. We don't really care, but we don't
+ * want to follow a NULL pointer. So tell the compiler to
+ * just load it once by using a barrier.
+ */
+ action = desc->action;
+ barrier();
+ return action && (action->flags & IRQF_IRQPOLL);
+}
+
+void note_interrupt(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
+{
+ if (unlikely(action_ret != IRQ_HANDLED)) {
+ /*
+ * If we are seeing only the odd spurious IRQ caused by
+ * bus asynchronicity then don't eventually trigger an error,
+ * otherwise the counter becomes a doomsday timer for otherwise
+ * working systems
+ */
+ if (time_after(jiffies, desc->last_unhandled + HZ/10))
+ desc->irqs_unhandled = 1;
+ else
+ desc->irqs_unhandled++;
+ desc->last_unhandled = jiffies;
+ if (unlikely(action_ret != IRQ_NONE))
+ report_bad_irq(irq, desc, action_ret);
+ }
+
+ if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
+ int ok = misrouted_irq(irq);
+ if (action_ret == IRQ_NONE)
+ desc->irqs_unhandled -= ok;
+ }
+
+ desc->irq_count++;
+ if (likely(desc->irq_count < 100000))
+ return;
+
+ desc->irq_count = 0;
+ if (unlikely(desc->irqs_unhandled > 99900)) {
+ /*
+ * The interrupt is stuck
+ */
+ __report_bad_irq(irq, desc, action_ret);
+ /*
+ * Now kill the IRQ
+ */
+ printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+ desc->depth++;
+ desc->chip->disable(irq);
+
+ mod_timer(&poll_spurious_irq_timer,
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ }
+ desc->irqs_unhandled = 0;
+}
+
+int noirqdebug __read_mostly;
+
+int noirqdebug_setup(char *str)
+{
+ noirqdebug = 1;
+ printk(KERN_INFO "IRQ lockup detection disabled\n");
+
+ return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+module_param(noirqdebug, bool, 0644);
+MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
+
+static int __init irqfixup_setup(char *str)
+{
+ irqfixup = 1;
+ printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
+ printk(KERN_WARNING "This may impact system performance.\n");
+
+ return 1;
+}
+
+__setup("irqfixup", irqfixup_setup);
+module_param(irqfixup, int, 0644);
+MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
+
+static int __init irqpoll_setup(char *str)
+{
+ irqfixup = 2;
+ printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
+ "enabled\n");
+ printk(KERN_WARNING "This may significantly impact system "
+ "performance\n");
+ return 1;
+}
+
+__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/itimer.c b/kernel/itimer.c
new file mode 100644
index 0000000..6a5fe93
--- /dev/null
+++ b/kernel/itimer.c
@@ -0,0 +1,282 @@
+/*
+ * linux/kernel/itimer.c
+ *
+ * Copyright (C) 1992 Darren Senn
+ */
+
+/* These are all the functions necessary to implement itimers */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/posix-timers.h>
+#include <linux/hrtimer.h>
+
+#include <asm/uaccess.h>
+
+/**
+ * itimer_get_remtime - get remaining time for the timer
+ *
+ * @timer: the timer to read
+ *
+ * Returns the delta between the expiry time and now; an expired but
+ * still pending timer is reported as 1 usec, an inactive timer as 0.
+ */
+static struct timeval itimer_get_remtime(struct hrtimer *timer)
+{
+ ktime_t rem = hrtimer_get_remaining(timer);
+
+ /*
+ * Racy but safe: if the itimer expires after the above
+	 * hrtimer_get_remaining() call but before this condition
+ * then we return 0 - which is correct.
+ */
+ if (hrtimer_active(timer)) {
+ if (rem.tv64 <= 0)
+ rem.tv64 = NSEC_PER_USEC;
+ } else
+ rem.tv64 = 0;
+
+ return ktime_to_timeval(rem);
+}
+
+int do_getitimer(int which, struct itimerval *value)
+{
+ struct task_struct *tsk = current;
+ cputime_t cinterval, cval;
+
+ switch (which) {
+ case ITIMER_REAL:
+ spin_lock_irq(&tsk->sighand->siglock);
+ value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
+ value->it_interval =
+ ktime_to_timeval(tsk->signal->it_real_incr);
+ spin_unlock_irq(&tsk->sighand->siglock);
+ break;
+ case ITIMER_VIRTUAL:
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_virt_expires;
+ cinterval = tsk->signal->it_virt_incr;
+ if (!cputime_eq(cval, cputime_zero)) {
+ struct task_cputime cputime;
+ cputime_t utime;
+
+ thread_group_cputime(tsk, &cputime);
+ utime = cputime.utime;
+ if (cputime_le(cval, utime)) { /* about to fire */
+ cval = jiffies_to_cputime(1);
+ } else {
+ cval = cputime_sub(cval, utime);
+ }
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ cputime_to_timeval(cval, &value->it_value);
+ cputime_to_timeval(cinterval, &value->it_interval);
+ break;
+ case ITIMER_PROF:
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_prof_expires;
+ cinterval = tsk->signal->it_prof_incr;
+ if (!cputime_eq(cval, cputime_zero)) {
+ struct task_cputime times;
+ cputime_t ptime;
+
+ thread_group_cputime(tsk, &times);
+ ptime = cputime_add(times.utime, times.stime);
+ if (cputime_le(cval, ptime)) { /* about to fire */
+ cval = jiffies_to_cputime(1);
+ } else {
+ cval = cputime_sub(cval, ptime);
+ }
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ cputime_to_timeval(cval, &value->it_value);
+ cputime_to_timeval(cinterval, &value->it_interval);
+ break;
+ default:
+		return -EINVAL;
+ }
+ return 0;
+}
+
+SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
+{
+ int error = -EFAULT;
+ struct itimerval get_buffer;
+
+ if (value) {
+ error = do_getitimer(which, &get_buffer);
+ if (!error &&
+ copy_to_user(value, &get_buffer, sizeof(get_buffer)))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+
+/*
+ * The timer is automagically restarted when interval != 0
+ */
+enum hrtimer_restart it_real_fn(struct hrtimer *timer)
+{
+ struct signal_struct *sig =
+ container_of(timer, struct signal_struct, real_timer);
+
+ kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Returns true if the timeval is in canonical form
+ */
+#define timeval_valid(t) \
+ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
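+
+/*
+ * A couple of illustrative values: { .tv_sec = 1, .tv_usec = 500000 } is
+ * canonical, while { .tv_sec = 0, .tv_usec = 1000000 } and
+ * { .tv_sec = 0, .tv_usec = -1 } are not; the unsigned long cast above is
+ * what rejects negative tv_usec values.
+ */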
+
+int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+{
+ struct task_struct *tsk = current;
+ struct hrtimer *timer;
+ ktime_t expires;
+ cputime_t cval, cinterval, nval, ninterval;
+
+ /*
+ * Validate the timevals in value.
+ */
+ if (!timeval_valid(&value->it_value) ||
+ !timeval_valid(&value->it_interval))
+ return -EINVAL;
+
+ switch (which) {
+ case ITIMER_REAL:
+again:
+ spin_lock_irq(&tsk->sighand->siglock);
+ timer = &tsk->signal->real_timer;
+ if (ovalue) {
+ ovalue->it_value = itimer_get_remtime(timer);
+ ovalue->it_interval
+ = ktime_to_timeval(tsk->signal->it_real_incr);
+ }
+ /* We are sharing ->siglock with it_real_fn() */
+ if (hrtimer_try_to_cancel(timer) < 0) {
+ spin_unlock_irq(&tsk->sighand->siglock);
+ goto again;
+ }
+ expires = timeval_to_ktime(value->it_value);
+ if (expires.tv64 != 0) {
+ tsk->signal->it_real_incr =
+ timeval_to_ktime(value->it_interval);
+ hrtimer_start(timer, expires, HRTIMER_MODE_REL);
+ } else
+ tsk->signal->it_real_incr.tv64 = 0;
+
+ spin_unlock_irq(&tsk->sighand->siglock);
+ break;
+ case ITIMER_VIRTUAL:
+ nval = timeval_to_cputime(&value->it_value);
+ ninterval = timeval_to_cputime(&value->it_interval);
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_virt_expires;
+ cinterval = tsk->signal->it_virt_incr;
+ if (!cputime_eq(cval, cputime_zero) ||
+ !cputime_eq(nval, cputime_zero)) {
+ if (cputime_gt(nval, cputime_zero))
+ nval = cputime_add(nval,
+ jiffies_to_cputime(1));
+ set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
+ &nval, &cval);
+ }
+ tsk->signal->it_virt_expires = nval;
+ tsk->signal->it_virt_incr = ninterval;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (ovalue) {
+ cputime_to_timeval(cval, &ovalue->it_value);
+ cputime_to_timeval(cinterval, &ovalue->it_interval);
+ }
+ break;
+ case ITIMER_PROF:
+ nval = timeval_to_cputime(&value->it_value);
+ ninterval = timeval_to_cputime(&value->it_interval);
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_prof_expires;
+ cinterval = tsk->signal->it_prof_incr;
+ if (!cputime_eq(cval, cputime_zero) ||
+ !cputime_eq(nval, cputime_zero)) {
+ if (cputime_gt(nval, cputime_zero))
+ nval = cputime_add(nval,
+ jiffies_to_cputime(1));
+ set_process_cpu_timer(tsk, CPUCLOCK_PROF,
+ &nval, &cval);
+ }
+ tsk->signal->it_prof_expires = nval;
+ tsk->signal->it_prof_incr = ninterval;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (ovalue) {
+ cputime_to_timeval(cval, &ovalue->it_value);
+ cputime_to_timeval(cinterval, &ovalue->it_interval);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * alarm_setitimer - set alarm in seconds
+ *
+ * @seconds: number of seconds until alarm
+ * 0 disables the alarm
+ *
+ * Returns the remaining time in seconds of a pending timer or 0 when
+ * the timer is not active.
+ *
+ * On 32 bit machines the seconds value is limited to INT_MAX to avoid
+ * negative timeval settings which would cause immediate expiry.
+ */
+unsigned int alarm_setitimer(unsigned int seconds)
+{
+ struct itimerval it_new, it_old;
+
+#if BITS_PER_LONG < 64
+ if (seconds > INT_MAX)
+ seconds = INT_MAX;
+#endif
+ it_new.it_value.tv_sec = seconds;
+ it_new.it_value.tv_usec = 0;
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+
+ do_setitimer(ITIMER_REAL, &it_new, &it_old);
+
+ /*
+ * We can't return 0 if we have an alarm pending ... And we'd
+	 * better return too much rather than too little anyway
+ */
+ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
+ it_old.it_value.tv_usec >= 500000)
+ it_old.it_value.tv_sec++;
+
+ return it_old.it_value.tv_sec;
+}
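+
+/*
+ * Minimal usage sketch (values invented): alarm_setitimer(5) arms
+ * ITIMER_REAL so that SIGALRM is delivered in roughly 5 seconds and
+ * returns the seconds left on any previously pending alarm, rounded up
+ * as above; alarm_setitimer(0) cancels a pending alarm in the same way.
+ */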
+
+SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
+ struct itimerval __user *, ovalue)
+{
+ struct itimerval set_buffer, get_buffer;
+ int error;
+
+ if (value) {
+		if (copy_from_user(&set_buffer, value, sizeof(set_buffer)))
+ return -EFAULT;
+ } else
+ memset((char *) &set_buffer, 0, sizeof(set_buffer));
+
+ error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
+ if (error || !ovalue)
+ return error;
+
+ if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+ return -EFAULT;
+ return 0;
+}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
new file mode 100644
index 0000000..7b8b0f2
--- /dev/null
+++ b/kernel/kallsyms.c
@@ -0,0 +1,487 @@
+/*
+ * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
+ *
+ * Rewritten and vastly simplified by Rusty Russell for in-kernel
+ * module loader:
+ * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * ChangeLog:
+ *
+ * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
+ * Changed the compression method from stem compression to "table lookup"
+ * compression (see scripts/kallsyms.c for a more complete description)
+ */
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h> /* for cond_resched */
+#include <linux/mm.h>
+#include <linux/ctype.h>
+
+#include <asm/sections.h>
+
+#ifdef CONFIG_KALLSYMS_ALL
+#define all_var 1
+#else
+#define all_var 0
+#endif
+
+/* These will be re-linked against their real values during the second link stage */
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
+
+/* tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV)
+ */
+extern const unsigned long kallsyms_num_syms
+__attribute__((weak, section(".rodata")));
+
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
+
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
+
+static inline int is_kernel_inittext(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sinittext
+ && addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
+ return 1;
+ return in_gate_area_no_task(addr);
+}
+
+static inline int is_kernel(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
+ return 1;
+ return in_gate_area_no_task(addr);
+}
+
+static int is_ksym_addr(unsigned long addr)
+{
+ if (all_var)
+ return is_kernel(addr);
+
+ return is_kernel_text(addr) || is_kernel_inittext(addr);
+}
+
+/* expand compressed symbol data into the resulting uncompressed string,
+ given the offset to where the symbol is in the compressed stream */
+static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+{
+ int len, skipped_first = 0;
+ const u8 *tptr, *data;
+
+ /* get the compressed symbol length from the first symbol byte */
+ data = &kallsyms_names[off];
+ len = *data;
+ data++;
+
+ /* update the offset to return the offset for the next symbol on
+ * the compressed stream */
+ off += len + 1;
+
+ /* for every byte on the compressed symbol data, copy the table
+ entry for that byte */
+ while(len) {
+ tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+ data++;
+ len--;
+
+ while (*tptr) {
+ if(skipped_first) {
+ *result = *tptr;
+ result++;
+ } else
+ skipped_first = 1;
+ tptr++;
+ }
+ }
+
+ *result = '\0';
+
+	/* return the offset to the next symbol */
+ return off;
+}
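+
+/*
+ * Sketch of the layout walked above, with made-up token indices: a record
+ * such as { 0x03, i0, i1, i2 } is a three-byte compressed symbol; each
+ * index selects a NUL-terminated fragment from kallsyms_token_table (via
+ * kallsyms_token_index) and the fragments are concatenated. The very
+ * first expanded character is the symbol type and is dropped from the
+ * name, hence 'skipped_first'.
+ */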
+
+/* get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name */
+static char kallsyms_get_symbol_type(unsigned int off)
+{
+ /* get just the first code, look it up in the token table, and return the
+ * first char from this token */
+ return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+}
+
+
+/* find the offset on the compressed stream given an index in the
+ * kallsyms array */
+static unsigned int get_symbol_offset(unsigned long pos)
+{
+ const u8 *name;
+ int i;
+
+ /* use the closest marker we have. We have markers every 256 positions,
+ * so that should be close enough */
+ name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+
+ /* sequentially scan all the symbols up to the point we're searching for.
+ * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
+ * just need to add the len to the current pointer for every symbol we
+ * wish to skip */
+ for(i = 0; i < (pos&0xFF); i++)
+ name = name + (*name) + 1;
+
+ return name - kallsyms_names;
+}
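+
+/*
+ * Worked example with a made-up position: for pos == 0x305 the walk
+ * starts at kallsyms_markers[3] (0x305 >> 8) and then skips the next
+ * 0x305 & 0xFF == 5 length-prefixed records to reach the wanted symbol.
+ */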
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long i;
+ unsigned int off;
+
+ for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+ off = kallsyms_expand_symbol(off, namebuf);
+
+ if (strcmp(namebuf, name) == 0)
+ return kallsyms_addresses[i];
+ }
+ return module_kallsyms_lookup_name(name);
+}
+
+static unsigned long get_symbol_pos(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset)
+{
+ unsigned long symbol_start = 0, symbol_end = 0;
+ unsigned long i, low, high, mid;
+
+	/* This kernel should never have been booted. */
+ BUG_ON(!kallsyms_addresses);
+
+ /* do a binary search on the sorted kallsyms_addresses array */
+ low = 0;
+ high = kallsyms_num_syms;
+
+ while (high - low > 1) {
+ mid = low + (high - low) / 2;
+ if (kallsyms_addresses[mid] <= addr)
+ low = mid;
+ else
+ high = mid;
+ }
+
+ /*
+ * search for the first aliased symbol. Aliased
+ * symbols are symbols with the same address
+ */
+ while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ --low;
+
+ symbol_start = kallsyms_addresses[low];
+
+ /* Search for next non-aliased symbol */
+ for (i = low + 1; i < kallsyms_num_syms; i++) {
+ if (kallsyms_addresses[i] > symbol_start) {
+ symbol_end = kallsyms_addresses[i];
+ break;
+ }
+ }
+
+ /* if we found no next symbol, we use the end of the section */
+ if (!symbol_end) {
+ if (is_kernel_inittext(addr))
+ symbol_end = (unsigned long)_einittext;
+ else if (all_var)
+ symbol_end = (unsigned long)_end;
+ else
+ symbol_end = (unsigned long)_etext;
+ }
+
+ if (symbolsize)
+ *symbolsize = symbol_end - symbol_start;
+ if (offset)
+ *offset = addr - symbol_start;
+
+ return low;
+}
+
+/*
+ * Lookup an address but don't bother to find any names.
+ */
+int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
+ unsigned long *offset)
+{
+ char namebuf[KSYM_NAME_LEN];
+ if (is_ksym_addr(addr))
+ return !!get_symbol_pos(addr, symbolsize, offset);
+
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+}
+
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel
+ * - we guarantee that the returned name is valid until we reschedule even if
+ * it resides in a module
+ * - we also guarantee that modname will be valid until rescheduled
+ */
+const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ namebuf[KSYM_NAME_LEN - 1] = 0;
+ namebuf[0] = 0;
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, symbolsize, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
+ if (modname)
+ *modname = NULL;
+ return namebuf;
+ }
+
+ /* see if it's in a module */
+ return module_address_lookup(addr, symbolsize, offset, modname,
+ namebuf);
+}
+
+int lookup_symbol_name(unsigned long addr, char *symname)
+{
+ symname[0] = '\0';
+ symname[KSYM_NAME_LEN - 1] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, NULL, NULL);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_name(addr, symname);
+}
+
+int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ name[0] = '\0';
+ name[KSYM_NAME_LEN - 1] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, size, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ modname[0] = '\0';
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+}
+
+/* Look up a kernel symbol and return it in a text buffer. */
+int sprint_symbol(char *buffer, unsigned long address)
+{
+ char *modname;
+ const char *name;
+ unsigned long offset, size;
+ int len;
+
+ name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
+ if (!name)
+ return sprintf(buffer, "0x%lx", address);
+
+ if (name != buffer)
+ strcpy(buffer, name);
+ len = strlen(buffer);
+ buffer += len;
+
+ if (modname)
+ len += sprintf(buffer, "+%#lx/%#lx [%s]",
+ offset, size, modname);
+ else
+ len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+
+ return len;
+}
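+
+/*
+ * The buffer ends up looking roughly like "name+0x1c/0x240" for a core
+ * kernel symbol, "name+0x1c/0x240 [mymod]" for a symbol in a module
+ * called "mymod", or "0xdeadbeef" when no symbol covers the address;
+ * all the names, offsets and sizes here are only illustrative.
+ */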
+
+/* Look up a kernel symbol and print it to the kernel messages. */
+void __print_symbol(const char *fmt, unsigned long address)
+{
+ char buffer[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(buffer, address);
+
+ printk(fmt, buffer);
+}
+
+/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
+struct kallsym_iter
+{
+ loff_t pos;
+ unsigned long value;
+ unsigned int nameoff; /* If iterating in core kernel symbols */
+ char type;
+ char name[KSYM_NAME_LEN];
+ char module_name[MODULE_NAME_LEN];
+ int exported;
+};
+
+static int get_ksymbol_mod(struct kallsym_iter *iter)
+{
+ if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
+ &iter->type, iter->name, iter->module_name,
+ &iter->exported) < 0)
+ return 0;
+ return 1;
+}
+
+/* Returns space to next name. */
+static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
+{
+ unsigned off = iter->nameoff;
+
+ iter->module_name[0] = '\0';
+ iter->value = kallsyms_addresses[iter->pos];
+
+ iter->type = kallsyms_get_symbol_type(off);
+
+ off = kallsyms_expand_symbol(off, iter->name);
+
+ return off - iter->nameoff;
+}
+
+static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
+{
+ iter->name[0] = '\0';
+ iter->nameoff = get_symbol_offset(new_pos);
+ iter->pos = new_pos;
+}
+
+/* Returns false if pos at or past end of file. */
+static int update_iter(struct kallsym_iter *iter, loff_t pos)
+{
+ /* Module symbols can be accessed randomly. */
+ if (pos >= kallsyms_num_syms) {
+ iter->pos = pos;
+ return get_ksymbol_mod(iter);
+ }
+
+ /* If we're not on the desired position, reset to new position. */
+ if (pos != iter->pos)
+ reset_iter(iter, pos);
+
+ iter->nameoff += get_ksymbol_core(iter);
+ iter->pos++;
+
+ return 1;
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ (*pos)++;
+
+ if (!update_iter(m->private, *pos))
+ return NULL;
+ return p;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ if (!update_iter(m->private, *pos))
+ return NULL;
+ return m->private;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kallsym_iter *iter = m->private;
+
+ /* Some debugging symbols have no name. Ignore them. */
+ if (!iter->name[0])
+ return 0;
+
+ if (iter->module_name[0]) {
+ char type;
+
+ /* Label it "global" if it is exported,
+ * "local" if not exported. */
+ type = iter->exported ? toupper(iter->type) :
+ tolower(iter->type);
+ seq_printf(m, "%0*lx %c %s\t[%s]\n",
+ (int)(2*sizeof(void*)),
+ iter->value, type, iter->name, iter->module_name);
+ } else
+ seq_printf(m, "%0*lx %c %s\n",
+ (int)(2*sizeof(void*)),
+ iter->value, iter->type, iter->name);
+ return 0;
+}
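+
+/*
+ * Each /proc/kallsyms line therefore looks roughly like
+ * "c0123456 T do_something" for a built-in symbol, or
+ * "e0876543 t helper [mymod]" (with a tab before the bracketed module
+ * name) for a module symbol, where upper case marks an exported module
+ * symbol; the addresses and names here are invented.
+ */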
+
+static const struct seq_operations kallsyms_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show
+};
+
+static int kallsyms_open(struct inode *inode, struct file *file)
+{
+ /* We keep iterator in m->private, since normal case is to
+	 * s_start from where we left off, so we avoid
+	 * using get_symbol_offset for every symbol */
+ struct kallsym_iter *iter;
+ int ret;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+ reset_iter(iter, 0);
+
+ ret = seq_open(file, &kallsyms_op);
+ if (ret == 0)
+ ((struct seq_file *)file->private_data)->private = iter;
+ else
+ kfree(iter);
+ return ret;
+}
+
+static const struct file_operations kallsyms_operations = {
+ .open = kallsyms_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init kallsyms_init(void)
+{
+ proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
+ return 0;
+}
+__initcall(kallsyms_init);
+
+EXPORT_SYMBOL(__print_symbol);
+EXPORT_SYMBOL_GPL(sprint_symbol);
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 0000000..ad257a5
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1499 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsrelease.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/sections.h>
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t* crash_notes;
+
+/* vmcoreinfo stuff */
+unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address, used for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
+{
+ size_t segment_bytes;
+ struct kimage *image;
+ unsigned long i;
+ int result;
+
+ /* Allocate a controlling structure */
+ result = -ENOMEM;
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ goto out;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->start = entry;
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unuseable pages */
+ INIT_LIST_HEAD(&image->unuseable_pages);
+
+ /* Read in the segments */
+ image->nr_segments = nr_segments;
+ segment_bytes = nr_segments * sizeof(*segments);
+ result = copy_from_user(image->segment, segments, segment_bytes);
+ if (result)
+ goto out;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+	 * the destination addresses are page aligned. Too many
+	 * special cases crop up when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ goto out;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ goto out;
+ }
+
+ /* Verify our destination addresses do not overlap.
+	 * If we allowed overlapping destination addresses
+	 * through, very weird things can happen with no
+ * easy explanation as one segment stops on another.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ goto out;
+ }
+ }
+
+ /* Ensure our buffer sizes are strictly less than
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ goto out;
+ }
+
+ result = 0;
+out:
+ if (result == 0)
+ *rimage = image;
+ else
+ kfree(image);
+
+ return result;
+
+}
+
+static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
+{
+ int result;
+ struct kimage *image;
+
+ /* Allocate and initialize a controlling structure */
+ image = NULL;
+ result = do_kimage_alloc(&image, entry, nr_segments, segments);
+ if (result)
+ goto out;
+
+ *rimage = image;
+
+ /*
+	 * Find a location for the control code buffer, and add it to
+	 * the vector of segments so that its pages will also be
+ * counted as destination pages.
+ */
+ result = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ printk(KERN_ERR "Could not allocate control_code_buffer\n");
+ goto out;
+ }
+
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ printk(KERN_ERR "Could not allocate swap buffer\n");
+ goto out;
+ }
+
+ result = 0;
+ out:
+ if (result == 0)
+ *rimage = image;
+ else
+ kfree(image);
+
+ return result;
+}
+
+static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
+{
+ int result;
+ struct kimage *image;
+ unsigned long i;
+
+ image = NULL;
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
+ result = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ /* Allocate and initialize a controlling structure */
+ result = do_kimage_alloc(&image, entry, nr_segments, segments);
+ if (result)
+ goto out;
+
+ /* Enable the special crash kernel control page
+ * allocation policy.
+ */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+	 * reserved area of RAM. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) || (mend > crashk_res.end))
+ goto out;
+ }
+
+ /*
+	 * Find a location for the control code buffer, and add it to
+	 * the vector of segments so that its pages will also be
+ * counted as destination pages.
+ */
+ result = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ printk(KERN_ERR "Could not allocate control_code_buffer\n");
+ goto out;
+ }
+
+ result = 0;
+out:
+ if (result == 0)
+ *rimage = image;
+ else
+ kfree(image);
+
+ return result;
+}
+
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *pages;
+
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+ pages->mapping = NULL;
+ set_page_private(pages, order);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page_private(page);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(GFP_KERNEL, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+		/* Because the page is already in its destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+	 * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+	 * Control pages are also the only pages we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
+ break;
+ if (hole_end > crashk_res.end)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ break;
+ }
+ }
+ if (pages)
+ image->control_page = hole_end;
+
+ return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+ if (result == 0)
+ image->destination = destination;
+
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+ if (result == 0)
+ image->destination += PAGE_SIZE;
+
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unuseable pages I have cached */
+ kimage_free_page_list(&image->unuseable_pages);
+
+}
+static void kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION)? \
+ phys_to_virt((entry & PAGE_MASK)): ptr +1)
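+
+/*
+ * Illustrative shape of the entry list walked by the macro above
+ * (addresses are placeholders); a destination is announced once and then
+ * advances implicitly by PAGE_SIZE per source page:
+ *
+ *   dest | IND_DESTINATION
+ *   src0 | IND_SOURCE          (copied to dest)
+ *   src1 | IND_SOURCE          (copied to dest + PAGE_SIZE)
+ *   next | IND_INDIRECTION     (continue in the next entry page)
+ *   ...
+ *   IND_DONE
+ */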
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ }
+ else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+	 * that no problems will occur is trivial, and the
+	 * implementation is simple to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+		/* If the page cannot be used, file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unuseable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want, use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+		 * I know that the page is someone's destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+			 * destination page, so return it if its
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ else {
+			/* Place the page on the destination list; I
+ * will use it later.
+ */
+ list_add(&page->lru, &image->dest_pages);
+ }
+ }
+
+ return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ unsigned long ubytes, mbytes;
+ int result;
+ unsigned char __user *buf;
+
+ result = 0;
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ memset(ptr, 0, PAGE_SIZE);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+ if (mchunk > mbytes)
+ mchunk = mbytes;
+
+ uchunk = mchunk;
+ if (uchunk > ubytes)
+ uchunk = ubytes;
+
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = (result < 0) ? result : -EIO;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+	/* For crash dump kernels we simply copy the data from
+	 * user space to its destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ unsigned long ubytes, mbytes;
+ int result;
+ unsigned char __user *buf;
+
+ result = 0;
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+ if (mchunk > mbytes)
+ mchunk = mbytes;
+
+ uchunk = mchunk;
+ if (uchunk > ubytes) {
+ uchunk = ubytes;
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+ result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
+ kunmap(page);
+ if (result) {
+ result = (result < 0) ? result : -EIO;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ int result = -ENOMEM;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ result = kimage_load_normal_segment(image, segment);
+ break;
+ case KEXEC_TYPE_CRASH:
+ result = kimage_load_crash_segment(image, segment);
+ break;
+ }
+
+ return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ * address space, and very carefully places the data in the
+ * allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down, preventing on-going DMAs, and placing
+ * the devices in a consistent state so a later kernel can
+ * reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination, and
+ * jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+
+static DEFINE_MUTEX(kexec_mutex);
+
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+ struct kexec_segment __user *, segments, unsigned long, flags)
+{
+ struct kimage **dest_image, *image;
+ int result;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ /*
+ * Verify we have a legal set of flags
+ * This leaves us room for future extensions.
+ */
+ if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+ return -EINVAL;
+
+ /* Verify we are on the appropriate architecture */
+ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+ ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+ return -EINVAL;
+
+ /* Put an artificial cap on the number
+ * of segments passed to kexec_load.
+ */
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ image = NULL;
+ result = 0;
+
+ /* Because we write directly to the reserved memory
+ * region when loading crash kernels we need a mutex here to
+ * prevent multiple crash kernels from attempting to load
+ * simultaneously, and to prevent a crash kernel from loading
+	 * over the top of an in-use crash kernel.
+ *
+ * KISS: always take the mutex.
+ */
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_ON_CRASH)
+ dest_image = &kexec_crash_image;
+ if (nr_segments > 0) {
+ unsigned long i;
+
+ /* Loading another kernel to reboot into */
+ if ((flags & KEXEC_ON_CRASH) == 0)
+ result = kimage_normal_alloc(&image, entry,
+ nr_segments, segments);
+ /* Loading another kernel to switch to if this one crashes */
+ else if (flags & KEXEC_ON_CRASH) {
+ /* Free any current crash dump kernel before
+ * we corrupt it.
+ */
+ kimage_free(xchg(&kexec_crash_image, NULL));
+ result = kimage_crash_alloc(&image, entry,
+ nr_segments, segments);
+ }
+ if (result)
+ goto out;
+
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
+ result = machine_kexec_prepare(image);
+ if (result)
+ goto out;
+
+ for (i = 0; i < nr_segments; i++) {
+ result = kimage_load_segment(image, &image->segment[i]);
+ if (result)
+ goto out;
+ }
+ kimage_terminate(image);
+ }
+	/* Install the new kernel and uninstall the old one */
+ image = xchg(dest_image, image);
+
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+
+ return result;
+}
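+
+/*
+ * Rough lifecycle sketch: a user space loader (such as kexec-tools,
+ * outside this tree) builds the kexec_segment array and calls this
+ * syscall; the loaded image then sits idle until kernel_kexec() below
+ * runs it (typically reached via reboot(LINUX_REBOOT_CMD_KEXEC)) or,
+ * for KEXEC_ON_CRASH images, until crash_kexec() fires on a panic.
+ */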
+
+#ifdef CONFIG_COMPAT
+asmlinkage long compat_sys_kexec_load(unsigned long entry,
+ unsigned long nr_segments,
+ struct compat_kexec_segment __user *segments,
+ unsigned long flags)
+{
+ struct compat_kexec_segment in;
+ struct kexec_segment out, __user *ksegments;
+ unsigned long i, result;
+
+ /* Don't allow clients that don't understand the native
+ * architecture to do anything.
+ */
+ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+ return -EINVAL;
+
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ for (i=0; i < nr_segments; i++) {
+ result = copy_from_user(&in, &segments[i], sizeof(in));
+ if (result)
+ return -EFAULT;
+
+ out.buf = compat_ptr(in.buf);
+ out.bufsz = in.bufsz;
+ out.mem = in.mem;
+ out.memsz = in.memsz;
+
+ result = copy_to_user(&ksegments[i], &out, sizeof(out));
+ if (result)
+ return -EFAULT;
+ }
+
+ return sys_kexec_load(entry, nr_segments, ksegments, flags);
+}
+#endif
+
+void crash_kexec(struct pt_regs *regs)
+{
+ /* Take the kexec_mutex here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ mutex_unlock(&kexec_mutex);
+ }
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+ size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) + 3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
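+
+/*
+ * The resulting layout is the usual ELF note format: the elf_note header,
+ * then the name, then the data, each rounded up to a 4-byte boundary by
+ * the (x + 3)/4 arithmetic above. A five-byte name (four characters plus
+ * the terminating NUL, as with the NT_PRSTATUS notes written below)
+ * therefore occupies two 32-bit words of the buffer.
+ */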
+
+static void final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= NR_CPUS))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ crash_notes = alloc_percpu(note_buf_t);
+ if (!crash_notes) {
+ printk("Kexec: Memory allocation for saving cpu register"
+ " states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+module_init(crash_notes_memory_init)
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warning("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+		/* if no ':' is here, then we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("crashkernel: Memory "
+ "value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warning("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warning("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warning("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("Memory value expected "
+ "after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
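+
+/*
+ * Example of the extended syntax (all values invented):
+ *
+ *   crashkernel=512M-2G:64M,2G-:128M@16M
+ *
+ * reserves 64M when the system RAM lies in [512M, 2G), 128M when it is
+ * 2G or more (an omitted range end means "no upper bound"), and the
+ * optional "@16M" pins the physical base of the reservation.
+ */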
+
+/*
+ * This function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warning("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+
+ return 0;
+}
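+
+/*
+ * Example of the classic syntax (values invented): "crashkernel=64M@16M"
+ * asks for a 64M reservation at physical address 16M, while a bare
+ * "crashkernel=64M" leaves *crash_base at 0 and leaves the meaning of a
+ * zero base up to the architecture code that called us.
+ */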
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+ char *first_colon, *first_space;
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, "crashkernel=");
+ while (p) {
+ ck_cmdline = p;
+ p = strstr(p+1, "crashkernel=");
+ }
+
+ if (!ck_cmdline)
+ return -EINVAL;
+
+ ck_cmdline += 12; /* strlen("crashkernel=") */
+
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+ else
+ return parse_crashkernel_simple(ck_cmdline, crash_size,
+ crash_base);
+
+ return 0;
+}
+
+
+
+void crash_save_vmcoreinfo(void)
+{
+ u32 *buf;
+
+ if (!vmcoreinfo_size)
+ return;
+
+ vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+
+ buf = (u32 *)vmcoreinfo_note;
+
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+
+ final_note(buf);
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ int r;
+
+ va_start(args, fmt);
+ r = vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ if (r + vmcoreinfo_size > vmcoreinfo_max_size)
+ r = vmcoreinfo_max_size - vmcoreinfo_size;
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmlist);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+
+ arch_crash_save_vmcoreinfo();
+
+ return 0;
+}
+
+module_init(crash_save_vmcoreinfo_init)
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ mutex_lock(&pm_mutex);
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = device_suspend(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Resume_devices;
+ device_pm_lock();
+ local_irq_disable();
+ /* At this point, device_suspend() has been called,
+ * but *not* device_power_down(). We *must*
+ * device_power_down() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = device_power_down(PMSG_FREEZE);
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kernel_restart_prepare(NULL);
+ printk(KERN_EMERG "Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ device_power_up(PMSG_RESTORE);
+ Enable_irqs:
+ local_irq_enable();
+ device_pm_unlock();
+ enable_nonboot_cpus();
+ Resume_devices:
+ device_resume(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ mutex_unlock(&pm_mutex);
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
new file mode 100644
index 0000000..bc41ad0
--- /dev/null
+++ b/kernel/kfifo.c
@@ -0,0 +1,197 @@
+/*
+ * A simple kernel FIFO implementation.
+ *
+ * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/kfifo.h>
+#include <linux/log2.h>
+
+/**
+ * kfifo_init - allocates a new FIFO using a preallocated buffer
+ * @buffer: the preallocated buffer to be used.
+ * @size: the size of the internal buffer; this has to be a power of 2.
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @lock: the lock to be used to protect the fifo buffer
+ *
+ * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
+ * &struct kfifo with kfree().
+ */
+struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
+ gfp_t gfp_mask, spinlock_t *lock)
+{
+ struct kfifo *fifo;
+
+ /* size must be a power of 2 */
+ BUG_ON(!is_power_of_2(size));
+
+ fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
+ if (!fifo)
+ return ERR_PTR(-ENOMEM);
+
+ fifo->buffer = buffer;
+ fifo->size = size;
+ fifo->in = fifo->out = 0;
+ fifo->lock = lock;
+
+ return fifo;
+}
+EXPORT_SYMBOL(kfifo_init);
+
+/**
+ * kfifo_alloc - allocates a new FIFO and its internal buffer
+ * @size: the size of the internal buffer to be allocated.
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @lock: the lock to be used to protect the fifo buffer
+ *
+ * The size will be rounded-up to a power of 2.
+ */
+struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
+{
+ unsigned char *buffer;
+ struct kfifo *ret;
+
+ /*
+ * round up to the next power of 2, since our 'let the indices
+ * wrap' technique works only in this case.
+ */
+ if (size & (size - 1)) {
+ BUG_ON(size > 0x80000000);
+ size = roundup_pow_of_two(size);
+ }
+
+ buffer = kmalloc(size, gfp_mask);
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kfifo_init(buffer, size, gfp_mask, lock);
+
+ if (IS_ERR(ret))
+ kfree(buffer);
+
+ return ret;
+}
+EXPORT_SYMBOL(kfifo_alloc);
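+
+/*
+ * Usage sketch (illustrative, not part of the original file), assuming the
+ * spinlocked kfifo_put()/kfifo_get() wrappers from <linux/kfifo.h> and a
+ * hypothetical driver buffer:
+ *
+ *	static DEFINE_SPINLOCK(my_fifo_lock);
+ *	static struct kfifo *my_fifo;
+ *
+ *	my_fifo = kfifo_alloc(128, GFP_KERNEL, &my_fifo_lock);
+ *	if (IS_ERR(my_fifo))
+ *		return PTR_ERR(my_fifo);
+ *	kfifo_put(my_fifo, data, len);
+ *	...
+ *	kfifo_free(my_fifo);
+ */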
+
+/**
+ * kfifo_free - frees the FIFO
+ * @fifo: the fifo to be freed.
+ */
+void kfifo_free(struct kfifo *fifo)
+{
+ kfree(fifo->buffer);
+ kfree(fifo);
+}
+EXPORT_SYMBOL(kfifo_free);
+
+/**
+ * __kfifo_put - puts some data into the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ * @buffer: the data to be added.
+ * @len: the length of the data to be added.
+ *
+ * This function copies at most @len bytes from the @buffer into
+ * the FIFO depending on the free space, and returns the number of
+ * bytes copied.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int __kfifo_put(struct kfifo *fifo,
+ unsigned char *buffer, unsigned int len)
+{
+ unsigned int l;
+
+ len = min(len, fifo->size - fifo->in + fifo->out);
+
+ /*
+ * Ensure that we sample the fifo->out index -before- we
+ * start putting bytes into the kfifo.
+ */
+
+ smp_mb();
+
+ /* first put the data starting from fifo->in to buffer end */
+ l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
+ memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
+
+ /* then put the rest (if any) at the beginning of the buffer */
+ memcpy(fifo->buffer, buffer + l, len - l);
+
+ /*
+ * Ensure that we add the bytes to the kfifo -before-
+ * we update the fifo->in index.
+ */
+
+ smp_wmb();
+
+ fifo->in += len;
+
+ return len;
+}
+EXPORT_SYMBOL(__kfifo_put);
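+
+/*
+ * Worked example (illustrative, not part of the original file): with
+ * size = 8, in = 6 and out = 3 there are size - in + out = 5 free bytes.
+ * A put of 4 bytes copies min(4, 8 - (6 & 7)) = 2 bytes up to the end of
+ * the buffer, the remaining 2 bytes to its start, and advances in to 10.
+ * The indices are only masked with (size - 1) when the buffer is accessed,
+ * so letting them wrap the unsigned range is safe.
+ */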
+
+/**
+ * __kfifo_get - gets some data from the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ * @buffer: where the data must be copied.
+ * @len: the size of the destination buffer.
+ *
+ * This function copies at most @len bytes from the FIFO into the
+ * @buffer and returns the number of copied bytes.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int __kfifo_get(struct kfifo *fifo,
+ unsigned char *buffer, unsigned int len)
+{
+ unsigned int l;
+
+ len = min(len, fifo->in - fifo->out);
+
+ /*
+ * Ensure that we sample the fifo->in index -before- we
+ * start removing bytes from the kfifo.
+ */
+
+ smp_rmb();
+
+ /* first get the data from fifo->out until the end of the buffer */
+ l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
+ memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
+
+ /* then get the rest (if any) from the beginning of the buffer */
+ memcpy(buffer + l, fifo->buffer, len - l);
+
+ /*
+ * Ensure that we remove the bytes from the kfifo -before-
+ * we update the fifo->out index.
+ */
+
+ smp_mb();
+
+ fifo->out += len;
+
+ return len;
+}
+EXPORT_SYMBOL(__kfifo_get);
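+
+/*
+ * Worked example (illustrative, not part of the original file), continuing
+ * the one above: with in = 10 and out = 3 there are in - out = 7 bytes
+ * available. A get of 7 bytes copies min(7, 8 - (3 & 7)) = 5 bytes from the
+ * end of the buffer, 2 more from its start, and advances out to 10, which
+ * leaves the fifo empty (in == out).
+ */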
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
new file mode 100644
index 0000000..e4dcfb2
--- /dev/null
+++ b/kernel/kgdb.c
@@ -0,0 +1,1736 @@
+/*
+ * KGDB stub.
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2008 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+#include <linux/pid_namespace.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/threads.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/pid.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/byteorder.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unaligned.h>
+
+static int kgdb_break_asap;
+
+#define KGDB_MAX_THREAD_QUERY 17
+struct kgdb_state {
+ int ex_vector;
+ int signo;
+ int err_code;
+ int cpu;
+ int pass_exception;
+ unsigned long thr_query;
+ unsigned long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+};
+
+static struct debuggerinfo_struct {
+ void *debuggerinfo;
+ struct task_struct *task;
+} kgdb_info[NR_CPUS];
+
+/**
+ * kgdb_connected - Is a host GDB connected to us?
+ */
+int kgdb_connected;
+EXPORT_SYMBOL_GPL(kgdb_connected);
+
+/* All the KGDB handlers are installed */
+static int kgdb_io_module_registered;
+
+/* Guard for recursive entry */
+static int exception_level;
+
+static struct kgdb_io *kgdb_io_ops;
+static DEFINE_SPINLOCK(kgdb_registration_lock);
+
+/* kgdb console driver is loaded */
+static int kgdb_con_registered;
+/* determine if kgdb console output should be used */
+static int kgdb_use_con;
+
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
+module_param(kgdb_use_con, int, 0644);
+
+/*
+ * Holds information about breakpoints in a kernel. These breakpoints are
+ * added and removed by gdb.
+ */
+static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
+ [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
+};
+
+/*
+ * The CPU# of the active CPU, or -1 if none:
+ */
+atomic_t kgdb_active = ATOMIC_INIT(-1);
+
+/*
+ * We use NR_CPUS not PERCPU, in case kgdb is used to debug early
+ * bootup code (which might not have percpu set up yet):
+ */
+static atomic_t passive_cpu_wait[NR_CPUS];
+static atomic_t cpu_in_kgdb[NR_CPUS];
+atomic_t kgdb_setting_breakpoint;
+
+struct task_struct *kgdb_usethread;
+struct task_struct *kgdb_contthread;
+
+int kgdb_single_step;
+
+/* Our I/O buffers. */
+static char remcom_in_buffer[BUFMAX];
+static char remcom_out_buffer[BUFMAX];
+
+/* Storage for the registers, in GDB format. */
+static unsigned long gdb_regs[(NUMREGBYTES +
+ sizeof(unsigned long) - 1) /
+ sizeof(unsigned long)];
+
+/* to keep track of the CPU which is doing the single stepping */
+atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+
+/*
+ * If you are debugging a problem where roundup (the collection of
+ * all other CPUs) is a problem [this should be extremely rare],
+ * then use the nokgdbroundup option to avoid roundup. In that case
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+static int kgdb_do_roundup = 1;
+
+static int __init opt_nokgdbroundup(char *str)
+{
+ kgdb_do_roundup = 0;
+
+ return 0;
+}
+
+early_param("nokgdbroundup", opt_nokgdbroundup);
+
+/*
+ * Finally, some KGDB code :-)
+ */
+
+/*
+ * Weak aliases for breakpoint management,
+ * can be overridden by architectures when needed:
+ */
+int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+{
+ int err;
+
+ err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+
+ return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+{
+ return probe_kernel_write((char *)addr,
+ (char *)bundle, BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+ char tmp_variable[BREAK_INSTR_SIZE];
+ int err;
+ /* Validate setting the breakpoint and then removing it. If the
+ * remove fails, the kernel needs to emit a bad message because we
+ * are in deep trouble, not being able to put things back the way
+ * we found them.
+ */
+ err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+ if (err)
+ return err;
+ err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+ if (err)
+ printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+ "memory destroyed at: %lx", addr);
+ return err;
+}
+
+unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+
+int __weak kgdb_arch_init(void)
+{
+ return 0;
+}
+
+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ return 0;
+}
+
+void __weak
+kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
+{
+ return;
+}
+
+/**
+ * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
+ * @regs: Current &struct pt_regs.
+ *
+ * This function will be called if the particular architecture must
+ * disable hardware debugging while it is processing gdb packets or
+ * handling an exception.
+ */
+void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
+{
+}
+
+/*
+ * GDB remote protocol parser:
+ */
+
+static int hex(char ch)
+{
+ if ((ch >= 'a') && (ch <= 'f'))
+ return ch - 'a' + 10;
+ if ((ch >= '0') && (ch <= '9'))
+ return ch - '0';
+ if ((ch >= 'A') && (ch <= 'F'))
+ return ch - 'A' + 10;
+ return -1;
+}
+
+/* scan for the sequence $<data>#<checksum> */
+static void get_packet(char *buffer)
+{
+ unsigned char checksum;
+ unsigned char xmitcsum;
+ int count;
+ char ch;
+
+ do {
+ /*
+ * Spin and wait around for the start character, ignore all
+ * other characters:
+ */
+ while ((ch = (kgdb_io_ops->read_char())) != '$')
+ /* nothing */;
+
+ kgdb_connected = 1;
+ checksum = 0;
+ xmitcsum = -1;
+
+ count = 0;
+
+ /*
+ * now, read until a # or end of buffer is found:
+ */
+ while (count < (BUFMAX - 1)) {
+ ch = kgdb_io_ops->read_char();
+ if (ch == '#')
+ break;
+ checksum = checksum + ch;
+ buffer[count] = ch;
+ count = count + 1;
+ }
+ buffer[count] = 0;
+
+ if (ch == '#') {
+ xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
+ xmitcsum += hex(kgdb_io_ops->read_char());
+
+ if (checksum != xmitcsum)
+ /* failed checksum */
+ kgdb_io_ops->write_char('-');
+ else
+ /* successful transfer */
+ kgdb_io_ops->write_char('+');
+ if (kgdb_io_ops->flush)
+ kgdb_io_ops->flush();
+ }
+ } while (checksum != xmitcsum);
+}
+
+/*
+ * Send the packet in buffer.
+ * Check for gdb connection if asked for.
+ */
+static void put_packet(char *buffer)
+{
+ unsigned char checksum;
+ int count;
+ char ch;
+
+ /*
+ * $<packet info>#<checksum>.
+ */
+ while (1) {
+ kgdb_io_ops->write_char('$');
+ checksum = 0;
+ count = 0;
+
+ while ((ch = buffer[count])) {
+ kgdb_io_ops->write_char(ch);
+ checksum += ch;
+ count++;
+ }
+
+ kgdb_io_ops->write_char('#');
+ kgdb_io_ops->write_char(hex_asc_hi(checksum));
+ kgdb_io_ops->write_char(hex_asc_lo(checksum));
+ if (kgdb_io_ops->flush)
+ kgdb_io_ops->flush();
+
+ /* Now see what we get in reply. */
+ ch = kgdb_io_ops->read_char();
+
+ if (ch == 3)
+ ch = kgdb_io_ops->read_char();
+
+ /* If we get an ACK, we are done. */
+ if (ch == '+')
+ return;
+
+ /*
+ * If we get the start of another packet, this means
+ * that GDB is attempting to reconnect. We will NAK
+ * the packet being sent, and stop trying to send this
+ * packet.
+ */
+ if (ch == '$') {
+ kgdb_io_ops->write_char('-');
+ if (kgdb_io_ops->flush)
+ kgdb_io_ops->flush();
+ return;
+ }
+ }
+}
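+
+/*
+ * Illustrative exchange (not part of the original file): to read all
+ * registers gdb sends the frame $g#67 ('g' is 0x67, which is also its own
+ * checksum). get_packet() ACKs it with '+', the stub fills
+ * remcom_out_buffer, and put_packet() frames the reply as
+ * $<hex data>#<checksum>, resending it until gdb answers with '+'.
+ */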
+
+/*
+ * Convert the memory pointed to by mem into hex, placing result in buf.
+ * Returns 0 on success or the error code from probe_kernel_read().
+ */
+int kgdb_mem2hex(char *mem, char *buf, int count)
+{
+ char *tmp;
+ int err;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory copy. Hex conversion will work against this one.
+ */
+ tmp = buf + count;
+
+ err = probe_kernel_read(tmp, mem, count);
+ if (!err) {
+ while (count > 0) {
+ buf = pack_hex_byte(buf, *tmp);
+ tmp++;
+ count--;
+ }
+
+ *buf = 0;
+ }
+
+ return err;
+}
+
+/*
+ * Copy the binary array pointed to by buf into mem, undoing the 0x7d
+ * escaping used for $, # and 0x7d themselves (an escaped byte is XORed
+ * with 0x20). Returns 0 on success or the error code from
+ * probe_kernel_write().
+ */
+static int kgdb_ebin2mem(char *buf, char *mem, int count)
+{
+ int err = 0;
+ char c;
+
+ while (count-- > 0) {
+ c = *buf++;
+ if (c == 0x7d)
+ c = *buf++ ^ 0x20;
+
+ err = probe_kernel_write(mem, &c, 1);
+ if (err)
+ break;
+
+ mem++;
+ }
+
+ return err;
+}
+
+/*
+ * Convert the hex array pointed to by buf into binary to be placed in mem.
+ * Returns 0 on success or the error code from probe_kernel_write().
+ */
+int kgdb_hex2mem(char *buf, char *mem, int count)
+{
+ char *tmp_raw;
+ char *tmp_hex;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory that is converted from hex.
+ */
+ tmp_raw = buf + count * 2;
+
+ tmp_hex = tmp_raw - 1;
+ while (tmp_hex >= buf) {
+ tmp_raw--;
+ *tmp_raw = hex(*tmp_hex--);
+ *tmp_raw |= hex(*tmp_hex--) << 4;
+ }
+
+ return probe_kernel_write(mem, tmp_raw, count);
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
+{
+ int hex_val;
+ int num = 0;
+ int negate = 0;
+
+ *long_val = 0;
+
+ if (**ptr == '-') {
+ negate = 1;
+ (*ptr)++;
+ }
+ while (**ptr) {
+ hex_val = hex(**ptr);
+ if (hex_val < 0)
+ break;
+
+ *long_val = (*long_val << 4) | hex_val;
+ num++;
+ (*ptr)++;
+ }
+
+ if (negate)
+ *long_val = -*long_val;
+
+ return num;
+}
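+
+/*
+ * Worked example (illustrative, not part of the original file): for the
+ * input "7b,4" kgdb_hex2long() consumes "7b", stores 0x7b (123) in
+ * *long_val, returns 2 and leaves *ptr pointing at the ','.
+ */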
+
+/* Write memory due to an 'M' or 'X' packet. */
+static int write_mem_msg(int binary)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long addr;
+ unsigned long length;
+ int err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
+ if (binary)
+ err = kgdb_ebin2mem(ptr, (char *)addr, length);
+ else
+ err = kgdb_hex2mem(ptr, (char *)addr, length);
+ if (err)
+ return err;
+ if (CACHE_FLUSH_IS_SAFE)
+ flush_icache_range(addr, addr + length);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void error_packet(char *pkt, int error)
+{
+ error = -error;
+ pkt[0] = 'E';
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
+ pkt[3] = '\0';
+}
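+
+/*
+ * Worked example (illustrative, not part of the original file):
+ * error_packet(pkt, -EINVAL) negates the code to 22 and emits "E22"; the
+ * two digits are the decimal tens and units of the errno value.
+ */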
+
+/*
+ * Thread ID accessors. We represent a flat TID space to GDB, where
+ * the per CPU idle threads (which under Linux all have PID 0) are
+ * remapped to negative TIDs.
+ */
+
+#define BUF_THREAD_ID_SIZE 16
+
+static char *pack_threadid(char *pkt, unsigned char *id)
+{
+ char *limit;
+
+ limit = pkt + BUF_THREAD_ID_SIZE;
+ while (pkt < limit)
+ pkt = pack_hex_byte(pkt, *id++);
+
+ return pkt;
+}
+
+static void int_to_threadref(unsigned char *id, int value)
+{
+ unsigned char *scan;
+ int i = 4;
+
+ scan = (unsigned char *)id;
+ while (i--)
+ *scan++ = 0;
+ put_unaligned_be32(value, scan);
+}
+
+static struct task_struct *getthread(struct pt_regs *regs, int tid)
+{
+ /*
+ * Non-positive TIDs are remapped to the cpu shadow information
+ */
+ if (tid == 0 || tid == -1)
+ tid = -atomic_read(&kgdb_active) - 2;
+ if (tid < 0) {
+ if (kgdb_info[-tid - 2].task)
+ return kgdb_info[-tid - 2].task;
+ else
+ return idle_task(-tid - 2);
+ }
+
+ /*
+ * find_task_by_pid_ns() does not take the tasklist lock anymore
+ * but is nicely RCU locked - hence is a pretty resilient
+ * thing to use:
+ */
+ return find_task_by_pid_ns(tid, &init_pid_ns);
+}
+
+/*
+ * CPU debug state control:
+ */
+
+#ifdef CONFIG_SMP
+static void kgdb_wait(struct pt_regs *regs)
+{
+ unsigned long flags;
+ int cpu;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ kgdb_info[cpu].debuggerinfo = regs;
+ kgdb_info[cpu].task = current;
+ /*
+ * Make sure the above info reaches the primary CPU before
+ * our cpu_in_kgdb[] flag setting does:
+ */
+ smp_wmb();
+ atomic_set(&cpu_in_kgdb[cpu], 1);
+
+ /* Wait till primary CPU is done with debugging */
+ while (atomic_read(&passive_cpu_wait[cpu]))
+ cpu_relax();
+
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
+
+ /* fix up hardware debug registers on local cpu */
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+
+ /* Signal the primary CPU that we are done: */
+ atomic_set(&cpu_in_kgdb[cpu], 0);
+ touch_softlockup_watchdog();
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+}
+#endif
+
+/*
+ * Some architectures need cache flushes when we set/clear a
+ * breakpoint:
+ */
+static void kgdb_flush_swbreak_addr(unsigned long addr)
+{
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+ if (current->mm && current->mm->mmap_cache) {
+ flush_cache_range(current->mm->mmap_cache,
+ addr, addr + BREAK_INSTR_SIZE);
+ }
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * SW breakpoint management:
+ */
+static int kgdb_activate_sw_breakpoints(void)
+{
+ unsigned long addr;
+ int error = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_SET)
+ continue;
+
+ addr = kgdb_break[i].bpt_addr;
+ error = kgdb_arch_set_breakpoint(addr,
+ kgdb_break[i].saved_instr);
+ if (error)
+ return error;
+
+ kgdb_flush_swbreak_addr(addr);
+ kgdb_break[i].state = BP_ACTIVE;
+ }
+ return 0;
+}
+
+static int kgdb_set_sw_break(unsigned long addr)
+{
+ int err = kgdb_validate_break_address(addr);
+ int breakno = -1;
+ int i;
+
+ if (err)
+ return err;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return -EEXIST;
+ }
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_REMOVED &&
+ kgdb_break[i].bpt_addr == addr) {
+ breakno = i;
+ break;
+ }
+ }
+
+ if (breakno == -1) {
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_UNDEFINED) {
+ breakno = i;
+ break;
+ }
+ }
+ }
+
+ if (breakno == -1)
+ return -E2BIG;
+
+ kgdb_break[breakno].state = BP_SET;
+ kgdb_break[breakno].type = BP_BREAKPOINT;
+ kgdb_break[breakno].bpt_addr = addr;
+
+ return 0;
+}
+
+static int kgdb_deactivate_sw_breakpoints(void)
+{
+ unsigned long addr;
+ int error = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ continue;
+ addr = kgdb_break[i].bpt_addr;
+ error = kgdb_arch_remove_breakpoint(addr,
+ kgdb_break[i].saved_instr);
+ if (error)
+ return error;
+
+ kgdb_flush_swbreak_addr(addr);
+ kgdb_break[i].state = BP_SET;
+ }
+ return 0;
+}
+
+static int kgdb_remove_sw_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr)) {
+ kgdb_break[i].state = BP_REMOVED;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int kgdb_isremovedbreak(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_REMOVED) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return 1;
+ }
+ return 0;
+}
+
+static int remove_all_break(void)
+{
+ unsigned long addr;
+ int error;
+ int i;
+
+ /* Clear memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ goto setundefined;
+ addr = kgdb_break[i].bpt_addr;
+ error = kgdb_arch_remove_breakpoint(addr,
+ kgdb_break[i].saved_instr);
+ if (error)
+ printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ addr);
+setundefined:
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+
+ /* Clear hardware breakpoints. */
+ if (arch_kgdb_ops.remove_all_hw_break)
+ arch_kgdb_ops.remove_all_hw_break();
+
+ return 0;
+}
+
+/*
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
+ */
+static inline int shadow_pid(int realpid)
+{
+ if (realpid)
+ return realpid;
+
+ return -raw_smp_processor_id() - 2;
+}
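+
+/*
+ * Worked example (illustrative, not part of the original file): a normal
+ * task keeps its PID, so PID 1234 shows up in gdb as thread 1234, while
+ * the idle task (PID 0) running on CPU 1 is reported as thread -3.
+ * getthread() reverses the mapping with kgdb_info[-tid - 2].
+ */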
+
+static char gdbmsgbuf[BUFMAX + 1];
+
+static void kgdb_msg_write(const char *s, int len)
+{
+ char *bufptr;
+ int wcount;
+ int i;
+
+ /* 'O'utput */
+ gdbmsgbuf[0] = 'O';
+
+ /* Fill and send buffers... */
+ while (len > 0) {
+ bufptr = gdbmsgbuf + 1;
+
+ /* Calculate how many this time */
+ if ((len << 1) > (BUFMAX - 2))
+ wcount = (BUFMAX - 2) >> 1;
+ else
+ wcount = len;
+
+ /* Pack in hex chars */
+ for (i = 0; i < wcount; i++)
+ bufptr = pack_hex_byte(bufptr, s[i]);
+ *bufptr = '\0';
+
+ /* Move up */
+ s += wcount;
+ len -= wcount;
+
+ /* Write packet */
+ put_packet(gdbmsgbuf);
+ }
+}
+
+/*
+ * Return true if there is a valid kgdb I/O module. Also if no
+ * debugger is attached a message can be printed to the console about
+ * waiting for the debugger to attach.
+ *
+ * The print_wait argument is only to be true when called from inside
+ * the core kgdb_handle_exception, because it will wait for the
+ * debugger to attach.
+ */
+static int kgdb_io_ready(int print_wait)
+{
+ if (!kgdb_io_ops)
+ return 0;
+ if (kgdb_connected)
+ return 1;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ return 1;
+ if (print_wait)
+ printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+ return 1;
+}
+
+/*
+ * All the functions that start with gdb_cmd are the various
+ * operations to implement the handlers for the gdbserial protocol
+ * where KGDB is communicating with an external debugger
+ */
+
+/* Handle the '?' status packets */
+static void gdb_cmd_status(struct kgdb_state *ks)
+{
+ /*
+ * We know that this packet is only sent
+ * during initial connect. So to be safe,
+ * we clear out our breakpoints now in case
+ * GDB is reconnecting.
+ */
+ remove_all_break();
+
+ remcom_out_buffer[0] = 'S';
+ pack_hex_byte(&remcom_out_buffer[1], ks->signo);
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ void *local_debuggerinfo;
+ int i;
+
+ thread = kgdb_usethread;
+ if (!thread) {
+ thread = kgdb_info[ks->cpu].task;
+ local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
+ } else {
+ local_debuggerinfo = NULL;
+ for_each_online_cpu(i) {
+ /*
+ * Try to find the task on some other
+ * or possibly this node; if we do not
+ * find the matching task then we try
+ * to approximate the results.
+ */
+ if (thread == kgdb_info[i].task)
+ local_debuggerinfo = kgdb_info[i].debuggerinfo;
+ }
+ }
+
+ /*
+ * All threads that don't have debuggerinfo should be
+ * in __schedule() sleeping, since all other CPUs
+ * are in kgdb_wait, and thus have debuggerinfo.
+ */
+ if (local_debuggerinfo) {
+ pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
+ } else {
+ /*
+ * Pull stuff saved during switch_to; nothing
+ * else is accessible (or even particularly
+ * relevant).
+ *
+ * This should be enough for a stack trace.
+ */
+ sleeping_thread_to_gdb_regs(gdb_regs, thread);
+ }
+ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
+}
+
+/* Handle the 'G' set registers request */
+static void gdb_cmd_setregs(struct kgdb_state *ks)
+{
+ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
+
+ if (kgdb_usethread && kgdb_usethread != current) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+ }
+}
+
+/* Handle the 'm' memory read bytes */
+static void gdb_cmd_memread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long length;
+ unsigned long addr;
+ int err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0) {
+ err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ } else {
+ error_packet(remcom_out_buffer, -EINVAL);
+ }
+}
+
+/* Handle the 'M' memory write bytes */
+static void gdb_cmd_memwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(0);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+/* Handle the 'X' memory binary write bytes */
+static void gdb_cmd_binwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(1);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+/* Handle the 'D' or 'k', detach or kill packets */
+static void gdb_cmd_detachkill(struct kgdb_state *ks)
+{
+ int error;
+
+ /* The detach case */
+ if (remcom_in_buffer[0] == 'D') {
+ error = remove_all_break();
+ if (error < 0) {
+ error_packet(remcom_out_buffer, error);
+ } else {
+ strcpy(remcom_out_buffer, "OK");
+ kgdb_connected = 0;
+ }
+ put_packet(remcom_out_buffer);
+ } else {
+ /*
+ * Assume the kill case, with no exit code checking,
+ * trying to force detach the debugger:
+ */
+ remove_all_break();
+ kgdb_connected = 0;
+ }
+}
+
+/* Handle the 'R' reboot packets */
+static int gdb_cmd_reboot(struct kgdb_state *ks)
+{
+ /* For now, only honor R0 */
+ if (strcmp(remcom_in_buffer, "R0") == 0) {
+ printk(KERN_CRIT "Executing emergency reboot\n");
+ strcpy(remcom_out_buffer, "OK");
+ put_packet(remcom_out_buffer);
+
+ /*
+ * Execution should not return from
+ * machine_emergency_restart()
+ */
+ machine_emergency_restart();
+ kgdb_connected = 0;
+
+ return 1;
+ }
+ return 0;
+}
+
+/* Handle the 'q' query packets */
+static void gdb_cmd_query(struct kgdb_state *ks)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned char thref[8];
+ char *ptr;
+ int i;
+ int cpu;
+ int finished = 0;
+
+ switch (remcom_in_buffer[1]) {
+ case 's':
+ case 'f':
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+
+ i = 0;
+ remcom_out_buffer[0] = 'm';
+ ptr = remcom_out_buffer + 1;
+ if (remcom_in_buffer[1] == 'f') {
+ /* Each cpu is a shadow thread */
+ for_each_online_cpu(cpu) {
+ ks->thr_query = 0;
+ int_to_threadref(thref, -cpu - 2);
+ pack_threadid(ptr, thref);
+ ptr += BUF_THREAD_ID_SIZE;
+ *(ptr++) = ',';
+ i++;
+ }
+ }
+
+ do_each_thread(g, p) {
+ if (i >= ks->thr_query && !finished) {
+ int_to_threadref(thref, p->pid);
+ pack_threadid(ptr, thref);
+ ptr += BUF_THREAD_ID_SIZE;
+ *(ptr++) = ',';
+ ks->thr_query++;
+ if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+ finished = 1;
+ }
+ i++;
+ } while_each_thread(g, p);
+
+ *(--ptr) = '\0';
+ break;
+
+ case 'C':
+ /* Current thread id */
+ strcpy(remcom_out_buffer, "QC");
+ ks->threadid = shadow_pid(current->pid);
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(remcom_out_buffer + 2, thref);
+ break;
+ case 'T':
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ ks->threadid = 0;
+ ptr = remcom_in_buffer + 17;
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!getthread(ks->linux_regs, ks->threadid)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ if ((int)ks->threadid > 0) {
+ kgdb_mem2hex(getthread(ks->linux_regs,
+ ks->threadid)->comm,
+ remcom_out_buffer, 16);
+ } else {
+ static char tmpstr[23 + BUF_THREAD_ID_SIZE];
+
+ sprintf(tmpstr, "shadowCPU%d",
+ (int)(-ks->threadid - 2));
+ kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
+ }
+ break;
+ }
+}
+
+/* Handle the 'H' task query packets */
+static void gdb_cmd_task(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ char *ptr;
+
+ switch (remcom_in_buffer[1]) {
+ case 'g':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_usethread = thread;
+ ks->kgdb_usethreadid = ks->threadid;
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ case 'c':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!ks->threadid) {
+ kgdb_contthread = NULL;
+ } else {
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_contthread = thread;
+ }
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ }
+}
+
+/* Handle the 'T' thread query packets */
+static void gdb_cmd_thread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ struct task_struct *thread;
+
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (thread)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, -EINVAL);
+}
+
+/* Handle the 'z' or 'Z' breakpoint remove or set packets */
+static void gdb_cmd_break(struct kgdb_state *ks)
+{
+ /*
+ * Since GDB-5.3, it's been drafted that '0' is a software
+ * breakpoint, '1' is a hardware breakpoint, so let's do that.
+ */
+ char *bpt_type = &remcom_in_buffer[1];
+ char *ptr = &remcom_in_buffer[2];
+ unsigned long addr;
+ unsigned long length;
+ int error = 0;
+
+ if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
+ /* Unsupported */
+ if (*bpt_type > '4')
+ return;
+ } else {
+ if (*bpt_type != '0' && *bpt_type != '1')
+ /* Unsupported. */
+ return;
+ }
+
+ /*
+ * Test if this is a hardware breakpoint, and
+ * if we support it:
+ */
+ if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
+ /* Unsupported. */
+ return;
+
+ if (*(ptr++) != ',') {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (!kgdb_hex2long(&ptr, &addr)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (*(ptr++) != ',' ||
+ !kgdb_hex2long(&ptr, &length)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+
+ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
+ error = kgdb_set_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
+ error = kgdb_remove_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'Z')
+ error = arch_kgdb_ops.set_hw_breakpoint(addr,
+ (int)length, *bpt_type - '0');
+ else if (remcom_in_buffer[0] == 'z')
+ error = arch_kgdb_ops.remove_hw_breakpoint(addr,
+ (int) length, *bpt_type - '0');
+
+ if (error == 0)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, error);
+}
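+
+/*
+ * Illustrative packets (not part of the original file, addresses made up):
+ * "Z0,c01a2b3c,1" asks for a software breakpoint at 0xc01a2b3c and is
+ * answered with "OK" or "E<nn>"; "z0,c01a2b3c,1" removes it again.
+ * Type '1' requests a hardware breakpoint and is only honoured when
+ * arch_kgdb_ops advertises KGDB_HW_BREAKPOINT.
+ */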
+
+/* Handle the 'C' signal / exception passing packets */
+static int gdb_cmd_exception_pass(struct kgdb_state *ks)
+{
+ /* C09 == pass exception
+ * C15 == detach kgdb, pass exception
+ */
+ if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'c';
+
+ } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'D';
+ remove_all_break();
+ kgdb_connected = 0;
+ return 1;
+
+ } else {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return 0;
+ }
+
+ /* Indicate fall through */
+ return -1;
+}
+
+/*
+ * This function performs all gdbserial command processing
+ */
+static int gdb_serial_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ int tmp;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ if (kgdb_connected) {
+ unsigned char thref[8];
+ char *ptr;
+
+ /* Reply to host that an exception has occurred */
+ ptr = remcom_out_buffer;
+ *ptr++ = 'T';
+ ptr = pack_hex_byte(ptr, ks->signo);
+ ptr += strlen(strcpy(ptr, "thread:"));
+ int_to_threadref(thref, shadow_pid(current->pid));
+ ptr = pack_threadid(ptr, thref);
+ *ptr++ = ';';
+ put_packet(remcom_out_buffer);
+ }
+
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
+
+ while (1) {
+ error = 0;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ get_packet(remcom_in_buffer);
+
+ switch (remcom_in_buffer[0]) {
+ case '?': /* gdbserial status */
+ gdb_cmd_status(ks);
+ break;
+ case 'g': /* return the value of the CPU registers */
+ gdb_cmd_getregs(ks);
+ break;
+ case 'G': /* set the value of the CPU registers - return OK */
+ gdb_cmd_setregs(ks);
+ break;
+ case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
+ gdb_cmd_memread(ks);
+ break;
+ case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_memwrite(ks);
+ break;
+ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_binwrite(ks);
+ break;
+ /* kill or detach. KGDB should treat this like a
+ * continue.
+ */
+ case 'D': /* Debugger detach */
+ case 'k': /* Debugger detach via kill */
+ gdb_cmd_detachkill(ks);
+ goto default_handle;
+ case 'R': /* Reboot */
+ if (gdb_cmd_reboot(ks))
+ goto default_handle;
+ break;
+ case 'q': /* query command */
+ gdb_cmd_query(ks);
+ break;
+ case 'H': /* task related */
+ gdb_cmd_task(ks);
+ break;
+ case 'T': /* Query thread status */
+ gdb_cmd_thread(ks);
+ break;
+ case 'z': /* Break point remove */
+ case 'Z': /* Break point set */
+ gdb_cmd_break(ks);
+ break;
+ case 'C': /* Exception passing */
+ tmp = gdb_cmd_exception_pass(ks);
+ if (tmp > 0)
+ goto default_handle;
+ if (tmp == 0)
+ break;
+ /* Fall through on tmp < 0 */
+ case 'c': /* Continue packet */
+ case 's': /* Single step packet */
+ if (kgdb_contthread && kgdb_contthread != current) {
+ /* Can't switch threads in kgdb */
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_activate_sw_breakpoints();
+ /* Fall through to default processing */
+ default:
+default_handle:
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ /*
+ * Leave cmd processing on error, detach,
+ * kill, continue, or single step.
+ */
+ if (error >= 0 || remcom_in_buffer[0] == 'D' ||
+ remcom_in_buffer[0] == 'k') {
+ error = 0;
+ goto kgdb_exit;
+ }
+
+ }
+
+ /* reply to the request */
+ put_packet(remcom_out_buffer);
+ }
+
+kgdb_exit:
+ if (ks->pass_exception)
+ error = 1;
+ return error;
+}
+
+static int kgdb_reenter_check(struct kgdb_state *ks)
+{
+ unsigned long addr;
+
+ if (atomic_read(&kgdb_active) != raw_smp_processor_id())
+ return 0;
+
+ /* Panic on recursive debugger calls: */
+ exception_level++;
+ addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ kgdb_deactivate_sw_breakpoints();
+
+ /*
+ * If the breakpoint was removed successfully at the place the
+ * exception occurred, try to recover and print a warning to the
+ * end user, because the user planted a breakpoint in a place
+ * that KGDB needs in order to function.
+ */
+ if (kgdb_remove_sw_break(addr) == 0) {
+ exception_level = 0;
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+ kgdb_activate_sw_breakpoints();
+ printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
+ addr);
+ WARN_ON_ONCE(1);
+
+ return 1;
+ }
+ remove_all_break();
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+
+ if (exception_level > 1) {
+ dump_stack();
+ panic("Recursive entry to debugger");
+ }
+
+ printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+ dump_stack();
+ panic("Recursive entry to debugger");
+
+ return 1;
+}
+
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Locking hierarchy:
+ * interface locks, if any (begin_session)
+ * kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+ unsigned long flags;
+ int error = 0;
+ int i, cpu;
+
+ ks->cpu = raw_smp_processor_id();
+ ks->ex_vector = evector;
+ ks->signo = signo;
+ ks->ex_vector = evector;
+ ks->err_code = ecode;
+ ks->kgdb_usethreadid = 0;
+ ks->linux_regs = regs;
+
+ if (kgdb_reenter_check(ks))
+ return 0; /* Ouch, double exception ! */
+
+acquirelock:
+ /*
+ * Interrupts will be restored by the 'trap return' code, except when
+ * single stepping.
+ */
+ local_irq_save(flags);
+
+ cpu = raw_smp_processor_id();
+
+ /*
+ * Acquire the kgdb_active lock:
+ */
+ while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
+ cpu_relax();
+
+ /*
+ * Do not start the debugger connection on this CPU if the last
+ * instance of the exception handler wanted to come into the
+ * debugger on a different CPU via a single step
+ */
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
+ atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
+
+ atomic_set(&kgdb_active, -1);
+ touch_softlockup_watchdog();
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+
+ goto acquirelock;
+ }
+
+ if (!kgdb_io_ready(1)) {
+ error = 1;
+ goto kgdb_restore; /* No I/O connection, so resume the system */
+ }
+
+ /*
+ * Don't enter if we have hit a removed breakpoint.
+ */
+ if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
+ goto kgdb_restore;
+
+ /* Call the I/O driver's pre_exception routine */
+ if (kgdb_io_ops->pre_exception)
+ kgdb_io_ops->pre_exception();
+
+ kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
+ kgdb_info[ks->cpu].task = current;
+
+ kgdb_disable_hw_debug(ks->linux_regs);
+
+ /*
+ * Get the passive CPU lock which will hold all the non-primary
+ * CPU in a spin state while the debugger is active
+ */
+ if (!kgdb_single_step) {
+ for (i = 0; i < NR_CPUS; i++)
+ atomic_set(&passive_cpu_wait[i], 1);
+ }
+
+ /*
+ * spin_lock code is good enough as a barrier so we don't
+ * need one here:
+ */
+ atomic_set(&cpu_in_kgdb[ks->cpu], 1);
+
+#ifdef CONFIG_SMP
+ /* Signal the other CPUs to enter kgdb_wait() */
+ if ((!kgdb_single_step) && kgdb_do_roundup)
+ kgdb_roundup_cpus(flags);
+#endif
+
+ /*
+ * Wait for the other CPUs to be notified and be waiting for us:
+ */
+ for_each_online_cpu(i) {
+ while (!atomic_read(&cpu_in_kgdb[i]))
+ cpu_relax();
+ }
+
+ /*
+ * At this point the primary processor is completely
+ * in the debugger and all secondary CPUs are quiescent
+ */
+ kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
+ kgdb_deactivate_sw_breakpoints();
+ kgdb_single_step = 0;
+ kgdb_contthread = current;
+ exception_level = 0;
+
+ /* Talk to debugger with gdbserial protocol */
+ error = gdb_serial_stub(ks);
+
+ /* Call the I/O driver's post_exception routine */
+ if (kgdb_io_ops->post_exception)
+ kgdb_io_ops->post_exception();
+
+ kgdb_info[ks->cpu].debuggerinfo = NULL;
+ kgdb_info[ks->cpu].task = NULL;
+ atomic_set(&cpu_in_kgdb[ks->cpu], 0);
+
+ if (!kgdb_single_step) {
+ for (i = NR_CPUS-1; i >= 0; i--)
+ atomic_set(&passive_cpu_wait[i], 0);
+ /*
+ * Wait till all the CPUs have quit
+ * from the debugger.
+ */
+ for_each_online_cpu(i) {
+ while (atomic_read(&cpu_in_kgdb[i]))
+ cpu_relax();
+ }
+ }
+
+kgdb_restore:
+ /* Free kgdb_active */
+ atomic_set(&kgdb_active, -1);
+ touch_softlockup_watchdog();
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+
+ return error;
+}
+
+int kgdb_nmicallback(int cpu, void *regs)
+{
+#ifdef CONFIG_SMP
+ if (!atomic_read(&cpu_in_kgdb[cpu]) &&
+ atomic_read(&kgdb_active) != cpu &&
+ atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
+ kgdb_wait((struct pt_regs *)regs);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+static void kgdb_console_write(struct console *co, const char *s,
+ unsigned count)
+{
+ unsigned long flags;
+
+ /* If we're debugging, or KGDB has not connected, don't try
+ * and print. */
+ if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
+ return;
+
+ local_irq_save(flags);
+ kgdb_msg_write(s, count);
+ local_irq_restore(flags);
+}
+
+static struct console kgdbcons = {
+ .name = "kgdb",
+ .write = kgdb_console_write,
+ .flags = CON_PRINTBUFFER | CON_ENABLED,
+ .index = -1,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void sysrq_handle_gdb(int key, struct tty_struct *tty)
+{
+ if (!kgdb_io_ops) {
+ printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ return;
+ }
+ if (!kgdb_connected)
+ printk(KERN_CRIT "Entering KGDB\n");
+
+ kgdb_breakpoint();
+}
+
+static struct sysrq_key_op sysrq_gdb_op = {
+ .handler = sysrq_handle_gdb,
+ .help_msg = "Gdb",
+ .action_msg = "GDB",
+};
+#endif
+
+static void kgdb_register_callbacks(void)
+{
+ if (!kgdb_io_module_registered) {
+ kgdb_io_module_registered = 1;
+ kgdb_arch_init();
+#ifdef CONFIG_MAGIC_SYSRQ
+ register_sysrq_key('g', &sysrq_gdb_op);
+#endif
+ if (kgdb_use_con && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+ }
+}
+
+static void kgdb_unregister_callbacks(void)
+{
+ /*
+ * When this routine is called KGDB should unregister from the
+ * panic handler and clean up, making sure it is not handling any
+ * break exceptions at the time.
+ */
+ if (kgdb_io_module_registered) {
+ kgdb_io_module_registered = 0;
+ kgdb_arch_exit();
+#ifdef CONFIG_MAGIC_SYSRQ
+ unregister_sysrq_key('g', &sysrq_gdb_op);
+#endif
+ if (kgdb_con_registered) {
+ unregister_console(&kgdbcons);
+ kgdb_con_registered = 0;
+ }
+ }
+}
+
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
+/**
+ * kgdb_register_io_module - register KGDB IO module
+ * @new_kgdb_io_ops: the io ops vector
+ *
+ * Register it with the KGDB core.
+ */
+int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
+{
+ int err;
+
+ spin_lock(&kgdb_registration_lock);
+
+ if (kgdb_io_ops) {
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_ERR "kgdb: Another I/O driver is already "
+ "registered with KGDB.\n");
+ return -EBUSY;
+ }
+
+ if (new_kgdb_io_ops->init) {
+ err = new_kgdb_io_ops->init();
+ if (err) {
+ spin_unlock(&kgdb_registration_lock);
+ return err;
+ }
+ }
+
+ kgdb_io_ops = new_kgdb_io_ops;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
+ new_kgdb_io_ops->name);
+
+ /* Arm KGDB now. */
+ kgdb_register_callbacks();
+
+ if (kgdb_break_asap)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kgdb_register_io_module);
+
+/**
+ * kgdb_unregister_io_module - unregister KGDB IO module
+ * @old_kgdb_io_ops: the io ops vector
+ *
+ * Unregister it from the KGDB core.
+ */
+void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
+{
+ BUG_ON(kgdb_connected);
+
+ /*
+ * KGDB is no longer able to communicate out, so
+ * unregister our callbacks and reset state.
+ */
+ kgdb_unregister_callbacks();
+
+ spin_lock(&kgdb_registration_lock);
+
+ WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
+ kgdb_io_ops = NULL;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ printk(KERN_INFO
+ "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ old_kgdb_io_ops->name);
+}
+EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
+
+/**
+ * kgdb_breakpoint - generate breakpoint exception
+ *
+ * This function will generate a breakpoint exception. It is used at the
+ * beginning of a program to sync up with a debugger and can be used
+ * otherwise as a quick means to stop program execution and "break" into
+ * the debugger.
+ */
+void kgdb_breakpoint(void)
+{
+ atomic_set(&kgdb_setting_breakpoint, 1);
+ wmb(); /* Sync point before breakpoint */
+ arch_kgdb_breakpoint();
+ wmb(); /* Sync point after breakpoint */
+ atomic_set(&kgdb_setting_breakpoint, 0);
+}
+EXPORT_SYMBOL_GPL(kgdb_breakpoint);
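+
+/*
+ * Usage sketch (illustrative, not part of the original file): once an I/O
+ * driver such as kgdboc has registered, any kernel code may call
+ * kgdb_breakpoint() to trap into kgdb_handle_exception() and wait for a
+ * remote gdb, e.g.
+ *
+ *	if (suspicious_condition)
+ *		kgdb_breakpoint();
+ */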
+
+static int __init opt_kgdb_wait(char *str)
+{
+ kgdb_break_asap = 1;
+
+ if (kgdb_io_module_registered)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+
+early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
new file mode 100644
index 0000000..3d3c3ea
--- /dev/null
+++ b/kernel/kmod.c
@@ -0,0 +1,510 @@
+/*
+ kmod, the new module loader (replaces kerneld)
+ Kirk Petersen
+
+ Reorganized not to be a daemon by Adam Richter, with guidance
+ from Greg Zornetzer.
+
+ Modified to avoid chroot and file sharing problems.
+ Mikael Pettersson
+
+ Limit the concurrent number of kmod modprobes to catch loops from
+ "modprobe needs a service that is in a module".
+ Keith Owens <kaos@ocs.com.au> December 1999
+
+ Unblock all signals when we exec a usermode process.
+ Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
+
+ call_usermodehelper wait flag, and remove exec_usermodehelper.
+ Rusty Russell <rusty@rustcorp.com.au> Jan 2003
+*/
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/mnt_namespace.h>
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <asm/uaccess.h>
+
+extern int max_threads;
+
+static struct workqueue_struct *khelper_wq;
+
+#ifdef CONFIG_MODULES
+
+/*
+ modprobe_path is set via /proc/sys.
+*/
+char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+
+/**
+ * request_module - try to load a kernel module
+ * @fmt: printf style format string for the name of the module
+ * @varargs: arguments as specified in the format string
+ *
+ * Load a module using the user mode module loader. The function returns
+ * zero on success or a negative errno code on failure. Note that a
+ * successful module load does not mean the module did not then unload
+ * and exit on an error of its own. Callers must check that the service
+ * they requested is now available, not blindly invoke it.
+ *
+ * If module auto-loading support is disabled then this function
+ * becomes a no-operation.
+ */
+int request_module(const char *fmt, ...)
+{
+ va_list args;
+ char module_name[MODULE_NAME_LEN];
+ unsigned int max_modprobes;
+ int ret;
+ char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL };
+ static atomic_t kmod_concurrent = ATOMIC_INIT(0);
+#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
+ static int kmod_loop_msg;
+
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+ if (ret >= MODULE_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ /* If modprobe needs a service that is in a module, we get a recursive
+ * loop. Limit the number of running kmod threads to max_threads/2 or
+ * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
+ * would be to run the parents of this process, counting how many times
+ * kmod was invoked. That would mean accessing the internals of the
+ * process tables to get the command line, proc_pid_cmdline is static
+ * and it is not worth changing the proc code just to handle this case.
+ * KAO.
+ *
+ * "trace the ppid" is simple, but will fail if someone's
+ * parent exits. I think this is as good as it gets. --RR
+ */
+ max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
+ atomic_inc(&kmod_concurrent);
+ if (atomic_read(&kmod_concurrent) > max_modprobes) {
+ /* We may be blaming an innocent here, but unlikely */
+ if (kmod_loop_msg++ < 5)
+ printk(KERN_ERR
+ "request_module: runaway loop modprobe %s\n",
+ module_name);
+ atomic_dec(&kmod_concurrent);
+ return -ENOMEM;
+ }
+
+ ret = call_usermodehelper(modprobe_path, argv, envp, 1);
+ atomic_dec(&kmod_concurrent);
+ return ret;
+}
+EXPORT_SYMBOL(request_module);
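+
+/*
+ * Usage sketch (illustrative, not part of the original file): callers build
+ * the module name or alias with a printf-style format and must re-check
+ * afterwards that the requested facility actually appeared, e.g.
+ *
+ *	request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev));
+ *
+ * followed by a fresh lookup of the character device driver.
+ */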
+#endif /* CONFIG_MODULES */
+
+struct subprocess_info {
+ struct work_struct work;
+ struct completion *complete;
+ char *path;
+ char **argv;
+ char **envp;
+ struct key *ring;
+ enum umh_wait wait;
+ int retval;
+ struct file *stdin;
+ void (*cleanup)(char **argv, char **envp);
+};
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int ____call_usermodehelper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ struct key *new_session, *old_session;
+ int retval;
+
+ /* Unblock all signals and set the session keyring. */
+ new_session = key_get(sub_info->ring);
+ spin_lock_irq(&current->sighand->siglock);
+ old_session = __install_session_keyring(current, new_session);
+ flush_signal_handlers(current, 1);
+ sigemptyset(&current->blocked);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ key_put(old_session);
+
+ /* Install input pipe when needed */
+ if (sub_info->stdin) {
+ struct files_struct *f = current->files;
+ struct fdtable *fdt;
+ /* no races because files should be private here */
+ sys_close(0);
+ fd_install(0, sub_info->stdin);
+ spin_lock(&f->file_lock);
+ fdt = files_fdtable(f);
+ FD_SET(0, fdt->open_fds);
+ FD_CLR(0, fdt->close_on_exec);
+ spin_unlock(&f->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
+ }
+
+ /* We can run anywhere, unlike our parent keventd(). */
+ set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+
+ /*
+ * Our parent is keventd, which runs with elevated scheduling priority.
+ * Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
+ retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
+
+ /* Exec failed? */
+ sub_info->retval = retval;
+ do_exit(0);
+}
+
+void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info->argv, info->envp);
+ kfree(info);
+}
+EXPORT_SYMBOL(call_usermodehelper_freeinfo);
+
+/* Keventd can't block, but this (a child) can. */
+static int wait_for_helper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ pid_t pid;
+
+ /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
+ * populate the status, but will return -ECHILD. */
+ allow_signal(SIGCHLD);
+
+ pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ int ret;
+
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But wait_for_helper() always runs as keventd, and put_user()
+ * to a kernel address works OK for kernel threads, due to their
+ * having an mm_segment_t which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either ____call_usermodehelper failed and the
+ * real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
+ }
+
+ if (sub_info->wait == UMH_NO_WAIT)
+ call_usermodehelper_freeinfo(sub_info);
+ else
+ complete(sub_info->complete);
+ return 0;
+}
+
+/* This is run by khelper thread */
+static void __call_usermodehelper(struct work_struct *work)
+{
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
+ pid_t pid;
+ enum umh_wait wait = sub_info->wait;
+
+ /* CLONE_VFORK: wait until the usermode helper has execve'd
+ * successfully. We need the data structures to stay around
+ * until that is done. */
+ if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
+ pid = kernel_thread(wait_for_helper, sub_info,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ CLONE_VFORK | SIGCHLD);
+
+ switch (wait) {
+ case UMH_NO_WAIT:
+ break;
+
+ case UMH_WAIT_PROC:
+ if (pid > 0)
+ break;
+ sub_info->retval = pid;
+ /* FALLTHROUGH */
+
+ case UMH_WAIT_EXEC:
+ complete(sub_info->complete);
+ }
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ */
+static int usermodehelper_disabled;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_pm_callback() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_pm_callback() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+
+/**
+ * usermodehelper_disable - prevent new helpers from being started
+ */
+int usermodehelper_disable(void)
+{
+ long retval;
+
+ usermodehelper_disabled = 1;
+ smp_mb();
+ /*
+ * From now on call_usermodehelper_exec() won't start any new
+ * helpers, so it is sufficient if running_helpers turns out to
+ * be zero at one point (it may be increased later, but that
+ * doesn't matter).
+ */
+ retval = wait_event_timeout(running_helpers_waitq,
+ atomic_read(&running_helpers) == 0,
+ RUNNING_HELPERS_TIMEOUT);
+ if (retval)
+ return 0;
+
+ usermodehelper_disabled = 0;
+ return -EAGAIN;
+}
+
+/**
+ * usermodehelper_enable - allow new helpers to be started again
+ */
+void usermodehelper_enable(void)
+{
+ usermodehelper_disabled = 0;
+}
+
+static void helper_lock(void)
+{
+ atomic_inc(&running_helpers);
+ smp_mb__after_atomic_inc();
+}
+
+static void helper_unlock(void)
+{
+ if (atomic_dec_and_test(&running_helpers))
+ wake_up(&running_helpers_waitq);
+}
+#else /* CONFIG_PM_SLEEP */
+#define usermodehelper_disabled 0
+
+static inline void helper_lock(void) {}
+static inline void helper_unlock(void) {}
+#endif /* CONFIG_PM_SLEEP */
+
+/**
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure. This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ */
+struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+ char **envp, gfp_t gfp_mask)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, __call_usermodehelper);
+ sub_info->path = path;
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_setkeys - set the session keys for usermode helper
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @session_keyring: the session keyring for the process
+ */
+void call_usermodehelper_setkeys(struct subprocess_info *info,
+ struct key *session_keyring)
+{
+ info->ring = session_keyring;
+}
+EXPORT_SYMBOL(call_usermodehelper_setkeys);
+
+/**
+ * call_usermodehelper_setcleanup - set a cleanup function
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @cleanup: a cleanup function
+ *
+ * The cleanup function is called just before the subprocess_info is
+ * about to be freed. It can be used for freeing the argv and envp. The
+ * function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+void call_usermodehelper_setcleanup(struct subprocess_info *info,
+ void (*cleanup)(char **argv, char **envp))
+{
+ info->cleanup = cleanup;
+}
+EXPORT_SYMBOL(call_usermodehelper_setcleanup);
+
+/**
+ * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
+ * @sub_info: a subprocess_info returned by call_usermodehelper_setup
+ * @filp: set to the write-end of a pipe
+ *
+ * This constructs a pipe, and sets the read end to be the stdin of the
+ * subprocess, and returns the write-end in *@filp.
+ */
+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
+ struct file **filp)
+{
+ struct file *f;
+
+ f = create_write_pipe(0);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ *filp = f;
+
+ f = create_read_pipe(f, 0);
+ if (IS_ERR(f)) {
+ free_write_pipe(*filp);
+ return PTR_ERR(f);
+ }
+ sub_info->stdin = f;
+
+ return 0;
+}
+EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocess
+ * @wait: wait for the application to finish and return status.
+ * When %UMH_NO_WAIT, don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'd. This makes it safe to call
+ * from interrupt context.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of keventd
+ * (i.e. it runs with full root capabilities).
+ */
+int call_usermodehelper_exec(struct subprocess_info *sub_info,
+ enum umh_wait wait)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ int retval = 0;
+
+ helper_lock();
+ if (sub_info->path[0] == '\0')
+ goto out;
+
+ if (!khelper_wq || usermodehelper_disabled) {
+ retval = -EBUSY;
+ goto out;
+ }
+
+ sub_info->complete = &done;
+ sub_info->wait = wait;
+
+ queue_work(khelper_wq, &sub_info->work);
+ if (wait == UMH_NO_WAIT) /* task has freed sub_info */
+ goto unlock;
+ wait_for_completion(&done);
+ retval = sub_info->retval;
+
+out:
+ call_usermodehelper_freeinfo(sub_info);
+unlock:
+ helper_unlock();
+ return retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_exec);
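+
+/*
+ * Illustrative sketch (not part of the original file): a typical caller
+ * builds the request with call_usermodehelper_setup() and hands it to
+ * call_usermodehelper_exec(); the call_usermodehelper() wrapper in
+ * <linux/kmod.h> combines the two steps. The helper path and arguments
+ * below are hypothetical.
+ *
+ *	static int run_example_helper(void)
+ *	{
+ *		char *argv[] = { "/sbin/example-helper", "start", NULL };
+ *		char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
+ *		struct subprocess_info *info;
+ *
+ *		info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
+ *		if (!info)
+ *			return -ENOMEM;
+ *		return call_usermodehelper_exec(info, UMH_WAIT_PROC);
+ *	}
+ */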
+
+/**
+ * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @filp: set to the write-end of a pipe
+ *
+ * This is a simple wrapper which executes a usermode-helper function
+ * with a pipe as stdin. It is implemented entirely in terms of
+ * lower-level call_usermodehelper_* functions.
+ */
+int call_usermodehelper_pipe(char *path, char **argv, char **envp,
+ struct file **filp)
+{
+ struct subprocess_info *sub_info;
+ int ret;
+
+ sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
+ if (sub_info == NULL)
+ return -ENOMEM;
+
+ ret = call_usermodehelper_stdinpipe(sub_info, filp);
+ if (ret < 0)
+ goto out;
+
+ return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+
+ out:
+ call_usermodehelper_freeinfo(sub_info);
+ return ret;
+}
+EXPORT_SYMBOL(call_usermodehelper_pipe);
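+
+/*
+ * Illustrative sketch (not part of the original file): the caller gets
+ * the write end of the helper's stdin pipe back in *filp and can feed
+ * it data before closing it. The helper path is hypothetical and error
+ * handling of the write is elided; writing a kernel buffer through
+ * f_op->write needs the usual set_fs(KERNEL_DS) dance on this kernel.
+ *
+ *	static int feed_example_helper(const char *buf, size_t len)
+ *	{
+ *		char *argv[] = { "/sbin/example-consumer", NULL };
+ *		char *envp[] = { "PATH=/sbin:/bin", NULL };
+ *		mm_segment_t old_fs;
+ *		struct file *f;
+ *		int ret;
+ *
+ *		ret = call_usermodehelper_pipe(argv[0], argv, envp, &f);
+ *		if (ret)
+ *			return ret;
+ *		old_fs = get_fs();
+ *		set_fs(KERNEL_DS);
+ *		f->f_op->write(f, buf, len, &f->f_pos);
+ *		set_fs(old_fs);
+ *		filp_close(f, NULL);
+ *		return 0;
+ *	}
+ */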
+
+void __init usermodehelper_init(void)
+{
+ khelper_wq = create_singlethread_workqueue("khelper");
+ BUG_ON(!khelper_wq);
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
new file mode 100644
index 0000000..9f8a3f2
--- /dev/null
+++ b/kernel/kprobes.c
@@ -0,0 +1,1355 @@
+/*
+ * Kernel Probes (KProbes)
+ * kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation (includes suggestions from
+ * Rusty Russell).
+ * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
+ * hlists and exceptions notifier as suggested by Andi Kleen.
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
+ * exceptions notifier to be first on the priority list.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ */
+#include <linux/kprobes.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/module.h>
+#include <linux/moduleloader.h>
+#include <linux/kallsyms.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kdebug.h>
+
+#include <asm-generic/sections.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+
+/*
+ * Some oddball architectures like 64-bit powerpc have function descriptors,
+ * so this must be overridable.
+ */
+#ifndef kprobe_lookup_name
+#define kprobe_lookup_name(name, addr) \
+ addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
+#endif
+
+static int kprobes_initialized;
+static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
+
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobe_enabled;
+
+DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+ spinlock_t lock ____cacheline_aligned_in_smp;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+
+static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+ return &(kretprobe_table_locks[hash].lock);
+}
+
+/*
+ * Normally, functions that we'd want to prohibit kprobes in are marked
+ * __kprobes. But there are cases where such functions already belong to
+ * a different section (e.g. __sched for preempt_schedule).
+ *
+ * For such cases, we now have a blacklist
+ */
+static struct kprobe_blackpoint kprobe_blacklist[] = {
+ {"preempt_schedule",},
+ {NULL} /* Terminator */
+};
+
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+/*
+ * kprobe->ainsn.insn points to the copy of the instruction to be
+ * single-stepped. x86_64, POWER4 and above have no-exec support and
+ * stepping on the instruction on a vmalloced/kmalloced/data page
+ * is a recipe for disaster
+ */
+#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+
+struct kprobe_insn_page {
+ struct hlist_node hlist;
+ kprobe_opcode_t *insns; /* Page of instruction slots */
+ char slot_used[INSNS_PER_PAGE];
+ int nused;
+ int ngarbage;
+};
+
+enum kprobe_slot_state {
+ SLOT_CLEAN = 0,
+ SLOT_DIRTY = 1,
+ SLOT_USED = 2,
+};
+
+static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+ int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+ ret = freeze_processes();
+ if (ret == 0) {
+ struct task_struct *p, *q;
+ do_each_thread(p, q) {
+ if (p != current && p->state == TASK_RUNNING &&
+ p->pid != 0) {
+ printk("Check failed: %s is running\n",p->comm);
+ ret = -1;
+ goto loop_end;
+ }
+ } while_each_thread(p, q);
+ }
+loop_end:
+ thaw_processes();
+#else
+ synchronize_sched();
+#endif
+ return ret;
+}
+
+/**
+ * get_insn_slot() - Find a slot on an executable page for an instruction.
+ * We allocate an executable page if there's no room on existing ones.
+ */
+kprobe_opcode_t __kprobes *get_insn_slot(void)
+{
+ struct kprobe_insn_page *kip;
+ struct hlist_node *pos;
+
+ retry:
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ if (kip->nused < INSNS_PER_PAGE) {
+ int i;
+ for (i = 0; i < INSNS_PER_PAGE; i++) {
+ if (kip->slot_used[i] == SLOT_CLEAN) {
+ kip->slot_used[i] = SLOT_USED;
+ kip->nused++;
+ return kip->insns + (i * MAX_INSN_SIZE);
+ }
+ }
+ /* Surprise! No unused slots. Fix kip->nused. */
+ kip->nused = INSNS_PER_PAGE;
+ }
+ }
+
+ /* If there are any garbage slots, collect them and try again. */
+ if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+ goto retry;
+ }
+ /* All out of space. Need to allocate a new page. Use slot 0. */
+ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
+ if (!kip)
+ return NULL;
+
+ /*
+ * Use module_alloc so this page is within +/- 2GB of where the
+ * kernel image and loaded module images reside. This is required
+ * so x86_64 can correctly handle the %rip-relative fixups.
+ */
+ kip->insns = module_alloc(PAGE_SIZE);
+ if (!kip->insns) {
+ kfree(kip);
+ return NULL;
+ }
+ INIT_HLIST_NODE(&kip->hlist);
+ hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+ memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
+ kip->slot_used[0] = SLOT_USED;
+ kip->nused = 1;
+ kip->ngarbage = 0;
+ return kip->insns;
+}
+
+/* Return 1 if all garbage is collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+ kip->slot_used[idx] = SLOT_CLEAN;
+ kip->nused--;
+ if (kip->nused == 0) {
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ hlist_del(&kip->hlist);
+ if (hlist_empty(&kprobe_insn_pages)) {
+ INIT_HLIST_NODE(&kip->hlist);
+ hlist_add_head(&kip->hlist,
+ &kprobe_insn_pages);
+ } else {
+ module_free(NULL, kip->insns);
+ kfree(kip);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+ struct kprobe_insn_page *kip;
+ struct hlist_node *pos, *next;
+
+ /* Ensure no one is preempted on the garbage slots */
+ if (check_safety() != 0)
+ return -EAGAIN;
+
+ hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
+ int i;
+ if (kip->ngarbage == 0)
+ continue;
+ kip->ngarbage = 0; /* we will collect all garbage slots */
+ for (i = 0; i < INSNS_PER_PAGE; i++) {
+ if (kip->slot_used[i] == SLOT_DIRTY &&
+ collect_one_slot(kip, i))
+ break;
+ }
+ }
+ kprobe_garbage_slots = 0;
+ return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
+{
+ struct kprobe_insn_page *kip;
+ struct hlist_node *pos;
+
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ if (kip->insns <= slot &&
+ slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
+ int i = (slot - kip->insns) / MAX_INSN_SIZE;
+ if (dirty) {
+ kip->slot_used[i] = SLOT_DIRTY;
+ kip->ngarbage++;
+ } else {
+ collect_one_slot(kip, i);
+ }
+ break;
+ }
+ }
+
+ if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
+ collect_garbage_slots();
+}
+#endif
+
+/* We have preemption disabled, so it is safe to use the __ versions */
+static inline void set_kprobe_instance(struct kprobe *kp)
+{
+ __get_cpu_var(kprobe_instance) = kp;
+}
+
+static inline void reset_kprobe_instance(void)
+{
+ __get_cpu_var(kprobe_instance) = NULL;
+}
+
+/*
+ * This routine is called either:
+ * - under the kprobe_mutex - during kprobe_[un]register()
+ * OR
+ * - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ */
+struct kprobe __kprobes *get_kprobe(void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+/*
+ * Aggregate handlers for multiple kprobes support - these handlers
+ * take care of invoking the individual kprobe handlers on p->list
+ */
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (kp->pre_handler) {
+ set_kprobe_instance(kp);
+ if (kp->pre_handler(kp, regs))
+ return 1;
+ }
+ reset_kprobe_instance();
+ }
+ return 0;
+}
+
+static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (kp->post_handler) {
+ set_kprobe_instance(kp);
+ kp->post_handler(kp, regs, flags);
+ reset_kprobe_instance();
+ }
+ }
+}
+
+static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
+{
+ struct kprobe *cur = __get_cpu_var(kprobe_instance);
+
+ /*
+ * if we faulted "during" the execution of a user specified
+ * probe handler, invoke just that probe's fault handler
+ */
+ if (cur && cur->fault_handler) {
+ if (cur->fault_handler(cur, regs, trapnr))
+ return 1;
+ }
+ return 0;
+}
+
+static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *cur = __get_cpu_var(kprobe_instance);
+ int ret = 0;
+
+ if (cur && cur->break_handler) {
+ if (cur->break_handler(cur, regs))
+ ret = 1;
+ }
+ reset_kprobe_instance();
+ return ret;
+}
+
+/* Walks the list and increments nmissed count for multiprobe case */
+void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
+{
+ struct kprobe *kp;
+ if (p->pre_handler != aggr_pre_handler) {
+ p->nmissed++;
+ } else {
+ list_for_each_entry_rcu(kp, &p->list, list)
+ kp->nmissed++;
+ }
+ return;
+}
+
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
+ struct hlist_head *head)
+{
+ struct kretprobe *rp = ri->rp;
+
+ /* remove rp inst off the rprobe_inst_table */
+ hlist_del(&ri->hlist);
+ INIT_HLIST_NODE(&ri->hlist);
+ if (likely(rp)) {
+ spin_lock(&rp->lock);
+ hlist_add_head(&ri->hlist, &rp->free_instances);
+ spin_unlock(&rp->lock);
+ } else
+ /* Unregistering */
+ hlist_add_head(&ri->hlist, head);
+}
+
+void kretprobe_hash_lock(struct task_struct *tsk,
+ struct hlist_head **head, unsigned long *flags)
+{
+ unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+ spinlock_t *hlist_lock;
+
+ *head = &kretprobe_inst_table[hash];
+ hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_lock_irqsave(hlist_lock, *flags);
+}
+
+static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
+{
+ spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+{
+ unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+ spinlock_t *hlist_lock;
+
+ hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_unlock_irqrestore(hlist_lock, *flags);
+}
+
+void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+{
+ spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_unlock_irqrestore(hlist_lock, *flags);
+}
+
+/*
+ * This function is called from finish_task_switch when task tk becomes dead,
+ * so that we can recycle any function-return probe instances associated
+ * with this task. These left over instances represent probed functions
+ * that have been called but will never return.
+ */
+void __kprobes kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *node, *tmp;
+ unsigned long hash, flags = 0;
+
+ if (unlikely(!kprobes_initialized))
+ /* Early boot. kretprobe_table_locks not yet initialized. */
+ return;
+
+ hash = hash_ptr(tk, KPROBE_HASH_BITS);
+ head = &kretprobe_inst_table[hash];
+ INIT_HLIST_HEAD(&empty_rp); /* may be populated by recycle_rp_inst() below */
+ kretprobe_table_lock(hash, &flags);
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task == tk)
+ recycle_rp_inst(ri, &empty_rp);
+ }
+ kretprobe_table_unlock(hash, &flags);
+ hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+}
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ struct hlist_node *pos, *next;
+
+ hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+}
+
+static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
+{
+ unsigned long flags, hash;
+ struct kretprobe_instance *ri;
+ struct hlist_node *pos, *next;
+ struct hlist_head *head;
+
+ /* No race here */
+ for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
+ kretprobe_table_lock(hash, &flags);
+ head = &kretprobe_inst_table[hash];
+ hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+ if (ri->rp == rp)
+ ri->rp = NULL;
+ }
+ kretprobe_table_unlock(hash, &flags);
+ }
+ free_rp_inst(rp);
+}
+
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+ memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+}
+
+/*
+* Add the new probe to old_p->list. Fail if this is the
+* second jprobe at the address - two jprobes can't coexist
+*/
+static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+ if (p->break_handler) {
+ if (old_p->break_handler)
+ return -EEXIST;
+ list_add_tail_rcu(&p->list, &old_p->list);
+ old_p->break_handler = aggr_break_handler;
+ } else
+ list_add_rcu(&p->list, &old_p->list);
+ if (p->post_handler && !old_p->post_handler)
+ old_p->post_handler = aggr_post_handler;
+ return 0;
+}
+
+/*
+ * Fill in the required fields of the "manager kprobe". Replace the
+ * earlier kprobe in the hlist with the manager kprobe
+ */
+static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+ copy_kprobe(p, ap);
+ flush_insn_slot(ap);
+ ap->addr = p->addr;
+ ap->pre_handler = aggr_pre_handler;
+ ap->fault_handler = aggr_fault_handler;
+ if (p->post_handler)
+ ap->post_handler = aggr_post_handler;
+ if (p->break_handler)
+ ap->break_handler = aggr_break_handler;
+
+ INIT_LIST_HEAD(&ap->list);
+ list_add_rcu(&p->list, &ap->list);
+
+ hlist_replace_rcu(&p->hlist, &ap->hlist);
+}
+
+/*
+ * This is the second or subsequent kprobe at the address - handle
+ * the intricacies
+ */
+static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+ struct kprobe *p)
+{
+ int ret = 0;
+ struct kprobe *ap;
+
+ if (old_p->pre_handler == aggr_pre_handler) {
+ copy_kprobe(old_p, p);
+ ret = add_new_kprobe(old_p, p);
+ } else {
+ ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+ if (!ap)
+ return -ENOMEM;
+ add_aggr_kprobe(ap, old_p);
+ copy_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+ }
+ return ret;
+}
+
+static int __kprobes in_kprobes_functions(unsigned long addr)
+{
+ struct kprobe_blackpoint *kb;
+
+ if (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end)
+ return -EINVAL;
+ /*
+ * If there exists a kprobe_blacklist, verify and
+ * fail any probe registration in the prohibited area
+ */
+ for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
+ if (kb->start_addr) {
+ if (addr >= kb->start_addr &&
+ addr < (kb->start_addr + kb->range))
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * If we have a symbol_name argument, look it up and add the offset field
+ * to it. This way, we can specify a relative address to a symbol.
+ */
+static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+{
+ kprobe_opcode_t *addr = p->addr;
+ if (p->symbol_name) {
+ if (addr)
+ return NULL;
+ kprobe_lookup_name(p->symbol_name, addr);
+ }
+
+ if (!addr)
+ return NULL;
+ return (kprobe_opcode_t *)(((char *)addr) + p->offset);
+}
+
+static int __kprobes __register_kprobe(struct kprobe *p,
+ unsigned long called_from)
+{
+ int ret = 0;
+ struct kprobe *old_p;
+ struct module *probed_mod;
+ kprobe_opcode_t *addr;
+
+ addr = kprobe_addr(p);
+ if (!addr)
+ return -EINVAL;
+ p->addr = addr;
+
+ preempt_disable();
+ if (!__kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr)) {
+ preempt_enable();
+ return -EINVAL;
+ }
+
+ p->mod_refcounted = 0;
+
+ /*
+ * Check if we are probing a module.
+ */
+ probed_mod = __module_text_address((unsigned long) p->addr);
+ if (probed_mod) {
+ struct module *calling_mod;
+ calling_mod = __module_text_address(called_from);
+ /*
+ * We must allow modules to probe themselves and in this case
+ * avoid incrementing the module refcount, so as to allow
+ * unloading of self-probing modules.
+ */
+ if (calling_mod && calling_mod != probed_mod) {
+ if (unlikely(!try_module_get(probed_mod))) {
+ preempt_enable();
+ return -EINVAL;
+ }
+ p->mod_refcounted = 1;
+ } else
+ probed_mod = NULL;
+ }
+ preempt_enable();
+
+ p->nmissed = 0;
+ INIT_LIST_HEAD(&p->list);
+ mutex_lock(&kprobe_mutex);
+ old_p = get_kprobe(p->addr);
+ if (old_p) {
+ ret = register_aggr_kprobe(old_p, p);
+ goto out;
+ }
+
+ ret = arch_prepare_kprobe(p);
+ if (ret)
+ goto out;
+
+ INIT_HLIST_NODE(&p->hlist);
+ hlist_add_head_rcu(&p->hlist,
+ &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ if (kprobe_enabled)
+ arch_arm_kprobe(p);
+
+out:
+ mutex_unlock(&kprobe_mutex);
+
+ if (ret && probed_mod)
+ module_put(probed_mod);
+ return ret;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+ struct kprobe *old_p, *list_p;
+
+ old_p = get_kprobe(p->addr);
+ if (unlikely(!old_p))
+ return -EINVAL;
+
+ if (p != old_p) {
+ list_for_each_entry_rcu(list_p, &old_p->list, list)
+ if (list_p == p)
+ /* kprobe p is a valid probe */
+ goto valid_p;
+ return -EINVAL;
+ }
+valid_p:
+ if (old_p == p ||
+ (old_p->pre_handler == aggr_pre_handler &&
+ list_is_singular(&old_p->list))) {
+ /*
+ * Only probe on the hash list. Disarm only if kprobes are
+ * enabled - otherwise, the breakpoint would already have
+ * been removed. We save on flushing icache.
+ */
+ if (kprobe_enabled)
+ arch_disarm_kprobe(p);
+ hlist_del_rcu(&old_p->hlist);
+ } else {
+ if (p->break_handler)
+ old_p->break_handler = NULL;
+ if (p->post_handler) {
+ list_for_each_entry_rcu(list_p, &old_p->list, list) {
+ if ((list_p != p) && (list_p->post_handler))
+ goto noclean;
+ }
+ old_p->post_handler = NULL;
+ }
+noclean:
+ list_del_rcu(&p->list);
+ }
+ return 0;
+}
+
+static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
+{
+ struct module *mod;
+ struct kprobe *old_p;
+
+ if (p->mod_refcounted) {
+ /*
+ * Since we've already incremented refcount,
+ * we don't need to disable preemption.
+ */
+ mod = module_text_address((unsigned long)p->addr);
+ if (mod)
+ module_put(mod);
+ }
+
+ if (list_empty(&p->list) || list_is_singular(&p->list)) {
+ if (!list_empty(&p->list)) {
+ /* "p" is the last child of an aggr_kprobe */
+ old_p = list_entry(p->list.next, struct kprobe, list);
+ list_del(&p->list);
+ kfree(old_p);
+ }
+ arch_remove_kprobe(p);
+ }
+}
+
+static int __register_kprobes(struct kprobe **kps, int num,
+ unsigned long called_from)
+{
+ int i, ret = 0;
+
+ if (num <= 0)
+ return -EINVAL;
+ for (i = 0; i < num; i++) {
+ ret = __register_kprobe(kps[i], called_from);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_kprobes(kps, i);
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Registration and unregistration functions for kprobe.
+ */
+int __kprobes register_kprobe(struct kprobe *p)
+{
+ return __register_kprobes(&p, 1,
+ (unsigned long)__builtin_return_address(0));
+}
+
+void __kprobes unregister_kprobe(struct kprobe *p)
+{
+ unregister_kprobes(&p, 1);
+}
+
+int __kprobes register_kprobes(struct kprobe **kps, int num)
+{
+ return __register_kprobes(kps, num,
+ (unsigned long)__builtin_return_address(0));
+}
+
+void __kprobes unregister_kprobes(struct kprobe **kps, int num)
+{
+ int i;
+
+ if (num <= 0)
+ return;
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(kps[i]) < 0)
+ kps[i]->addr = NULL;
+ mutex_unlock(&kprobe_mutex);
+
+ synchronize_sched();
+ for (i = 0; i < num; i++)
+ if (kps[i]->addr)
+ __unregister_kprobe_bottom(kps[i]);
+}
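+
+/*
+ * Illustrative sketch (not part of the original file): minimal kprobe
+ * usage. The probed symbol and the handler body are hypothetical.
+ *
+ *	static int example_pre(struct kprobe *p, struct pt_regs *regs)
+ *	{
+ *		printk(KERN_INFO "kprobe hit at %p\n", p->addr);
+ *		return 0;
+ *	}
+ *
+ *	static struct kprobe example_kp = {
+ *		.symbol_name	= "do_fork",
+ *		.pre_handler	= example_pre,
+ *	};
+ *
+ *	register_kprobe(&example_kp);	// arm the probe
+ *	...
+ *	unregister_kprobe(&example_kp);	// disarm and remove it
+ */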
+
+static struct notifier_block kprobe_exceptions_nb = {
+ .notifier_call = kprobe_exceptions_notify,
+ .priority = 0x7fffffff /* we need to be notified first */
+};
+
+unsigned long __weak arch_deref_entry_point(void *entry)
+{
+ return (unsigned long)entry;
+}
+
+static int __register_jprobes(struct jprobe **jps, int num,
+ unsigned long called_from)
+{
+ struct jprobe *jp;
+ int ret = 0, i;
+
+ if (num <= 0)
+ return -EINVAL;
+ for (i = 0; i < num; i++) {
+ unsigned long addr;
+ jp = jps[i];
+ addr = arch_deref_entry_point(jp->entry);
+
+ if (!kernel_text_address(addr))
+ ret = -EINVAL;
+ else {
+ /* TODO: verify that the probe point is a function entry point */
+ jp->kp.pre_handler = setjmp_pre_handler;
+ jp->kp.break_handler = longjmp_break_handler;
+ ret = __register_kprobe(&jp->kp, called_from);
+ }
+ if (ret < 0) {
+ if (i > 0)
+ unregister_jprobes(jps, i);
+ break;
+ }
+ }
+ return ret;
+}
+
+int __kprobes register_jprobe(struct jprobe *jp)
+{
+ return __register_jprobes(&jp, 1,
+ (unsigned long)__builtin_return_address(0));
+}
+
+void __kprobes unregister_jprobe(struct jprobe *jp)
+{
+ unregister_jprobes(&jp, 1);
+}
+
+int __kprobes register_jprobes(struct jprobe **jps, int num)
+{
+ return __register_jprobes(jps, num,
+ (unsigned long)__builtin_return_address(0));
+}
+
+void __kprobes unregister_jprobes(struct jprobe **jps, int num)
+{
+ int i;
+
+ if (num <= 0)
+ return;
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(&jps[i]->kp) < 0)
+ jps[i]->kp.addr = NULL;
+ mutex_unlock(&kprobe_mutex);
+
+ synchronize_sched();
+ for (i = 0; i < num; i++) {
+ if (jps[i]->kp.addr)
+ __unregister_kprobe_bottom(&jps[i]->kp);
+ }
+}
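+
+/*
+ * Illustrative sketch (not part of the original file): a jprobe handler
+ * mirrors the signature of the probed function and must end with
+ * jprobe_return(). The probed function (do_fork) exists on this kernel,
+ * but the handler itself is hypothetical.
+ *
+ *	static long example_jdo_fork(unsigned long clone_flags,
+ *			unsigned long stack_start, struct pt_regs *regs,
+ *			unsigned long stack_size, int __user *parent_tidptr,
+ *			int __user *child_tidptr)
+ *	{
+ *		printk(KERN_INFO "do_fork: clone_flags=%lx\n", clone_flags);
+ *		jprobe_return();
+ *		return 0;	// never reached
+ *	}
+ *
+ *	static struct jprobe example_jp = {
+ *		.entry	= example_jdo_fork,
+ *		.kp	= { .symbol_name = "do_fork" },
+ *	};
+ *
+ *	register_jprobe(&example_jp);
+ */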
+
+#ifdef CONFIG_KRETPROBES
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When the
+ * probe hits, it will set up the return probe.
+ */
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+ struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ unsigned long hash, flags = 0;
+ struct kretprobe_instance *ri;
+
+ /* TODO: consider swapping the RA only after the last pre_handler has fired */
+ hash = hash_ptr(current, KPROBE_HASH_BITS);
+ spin_lock_irqsave(&rp->lock, flags);
+ if (!hlist_empty(&rp->free_instances)) {
+ ri = hlist_entry(rp->free_instances.first,
+ struct kretprobe_instance, hlist);
+ hlist_del(&ri->hlist);
+ spin_unlock_irqrestore(&rp->lock, flags);
+
+ ri->rp = rp;
+ ri->task = current;
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ /* entry_handler declined; rp->lock was already dropped above,
+ * so retake it and return the instance to the free list */
+ spin_lock_irqsave(&rp->lock, flags);
+ hlist_add_head(&ri->hlist, &rp->free_instances);
+ spin_unlock_irqrestore(&rp->lock, flags);
+ return 0;
+ }
+
+ arch_prepare_kretprobe(ri, regs);
+
+ /* XXX(hch): why is there no hlist_move_head? */
+ INIT_HLIST_NODE(&ri->hlist);
+ kretprobe_table_lock(hash, &flags);
+ hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+ kretprobe_table_unlock(hash, &flags);
+ } else {
+ rp->nmissed++;
+ spin_unlock_irqrestore(&rp->lock, flags);
+ }
+ return 0;
+}
+
+static int __kprobes __register_kretprobe(struct kretprobe *rp,
+ unsigned long called_from)
+{
+ int ret = 0;
+ struct kretprobe_instance *inst;
+ int i;
+ void *addr;
+
+ if (kretprobe_blacklist_size) {
+ addr = kprobe_addr(&rp->kp);
+ if (!addr)
+ return -EINVAL;
+
+ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
+ if (kretprobe_blacklist[i].addr == addr)
+ return -EINVAL;
+ }
+ }
+
+ rp->kp.pre_handler = pre_handler_kretprobe;
+ rp->kp.post_handler = NULL;
+ rp->kp.fault_handler = NULL;
+ rp->kp.break_handler = NULL;
+
+ /* Pre-allocate memory for max kretprobe instances */
+ if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+ rp->maxactive = max(10, 2 * NR_CPUS);
+#else
+ rp->maxactive = NR_CPUS;
+#endif
+ }
+ spin_lock_init(&rp->lock);
+ INIT_HLIST_HEAD(&rp->free_instances);
+ for (i = 0; i < rp->maxactive; i++) {
+ inst = kmalloc(sizeof(struct kretprobe_instance) +
+ rp->data_size, GFP_KERNEL);
+ if (inst == NULL) {
+ free_rp_inst(rp);
+ return -ENOMEM;
+ }
+ INIT_HLIST_NODE(&inst->hlist);
+ hlist_add_head(&inst->hlist, &rp->free_instances);
+ }
+
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = __register_kprobe(&rp->kp, called_from);
+ if (ret != 0)
+ free_rp_inst(rp);
+ return ret;
+}
+
+static int __register_kretprobes(struct kretprobe **rps, int num,
+ unsigned long called_from)
+{
+ int ret = 0, i;
+
+ if (num <= 0)
+ return -EINVAL;
+ for (i = 0; i < num; i++) {
+ ret = __register_kretprobe(rps[i], called_from);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_kretprobes(rps, i);
+ break;
+ }
+ }
+ return ret;
+}
+
+int __kprobes register_kretprobe(struct kretprobe *rp)
+{
+ return __register_kretprobes(&rp, 1,
+ (unsigned long)__builtin_return_address(0));
+}
+
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
+{
+ unregister_kretprobes(&rp, 1);
+}
+
+int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+{
+ return __register_kretprobes(rps, num,
+ (unsigned long)__builtin_return_address(0));
+}
+
+void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+{
+ int i;
+
+ if (num <= 0)
+ return;
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(&rps[i]->kp) < 0)
+ rps[i]->kp.addr = NULL;
+ mutex_unlock(&kprobe_mutex);
+
+ synchronize_sched();
+ for (i = 0; i < num; i++) {
+ if (rps[i]->kp.addr) {
+ __unregister_kprobe_bottom(&rps[i]->kp);
+ cleanup_rp_inst(rps[i]);
+ }
+ }
+}
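+
+/*
+ * Illustrative sketch (not part of the original file): a kretprobe fires
+ * its handler when the probed function returns. The symbol and handler
+ * body are hypothetical.
+ *
+ *	static int example_ret_handler(struct kretprobe_instance *ri,
+ *			struct pt_regs *regs)
+ *	{
+ *		printk(KERN_INFO "%s returned\n", ri->rp->kp.symbol_name);
+ *		return 0;
+ *	}
+ *
+ *	static struct kretprobe example_rp = {
+ *		.handler	= example_ret_handler,
+ *		.kp		= { .symbol_name = "do_fork" },
+ *		.maxactive	= 20,	// limit of concurrent instances
+ *	};
+ *
+ *	register_kretprobe(&example_rp);
+ */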
+
+#else /* CONFIG_KRETPROBES */
+int __kprobes register_kretprobe(struct kretprobe *rp)
+{
+ return -ENOSYS;
+}
+
+int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+{
+ return -ENOSYS;
+}
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
+{
+}
+
+void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+{
+}
+
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+ struct pt_regs *regs)
+{
+ return 0;
+}
+
+#endif /* CONFIG_KRETPROBES */
+
+static int __init init_kprobes(void)
+{
+ int i, err = 0;
+ unsigned long offset = 0, size = 0;
+ char *modname, namebuf[128];
+ const char *symbol_name;
+ void *addr;
+ struct kprobe_blackpoint *kb;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ INIT_HLIST_HEAD(&kprobe_table[i]);
+ INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+ spin_lock_init(&(kretprobe_table_locks[i].lock));
+ }
+
+ /*
+ * Lookup and populate the kprobe_blacklist.
+ *
+ * Unlike the kretprobe blacklist, we'll need to determine
+ * the range of addresses that belong to the said functions,
+ * since a kprobe need not necessarily be at the beginning
+ * of a function.
+ */
+ for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
+ kprobe_lookup_name(kb->name, addr);
+ if (!addr)
+ continue;
+
+ kb->start_addr = (unsigned long)addr;
+ symbol_name = kallsyms_lookup(kb->start_addr,
+ &size, &offset, &modname, namebuf);
+ if (!symbol_name)
+ kb->range = 0;
+ else
+ kb->range = size;
+ }
+
+ if (kretprobe_blacklist_size) {
+ /* lookup the function address from its name */
+ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
+ kprobe_lookup_name(kretprobe_blacklist[i].name,
+ kretprobe_blacklist[i].addr);
+ if (!kretprobe_blacklist[i].addr)
+ printk("kretprobe: lookup failed: %s\n",
+ kretprobe_blacklist[i].name);
+ }
+ }
+
+ /* By default, kprobes are enabled */
+ kprobe_enabled = true;
+
+ err = arch_init_kprobes();
+ if (!err)
+ err = register_die_notifier(&kprobe_exceptions_nb);
+ kprobes_initialized = (err == 0);
+
+ if (!err)
+ init_test_probes();
+ return err;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
+ const char *sym, int offset, char *modname)
+{
+ char *kprobe_type;
+
+ if (p->pre_handler == pre_handler_kretprobe)
+ kprobe_type = "r";
+ else if (p->pre_handler == setjmp_pre_handler)
+ kprobe_type = "j";
+ else
+ kprobe_type = "k";
+ if (sym)
+ seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type,
+ sym, offset, (modname ? modname : " "));
+ else
+ seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr);
+}
+
+static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
+{
+ return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
+}
+
+static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos >= KPROBE_TABLE_SIZE)
+ return NULL;
+ return pos;
+}
+
+static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p, *kp;
+ const char *sym = NULL;
+ unsigned int i = *(loff_t *) v;
+ unsigned long offset = 0;
+ char *modname, namebuf[128];
+
+ head = &kprobe_table[i];
+ preempt_disable();
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ sym = kallsyms_lookup((unsigned long)p->addr, NULL,
+ &offset, &modname, namebuf);
+ if (p->pre_handler == aggr_pre_handler) {
+ list_for_each_entry_rcu(kp, &p->list, list)
+ report_probe(pi, kp, sym, offset, modname);
+ } else
+ report_probe(pi, p, sym, offset, modname);
+ }
+ preempt_enable();
+ return 0;
+}
+
+static struct seq_operations kprobes_seq_ops = {
+ .start = kprobe_seq_start,
+ .next = kprobe_seq_next,
+ .stop = kprobe_seq_stop,
+ .show = show_kprobe_addr
+};
+
+static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &kprobes_seq_ops);
+}
+
+static struct file_operations debugfs_kprobes_operations = {
+ .open = kprobes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void __kprobes enable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already enabled, just return */
+ if (kprobe_enabled)
+ goto already_enabled;
+
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist)
+ arch_arm_kprobe(p);
+ }
+
+ kprobe_enabled = true;
+ printk(KERN_INFO "Kprobes globally enabled\n");
+
+already_enabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+static void __kprobes disable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already disabled, just return */
+ if (!kprobe_enabled)
+ goto already_disabled;
+
+ kprobe_enabled = false;
+ printk(KERN_INFO "Kprobes globally disabled\n");
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ if (!arch_trampoline_kprobe(p))
+ arch_disarm_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+ /* Allow all currently running kprobes to complete */
+ synchronize_sched();
+ return;
+
+already_disabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+/*
+ * XXX: The debugfs bool file interface doesn't allow for callbacks
+ * when the bool state is switched. We can reuse that facility when
+ * available
+ */
+static ssize_t read_enabled_file_bool(struct file *file,
+ char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[3];
+
+ if (kprobe_enabled)
+ buf[0] = '1';
+ else
+ buf[0] = '0';
+ buf[1] = '\n';
+ buf[2] = 0x00;
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t write_enabled_file_bool(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ int buf_size;
+
+ buf_size = min(count, (sizeof(buf)-1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ switch (buf[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ enable_all_kprobes();
+ break;
+ case 'n':
+ case 'N':
+ case '0':
+ disable_all_kprobes();
+ break;
+ }
+
+ return count;
+}
+
+static struct file_operations fops_kp = {
+ .read = read_enabled_file_bool,
+ .write = write_enabled_file_bool,
+};
+
+static int __kprobes debugfs_kprobe_init(void)
+{
+ struct dentry *dir, *file;
+ unsigned int value = 1;
+
+ dir = debugfs_create_dir("kprobes", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ file = debugfs_create_file("list", 0444, dir, NULL,
+ &debugfs_kprobes_operations);
+ if (!file) {
+ debugfs_remove(dir);
+ return -ENOMEM;
+ }
+
+ file = debugfs_create_file("enabled", 0600, dir,
+ &value, &fops_kp);
+ if (!file) {
+ debugfs_remove(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(debugfs_kprobe_init);
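+
+/*
+ * Usage note (illustrative): with debugfs mounted, "kprobes/list" dumps
+ * every registered probe, and writing '0' or '1' to "kprobes/enabled"
+ * disarms or re-arms all kprobes globally via disable_all_kprobes()
+ * and enable_all_kprobes() above.
+ */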
+#endif /* CONFIG_DEBUG_FS */
+
+module_init(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);
+EXPORT_SYMBOL_GPL(register_kprobes);
+EXPORT_SYMBOL_GPL(unregister_kprobes);
+EXPORT_SYMBOL_GPL(register_jprobe);
+EXPORT_SYMBOL_GPL(unregister_jprobe);
+EXPORT_SYMBOL_GPL(register_jprobes);
+EXPORT_SYMBOL_GPL(unregister_jprobes);
+EXPORT_SYMBOL_GPL(jprobe_return);
+EXPORT_SYMBOL_GPL(register_kretprobe);
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+EXPORT_SYMBOL_GPL(register_kretprobes);
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
new file mode 100644
index 0000000..08dd8ed
--- /dev/null
+++ b/kernel/ksysfs.c
@@ -0,0 +1,197 @@
+/*
+ * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
+ * are not related to any other subsystem
+ *
+ * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
+ *
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kexec.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+/* current uevent sequence number */
+static ssize_t uevent_seqnum_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+}
+KERNEL_ATTR_RO(uevent_seqnum);
+
+/* uevent helper program, used during early boot */
+static ssize_t uevent_helper_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", uevent_helper);
+}
+static ssize_t uevent_helper_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (count+1 > UEVENT_HELPER_PATH_LEN)
+ return -ENOENT;
+ memcpy(uevent_helper, buf, count);
+ uevent_helper[count] = '\0';
+ if (count && uevent_helper[count-1] == '\n')
+ uevent_helper[count-1] = '\0';
+ return count;
+}
+KERNEL_ATTR_RW(uevent_helper);
+#endif
+
+#ifdef CONFIG_PROFILING
+static ssize_t profiling_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", prof_on);
+}
+static ssize_t profiling_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+
+ if (prof_on)
+ return -EEXIST;
+ /*
+ * This eventually calls into get_option() which
+ * has a ton of callers and is not const. It is
+ * easiest to cast it away here.
+ */
+ profile_setup((char *)buf);
+ ret = profile_init();
+ if (ret)
+ return ret;
+ ret = create_proc_profile();
+ if (ret)
+ return ret;
+ return count;
+}
+KERNEL_ATTR_RW(profiling);
+#endif
+
+#ifdef CONFIG_KEXEC
+static ssize_t kexec_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", !!kexec_image);
+}
+KERNEL_ATTR_RO(kexec_loaded);
+
+static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", !!kexec_crash_image);
+}
+KERNEL_ATTR_RO(kexec_crash_loaded);
+
+static ssize_t vmcoreinfo_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lx %x\n",
+ paddr_vmcoreinfo_note(),
+ (unsigned int)vmcoreinfo_max_size);
+}
+KERNEL_ATTR_RO(vmcoreinfo);
+
+#endif /* CONFIG_KEXEC */
+
+/*
+ * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
+ */
+extern const void __start_notes __attribute__((weak));
+extern const void __stop_notes __attribute__((weak));
+#define notes_size (&__stop_notes - &__start_notes)
+
+static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, &__start_notes + off, count);
+ return count;
+}
+
+static struct bin_attribute notes_attr = {
+ .attr = {
+ .name = "notes",
+ .mode = S_IRUGO,
+ },
+ .read = &notes_read,
+};
+
+struct kobject *kernel_kobj;
+EXPORT_SYMBOL_GPL(kernel_kobj);
+
+static struct attribute * kernel_attrs[] = {
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+ &uevent_seqnum_attr.attr,
+ &uevent_helper_attr.attr,
+#endif
+#ifdef CONFIG_PROFILING
+ &profiling_attr.attr,
+#endif
+#ifdef CONFIG_KEXEC
+ &kexec_loaded_attr.attr,
+ &kexec_crash_loaded_attr.attr,
+ &vmcoreinfo_attr.attr,
+#endif
+ NULL
+};
+
+static struct attribute_group kernel_attr_group = {
+ .attrs = kernel_attrs,
+};
+
+static int __init ksysfs_init(void)
+{
+ int error;
+
+ kernel_kobj = kobject_create_and_add("kernel", NULL);
+ if (!kernel_kobj) {
+ error = -ENOMEM;
+ goto exit;
+ }
+ error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
+ if (error)
+ goto kset_exit;
+
+ if (notes_size > 0) {
+ notes_attr.size = notes_size;
+ error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
+ if (error)
+ goto group_exit;
+ }
+
+ /* create the /sys/kernel/uids/ directory */
+ error = uids_sysfs_init();
+ if (error)
+ goto notes_exit;
+
+ return 0;
+
+notes_exit:
+ if (notes_size > 0)
+ sysfs_remove_bin_file(kernel_kobj, &notes_attr);
+group_exit:
+ sysfs_remove_group(kernel_kobj, &kernel_attr_group);
+kset_exit:
+ kobject_put(kernel_kobj);
+exit:
+ return error;
+}
+
+core_initcall(ksysfs_init);
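+
+/*
+ * Usage note (illustrative): once ksysfs_init() has run, the attributes
+ * above appear directly under /sys/kernel (e.g. uevent_seqnum,
+ * uevent_helper, profiling, kexec_loaded and the binary "notes" file),
+ * each guarded by its corresponding config option.
+ */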
diff --git a/kernel/kthread.c b/kernel/kthread.c
new file mode 100644
index 0000000..8e7a7ce
--- /dev/null
+++ b/kernel/kthread.c
@@ -0,0 +1,267 @@
+/* Kernel thread helper functions.
+ * Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ *
+ * Creation is done via kthreadd, so that we get a clean environment
+ * even if we're invoked from userspace (think modprobe, hotplug cpu,
+ * etc.).
+ */
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/unistd.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <trace/sched.h>
+
+#define KTHREAD_NICE_LEVEL (-5)
+
+static DEFINE_SPINLOCK(kthread_create_lock);
+static LIST_HEAD(kthread_create_list);
+struct task_struct *kthreadd_task;
+
+struct kthread_create_info
+{
+ /* Information passed to kthread() from kthreadd. */
+ int (*threadfn)(void *data);
+ void *data;
+ struct completion started;
+
+ /* Result passed back to kthread_create() from kthreadd. */
+ struct task_struct *result;
+ struct completion done;
+
+ struct list_head list;
+};
+
+struct kthread_stop_info
+{
+ struct task_struct *k;
+ int err;
+ struct completion done;
+};
+
+/* Thread stopping is done by setting this var; the lock serializes
+ * multiple kthread_stop() calls. */
+static DEFINE_MUTEX(kthread_stop_lock);
+static struct kthread_stop_info kthread_stop_info;
+
+/**
+ * kthread_should_stop - should this kthread return now?
+ *
+ * When someone calls kthread_stop() on your kthread, it will be woken
+ * and this will return true. You should then return, and your return
+ * value will be passed through to kthread_stop().
+ */
+int kthread_should_stop(void)
+{
+ return (kthread_stop_info.k == current);
+}
+EXPORT_SYMBOL(kthread_should_stop);
+
+static int kthread(void *_create)
+{
+ struct kthread_create_info *create = _create;
+ int (*threadfn)(void *data);
+ void *data;
+ int ret = -EINTR;
+
+ /* Copy data: it's on kthread's stack */
+ threadfn = create->threadfn;
+ data = create->data;
+
+ /* OK, tell user we're spawned, wait for stop or wakeup */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ complete(&create->started);
+ schedule();
+
+ if (!kthread_should_stop())
+ ret = threadfn(data);
+
+ /* It might have exited on its own, w/o kthread_stop. Check. */
+ if (kthread_should_stop()) {
+ kthread_stop_info.err = ret;
+ complete(&kthread_stop_info.done);
+ }
+ return 0;
+}
+
+static void create_kthread(struct kthread_create_info *create)
+{
+ int pid;
+
+ /* We want our own signal handler (we take no signals by default). */
+ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ if (pid < 0) {
+ create->result = ERR_PTR(pid);
+ } else {
+ struct sched_param param = { .sched_priority = 0 };
+ wait_for_completion(&create->started);
+ read_lock(&tasklist_lock);
+ create->result = find_task_by_pid_ns(pid, &init_pid_ns);
+ read_unlock(&tasklist_lock);
+ /*
+ * root may have changed our (kthreadd's) priority or CPU mask.
+ * The kernel thread should not inherit these properties.
+ */
+ sched_setscheduler(create->result, SCHED_NORMAL, &param);
+ set_user_nice(create->result, KTHREAD_NICE_LEVEL);
+ set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
+ }
+ complete(&create->done);
+}
+
+/**
+ * kthread_create - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run(), kthread_create_on_cpu().
+ *
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn() can either call do_exit() directly if it is a
+ * standalone thread for which no one will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM).
+ */
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+ void *data,
+ const char namefmt[],
+ ...)
+{
+ struct kthread_create_info create;
+
+ create.threadfn = threadfn;
+ create.data = data;
+ init_completion(&create.started);
+ init_completion(&create.done);
+
+ spin_lock(&kthread_create_lock);
+ list_add_tail(&create.list, &kthread_create_list);
+ spin_unlock(&kthread_create_lock);
+
+ wake_up_process(kthreadd_task);
+ wait_for_completion(&create.done);
+
+ if (!IS_ERR(create.result)) {
+ va_list args;
+ va_start(args, namefmt);
+ vsnprintf(create.result->comm, sizeof(create.result->comm),
+ namefmt, args);
+ va_end(args);
+ }
+ return create.result;
+}
+EXPORT_SYMBOL(kthread_create);
+
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @k: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ */
+void kthread_bind(struct task_struct *k, unsigned int cpu)
+{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
+ WARN_ON(1);
+ return;
+ }
+ set_task_cpu(k, cpu);
+ k->cpus_allowed = cpumask_of_cpu(cpu);
+ k->rt.nr_cpus_allowed = 1;
+ k->flags |= PF_THREAD_BOUND;
+}
+EXPORT_SYMBOL(kthread_bind);
+
+/**
+ * kthread_stop - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_stop() for @k to return true, wakes it, and
+ * waits for it to exit. Your threadfn() must not call do_exit()
+ * itself if you use this function! This can also be called after
+ * kthread_create() instead of calling wake_up_process(): the thread
+ * will exit without calling threadfn().
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
+int kthread_stop(struct task_struct *k)
+{
+ int ret;
+
+ mutex_lock(&kthread_stop_lock);
+
+ /* It could exit after stop_info.k set, but before wake_up_process. */
+ get_task_struct(k);
+
+ trace_sched_kthread_stop(k);
+
+ /* Must init completion *before* thread sees kthread_stop_info.k */
+ init_completion(&kthread_stop_info.done);
+ smp_wmb();
+
+ /* Now set kthread_should_stop() to true, and wake it up. */
+ kthread_stop_info.k = k;
+ wake_up_process(k);
+ put_task_struct(k);
+
+ /* Once it dies, reset stop ptr, gather result and we're done. */
+ wait_for_completion(&kthread_stop_info.done);
+ kthread_stop_info.k = NULL;
+ ret = kthread_stop_info.err;
+ mutex_unlock(&kthread_stop_lock);
+
+ trace_sched_kthread_stop_ret(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop);
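+
+/*
+ * Illustrative sketch (not part of the original file): the usual pattern
+ * pairs kthread_create()/kthread_run() with kthread_stop(). The thread
+ * function and its name are hypothetical.
+ *
+ *	static int example_thread_fn(void *data)
+ *	{
+ *		while (!kthread_should_stop()) {
+ *			set_current_state(TASK_INTERRUPTIBLE);
+ *			schedule_timeout(HZ);
+ *		}
+ *		return 0;
+ *	}
+ *
+ *	struct task_struct *tsk = kthread_run(example_thread_fn, NULL, "example");
+ *	...
+ *	kthread_stop(tsk);	// wakes the thread and waits for it to exit
+ */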
+
+int kthreadd(void *unused)
+{
+ struct task_struct *tsk = current;
+
+ /* Setup a clean context for our children to inherit. */
+ set_task_comm(tsk, "kthreadd");
+ ignore_signals(tsk);
+ set_user_nice(tsk, KTHREAD_NICE_LEVEL);
+ set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
+
+ current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&kthread_create_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ spin_lock(&kthread_create_lock);
+ while (!list_empty(&kthread_create_list)) {
+ struct kthread_create_info *create;
+
+ create = list_entry(kthread_create_list.next,
+ struct kthread_create_info, list);
+ list_del_init(&create->list);
+ spin_unlock(&kthread_create_lock);
+
+ create_kthread(create);
+
+ spin_lock(&kthread_create_lock);
+ }
+ spin_unlock(&kthread_create_lock);
+ }
+
+ return 0;
+}
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 0000000..449db46
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/latencytop.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+static DEFINE_SPINLOCK(latency_lock);
+
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+int latencytop_enabled;
+
+void clear_all_latency_tracing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (!latencytop_enabled)
+ return;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ memset(&p->latency_record, 0, sizeof(p->latency_record));
+ p->latency_record_count = 0;
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void clear_global_latency_tracing(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ memset(&latency_record, 0, sizeof(latency_record));
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+{
+ int firstnonnull = MAXLR + 1;
+ int i;
+
+ if (!latencytop_enabled)
+ return;
+
+ /* skip kernel threads for now */
+ if (!tsk->mm)
+ return;
+
+ for (i = 0; i < MAXLR; i++) {
+ int q, same = 1;
+
+ /* Nothing stored: */
+ if (!latency_record[i].backtrace[0]) {
+ if (firstnonnull > i)
+ firstnonnull = i;
+ continue;
+ }
+ for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+ unsigned long record = lat->backtrace[q];
+
+ if (latency_record[i].backtrace[q] != record) {
+ same = 0;
+ break;
+ }
+
+ /* 0 and ULONG_MAX entries mean end of backtrace: */
+ if (record == 0 || record == ULONG_MAX)
+ break;
+ }
+ if (same) {
+ latency_record[i].count++;
+ latency_record[i].time += lat->time;
+ if (lat->time > latency_record[i].max)
+ latency_record[i].max = lat->time;
+ return;
+ }
+ }
+
+ i = firstnonnull;
+ if (i >= MAXLR - 1)
+ return;
+
+ /* Allocated a new one: */
+ memcpy(&latency_record[i], lat, sizeof(struct latency_record));
+}
+
+static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+{
+ struct stack_trace trace;
+
+ memset(&trace, 0, sizeof(trace));
+ trace.max_entries = LT_BACKTRACEDEPTH;
+ trace.entries = &lat->backtrace[0];
+ trace.skip = 0;
+ save_stack_trace_tsk(tsk, &trace);
+}
+
+void __sched
+account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+{
+ unsigned long flags;
+ int i, q;
+ struct latency_record lat;
+
+ if (!latencytop_enabled)
+ return;
+
+ /* Long interruptible waits are generally user requested... */
+ if (inter && usecs > 5000)
+ return;
+
+ memset(&lat, 0, sizeof(lat));
+ lat.count = 1;
+ lat.time = usecs;
+ lat.max = usecs;
+ store_stacktrace(tsk, &lat);
+
+ spin_lock_irqsave(&latency_lock, flags);
+
+ account_global_scheduler_latency(tsk, &lat);
+
+ /*
+ * short-term hack: once a task has LT_SAVECOUNT records we stop; in the future we should recycle:
+ */
+ tsk->latency_record_count++;
+ if (tsk->latency_record_count >= LT_SAVECOUNT)
+ goto out_unlock;
+
+ for (i = 0; i < LT_SAVECOUNT ; i++) {
+ struct latency_record *mylat;
+ int same = 1;
+
+ mylat = &tsk->latency_record[i];
+ for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+ unsigned long record = lat.backtrace[q];
+
+ if (mylat->backtrace[q] != record) {
+ same = 0;
+ break;
+ }
+
+ /* 0 and ULONG_MAX entries mean end of backtrace: */
+ if (record == 0 || record == ULONG_MAX)
+ break;
+ }
+ if (same) {
+ mylat->count++;
+ mylat->time += lat.time;
+ if (lat.time > mylat->max)
+ mylat->max = lat.time;
+ goto out_unlock;
+ }
+ }
+
+ /* Allocated a new one: */
+ i = tsk->latency_record_count;
+ memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
+
+out_unlock:
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static int lstats_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "Latency Top version : v0.1\n");
+
+ for (i = 0; i < MAXLR; i++) {
+ if (latency_record[i].backtrace[0]) {
+ int q;
+ seq_printf(m, "%i %li %li ",
+ latency_record[i].count,
+ latency_record[i].time,
+ latency_record[i].max);
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ char sym[KSYM_SYMBOL_LEN];
+ char *c;
+ if (!latency_record[i].backtrace[q])
+ break;
+ if (latency_record[i].backtrace[q] == ULONG_MAX)
+ break;
+ sprint_symbol(sym, latency_record[i].backtrace[q]);
+ c = strchr(sym, '+');
+ if (c)
+ *c = 0;
+ seq_printf(m, "%s ", sym);
+ }
+ seq_printf(m, "\n");
+ }
+ }
+ return 0;
+}
+
+static ssize_t
+lstats_write(struct file *file, const char __user *buf, size_t count,
+ loff_t *offs)
+{
+ clear_global_latency_tracing();
+
+ return count;
+}
+
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, lstats_show, NULL);
+}
+
+static struct file_operations lstats_fops = {
+ .open = lstats_open,
+ .read = seq_read,
+ .write = lstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_lstats_procfs(void)
+{
+ proc_create("latency_stats", 0644, NULL, &lstats_fops);
+ return 0;
+}
+__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
new file mode 100644
index 0000000..46a4041
--- /dev/null
+++ b/kernel/lockdep.c
@@ -0,0 +1,3482 @@
+/*
+ * kernel/lockdep.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * this code maps all the lock dependencies as they occur in a live kernel
+ * and will warn about the following classes of locking bugs:
+ *
+ * - lock inversion scenarios
+ * - circular lock dependencies
+ * - hardirq/softirq safe/unsafe locking bugs
+ *
+ * Bugs are reported even if the current locking scenario does not cause
+ * any deadlock at this point.
+ *
+ * I.e. if at any time in the past two locks were taken in a different order,
+ * even if it happened for another task, even if those were different
+ * locks (but of the same class as this lock), this code will detect it.
+ *
+ * Thanks to Arjan van de Ven for coming up with the initial idea of
+ * mapping lock dependencies at runtime.
+ */
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/stacktrace.h>
+#include <linux/debug_locks.h>
+#include <linux/irqflags.h>
+#include <linux/utsname.h>
+#include <linux/hash.h>
+#include <linux/ftrace.h>
+
+#include <asm/sections.h>
+
+#include "lockdep_internals.h"
+
+#ifdef CONFIG_PROVE_LOCKING
+int prove_locking = 1;
+module_param(prove_locking, int, 0644);
+#else
+#define prove_locking 0
+#endif
+
+#ifdef CONFIG_LOCK_STAT
+int lock_stat = 1;
+module_param(lock_stat, int, 0644);
+#else
+#define lock_stat 0
+#endif
+
+/*
+ * lockdep_lock: protects the lockdep graph, the hashes and the
+ * class/list/hash allocators.
+ *
+ * This is one of the rare exceptions where it's justified
+ * to use a raw spinlock - we really don't want the spinlock
+ * code to recurse back into the lockdep code...
+ */
+static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+static int graph_lock(void)
+{
+ __raw_spin_lock(&lockdep_lock);
+ /*
+ * Make sure that if another CPU detected a bug while
+	 * walking the graph we don't change it (while the other
+ * CPU is busy printing out stuff with the graph lock
+ * dropped already)
+ */
+ if (!debug_locks) {
+ __raw_spin_unlock(&lockdep_lock);
+ return 0;
+ }
+ /* prevent any recursions within lockdep from causing deadlocks */
+ current->lockdep_recursion++;
+ return 1;
+}
+
+static inline int graph_unlock(void)
+{
+ if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
+ return DEBUG_LOCKS_WARN_ON(1);
+
+ current->lockdep_recursion--;
+ __raw_spin_unlock(&lockdep_lock);
+ return 0;
+}
+
+/*
+ * Turn lock debugging off and return with 0 if it was off already,
+ * and also release the graph lock:
+ */
+static inline int debug_locks_off_graph_unlock(void)
+{
+ int ret = debug_locks_off();
+
+ __raw_spin_unlock(&lockdep_lock);
+
+ return ret;
+}
+
+static int lockdep_initialized;
+
+unsigned long nr_list_entries;
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+
+/*
+ * All data structures here are protected by the global debug_lock.
+ *
+ * Mutex key structs only get allocated once, during bootup, and never
+ * get freed - this significantly simplifies the debugging code.
+ */
+unsigned long nr_lock_classes;
+static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+ if (!hlock->class_idx) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return NULL;
+ }
+ return lock_classes + hlock->class_idx - 1;
+}
+
+#ifdef CONFIG_LOCK_STAT
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
+
+static int lock_contention_point(struct lock_class *class, unsigned long ip)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
+ if (class->contention_point[i] == 0) {
+ class->contention_point[i] = ip;
+ break;
+ }
+ if (class->contention_point[i] == ip)
+ break;
+ }
+
+ return i;
+}
+
+static void lock_time_inc(struct lock_time *lt, s64 time)
+{
+ if (time > lt->max)
+ lt->max = time;
+
+ if (time < lt->min || !lt->min)
+ lt->min = time;
+
+ lt->total += time;
+ lt->nr++;
+}
+
+static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
+{
+ dst->min += src->min;
+ dst->max += src->max;
+ dst->total += src->total;
+ dst->nr += src->nr;
+}
+
+struct lock_class_stats lock_stats(struct lock_class *class)
+{
+ struct lock_class_stats stats;
+ int cpu, i;
+
+ memset(&stats, 0, sizeof(struct lock_class_stats));
+ for_each_possible_cpu(cpu) {
+ struct lock_class_stats *pcs =
+ &per_cpu(lock_stats, cpu)[class - lock_classes];
+
+ for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
+ stats.contention_point[i] += pcs->contention_point[i];
+
+ lock_time_add(&pcs->read_waittime, &stats.read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+
+ lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+
+ for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
+ stats.bounces[i] += pcs->bounces[i];
+ }
+
+ return stats;
+}
+
+void clear_lock_stats(struct lock_class *class)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct lock_class_stats *cpu_stats =
+ &per_cpu(lock_stats, cpu)[class - lock_classes];
+
+ memset(cpu_stats, 0, sizeof(struct lock_class_stats));
+ }
+ memset(class->contention_point, 0, sizeof(class->contention_point));
+}
+
+static struct lock_class_stats *get_lock_stats(struct lock_class *class)
+{
+ return &get_cpu_var(lock_stats)[class - lock_classes];
+}
+
+static void put_lock_stats(struct lock_class_stats *stats)
+{
+ put_cpu_var(lock_stats);
+}
+
+static void lock_release_holdtime(struct held_lock *hlock)
+{
+ struct lock_class_stats *stats;
+ s64 holdtime;
+
+ if (!lock_stat)
+ return;
+
+ holdtime = sched_clock() - hlock->holdtime_stamp;
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (hlock->read)
+ lock_time_inc(&stats->read_holdtime, holdtime);
+ else
+ lock_time_inc(&stats->write_holdtime, holdtime);
+ put_lock_stats(stats);
+}
+#else
+static inline void lock_release_holdtime(struct held_lock *hlock)
+{
+}
+#endif
+
+/*
+ * We keep a global list of all lock classes. The list only grows,
+ * never shrinks. The list is only accessed with the lockdep
+ * spinlock held.
+ */
+LIST_HEAD(all_lock_classes);
+
+/*
+ * The lockdep classes are in a hash-table as well, for fast lookup:
+ */
+#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
+#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
+#define classhashentry(key) (classhash_table + __classhashfn((key)))
+
+static struct list_head classhash_table[CLASSHASH_SIZE];
+
+/*
+ * We put the lock dependency chains into a hash-table as well, to cache
+ * their existence:
+ */
+#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
+#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
+#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
+#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
+
+static struct list_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * The hash key of the lock dependency chains is a hash itself too:
+ * it's a hash of all locks taken up to that lock, including that lock.
+ * It's a 64-bit hash, because it's important for the keys to be
+ * unique.
+ */
+#define iterate_chain_key(key1, key2) \
+ (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
+ (key2))
+
+void lockdep_off(void)
+{
+ current->lockdep_recursion++;
+}
+
+EXPORT_SYMBOL(lockdep_off);
+
+void lockdep_on(void)
+{
+ current->lockdep_recursion--;
+}
+
+EXPORT_SYMBOL(lockdep_on);
+
+/*
+ * Debugging switches:
+ */
+
+#define VERBOSE 0
+#define VERY_VERBOSE 0
+
+#if VERBOSE
+# define HARDIRQ_VERBOSE 1
+# define SOFTIRQ_VERBOSE 1
+#else
+# define HARDIRQ_VERBOSE 0
+# define SOFTIRQ_VERBOSE 0
+#endif
+
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+/*
+ * Quick filtering for interesting events:
+ */
+static int class_filter(struct lock_class *class)
+{
+#if 0
+ /* Example */
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "lockname"))
+ return 1;
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "&struct->lockfield"))
+ return 1;
+#endif
+	/* Filter everything else. Return 1 to allow everything else. */
+ return 0;
+}
+#endif
+
+static int verbose(struct lock_class *class)
+{
+#if VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+unsigned long nr_stack_trace_entries;
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
+static int save_trace(struct stack_trace *trace)
+{
+ trace->nr_entries = 0;
+ trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->entries = stack_trace + nr_stack_trace_entries;
+
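+	/* Skip the innermost frames (lockdep internals) so the trace starts at the caller: */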
+ trace->skip = 3;
+
+ save_stack_trace(trace);
+
+ trace->max_entries = trace->nr_entries;
+
+ nr_stack_trace_entries += trace->nr_entries;
+
+ if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+
+ return 0;
+ }
+
+ return 1;
+}
+
+unsigned int nr_hardirq_chains;
+unsigned int nr_softirq_chains;
+unsigned int nr_process_chains;
+unsigned int max_lockdep_depth;
+unsigned int max_recursion_depth;
+
+static unsigned int lockdep_dependency_gen_id;
+
+static bool lockdep_dependency_visit(struct lock_class *source,
+ unsigned int depth)
+{
+ if (!depth)
+ lockdep_dependency_gen_id++;
+ if (source->dep_gen_id == lockdep_dependency_gen_id)
+ return true;
+ source->dep_gen_id = lockdep_dependency_gen_id;
+ return false;
+}
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * We cannot printk in early bootup code. Even early_printk()
+ * might not work. So we mark any initialization errors and printk
+ * about it later on, in lockdep_info().
+ */
+static int lockdep_init_error;
+static unsigned long lockdep_init_trace_data[20];
+static struct stack_trace lockdep_init_trace = {
+ .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
+ .entries = lockdep_init_trace_data,
+};
+
+/*
+ * Various lockdep statistics:
+ */
+atomic_t chain_lookup_hits;
+atomic_t chain_lookup_misses;
+atomic_t hardirqs_on_events;
+atomic_t hardirqs_off_events;
+atomic_t redundant_hardirqs_on;
+atomic_t redundant_hardirqs_off;
+atomic_t softirqs_on_events;
+atomic_t softirqs_off_events;
+atomic_t redundant_softirqs_on;
+atomic_t redundant_softirqs_off;
+atomic_t nr_unused_locks;
+atomic_t nr_cyclic_checks;
+atomic_t nr_cyclic_check_recursions;
+atomic_t nr_find_usage_forwards_checks;
+atomic_t nr_find_usage_forwards_recursions;
+atomic_t nr_find_usage_backwards_checks;
+atomic_t nr_find_usage_backwards_recursions;
+# define debug_atomic_inc(ptr) atomic_inc(ptr)
+# define debug_atomic_dec(ptr) atomic_dec(ptr)
+# define debug_atomic_read(ptr) atomic_read(ptr)
+#else
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
+
+/*
+ * Locking printouts:
+ */
+
+static const char *usage_str[] =
+{
+ [LOCK_USED] = "initial-use ",
+ [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
+ [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
+ [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
+ [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
+ [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
+ [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
+ [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
+ [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
+};
+
+const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+{
+ return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
+}
+
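+/*
+ * Encode a lock class's irq usage as four characters:
+ *  '+' - the class was used in that irq context,
+ *  '-' - the class was acquired with that irq type enabled,
+ *  '?' - both (possible for the read-usage characters only),
+ *  '.' - neither.
+ */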
+void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
+{
+ *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
+
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ *c1 = '+';
+ else
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ *c1 = '-';
+
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ *c2 = '+';
+ else
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ *c2 = '-';
+
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ *c3 = '-';
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
+ *c3 = '+';
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ *c3 = '?';
+ }
+
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ *c4 = '-';
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
+ *c4 = '+';
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ *c4 = '?';
+ }
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+ char str[KSYM_NAME_LEN], c1, c2, c3, c4;
+ const char *name;
+
+ get_usage_chars(class, &c1, &c2, &c3, &c4);
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ printk(" (%s", name);
+ } else {
+ printk(" (%s", name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ if (class->subclass)
+ printk("/%d", class->subclass);
+ }
+ printk("){%c%c%c%c}", c1, c2, c3, c4);
+}
+
+static void print_lockdep_cache(struct lockdep_map *lock)
+{
+ const char *name;
+ char str[KSYM_NAME_LEN];
+
+ name = lock->name;
+ if (!name)
+ name = __get_key_name(lock->key->subkeys, str);
+
+ printk("%s", name);
+}
+
+static void print_lock(struct held_lock *hlock)
+{
+ print_lock_name(hlock_class(hlock));
+ printk(", at: ");
+ print_ip_sym(hlock->acquire_ip);
+}
+
+static void lockdep_print_held_locks(struct task_struct *curr)
+{
+ int i, depth = curr->lockdep_depth;
+
+ if (!depth) {
+ printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+ return;
+ }
+ printk("%d lock%s held by %s/%d:\n",
+ depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
+
+ for (i = 0; i < depth; i++) {
+ printk(" #%d: ", i);
+ print_lock(curr->held_locks + i);
+ }
+}
+
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
+
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(" ops: %lu", class->ops);
+ printk(" {\n");
+
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(" at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
+ }
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: ",depth,"");
+ print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk all lock dependencies starting at <entry>:
+ */
+static void print_lock_dependencies(struct lock_class *class, int depth)
+{
+ struct lock_list *entry;
+
+ if (lockdep_dependency_visit(class, depth))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(depth >= 20))
+ return;
+
+ print_lock_class_header(class, depth);
+
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (DEBUG_LOCKS_WARN_ON(!entry->class))
+ return;
+
+ print_lock_dependencies(entry->class, depth + 1);
+
+ printk("%*s ... acquired at:\n",depth,"");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+ }
+}
+
+static void print_kernel_version(void)
+{
+ printk("%s %.*s\n", init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+}
+
+static int very_verbose(struct lock_class *class)
+{
+#if VERY_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+/*
+ * Is this the address of a static object:
+ */
+static int static_obj(void *obj)
+{
+ unsigned long start = (unsigned long) &_stext,
+ end = (unsigned long) &_end,
+ addr = (unsigned long) obj;
+#ifdef CONFIG_SMP
+ int i;
+#endif
+
+ /*
+ * static variable?
+ */
+ if ((addr >= start) && (addr < end))
+ return 1;
+
+#ifdef CONFIG_SMP
+ /*
+ * percpu var?
+ */
+ for_each_possible_cpu(i) {
+ start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
+ end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
+ + per_cpu_offset(i);
+
+ if ((addr >= start) && (addr < end))
+ return 1;
+ }
+#endif
+
+ /*
+ * module var?
+ */
+ return is_module_address(addr);
+}
+
+/*
+ * To make lock name printouts unique, we calculate a unique
+ * class->name_version generation counter:
+ */
+static int count_matching_names(struct lock_class *new_class)
+{
+ struct lock_class *class;
+ int count = 0;
+
+ if (!new_class->name)
+ return 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ if (new_class->key - new_class->subclass == class->key)
+ return class->name_version;
+ if (class->name && !strcmp(class->name, new_class->name))
+ count = max(count, class->name_version);
+ }
+
+ return count + 1;
+}
+
+/*
+ * Look up a lock's class in the hash-table. The caller caches the
+ * result in the lock object itself, so the actual hash lookup should
+ * only happen once per lock object.
+ */
+static inline struct lock_class *
+look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * If the architecture calls into lockdep before initializing
+ * the hashes then we'll warn about it later. (we cannot printk
+ * right now)
+ */
+ if (unlikely(!lockdep_initialized)) {
+ lockdep_init();
+ lockdep_init_error = 1;
+ save_stack_trace(&lockdep_init_trace);
+ }
+#endif
+
+ /*
+ * Static locks do not have their class-keys yet - for them the key
+ * is the lock object itself:
+ */
+ if (unlikely(!lock->key))
+ lock->key = (void *)lock;
+
+ /*
+ * NOTE: the class-key must be unique. For dynamic locks, a static
+ * lock_class_key variable is passed in through the mutex_init()
+ * (or spin_lock_init()) call - which acts as the key. For static
+ * locks we use the lock object itself as the key.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) >
+ sizeof(struct lockdep_map));
+
+ key = lock->key->subkeys + subclass;
+
+ hash_head = classhashentry(key);
+
+ /*
+ * We can walk the hash lockfree, because the hash only
+ * grows, and we are careful when adding entries to the end:
+ */
+ list_for_each_entry(class, hash_head, hash_entry) {
+ if (class->key == key) {
+ WARN_ON_ONCE(class->name != lock->name);
+ return class;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+ unsigned long flags;
+
+ class = look_up_lock_class(lock, subclass);
+ if (likely(class))
+ return class;
+
+ /*
+ * Debug-check: all keys must be persistent!
+ */
+ if (!static_obj(lock->key)) {
+ debug_locks_off();
+ printk("INFO: trying to register non-static key.\n");
+ printk("the code is fine but needs lockdep annotation.\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+
+ return NULL;
+ }
+
+ key = lock->key->subkeys + subclass;
+ hash_head = classhashentry(key);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock()) {
+ raw_local_irq_restore(flags);
+ return NULL;
+ }
+ /*
+ * We have to do the hash-walk again, to avoid races
+ * with another CPU:
+ */
+ list_for_each_entry(class, hash_head, hash_entry)
+ if (class->key == key)
+ goto out_unlock_set;
+ /*
+ * Allocate a new key from the static array, and add it to
+ * the hash:
+ */
+ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ if (!debug_locks_off_graph_unlock()) {
+ raw_local_irq_restore(flags);
+ return NULL;
+ }
+ raw_local_irq_restore(flags);
+
+ printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return NULL;
+ }
+ class = lock_classes + nr_lock_classes++;
+ debug_atomic_inc(&nr_unused_locks);
+ class->key = key;
+ class->name = lock->name;
+ class->subclass = subclass;
+ INIT_LIST_HEAD(&class->lock_entry);
+ INIT_LIST_HEAD(&class->locks_before);
+ INIT_LIST_HEAD(&class->locks_after);
+ class->name_version = count_matching_names(class);
+ /*
+ * We use RCU's safe list-add method to make
+ * parallel walking of the hash-list safe:
+ */
+ list_add_tail_rcu(&class->hash_entry, hash_head);
+ /*
+ * Add it to the global list of classes:
+ */
+ list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+
+ if (verbose(class)) {
+ graph_unlock();
+ raw_local_irq_restore(flags);
+
+ printk("\nnew class %p: %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+
+ raw_local_irq_save(flags);
+ if (!graph_lock()) {
+ raw_local_irq_restore(flags);
+ return NULL;
+ }
+ }
+out_unlock_set:
+ graph_unlock();
+ raw_local_irq_restore(flags);
+
+ if (!subclass || force)
+ lock->class_cache = class;
+
+ if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
+ return NULL;
+
+ return class;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * Allocate a lockdep entry. (assumes the graph_lock is held, returns
+ * NULL on failure)
+ */
+static struct lock_list *alloc_list_entry(void)
+{
+ if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ if (!debug_locks_off_graph_unlock())
+ return NULL;
+
+ printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return NULL;
+ }
+ return list_entries + nr_list_entries++;
+}
+
+/*
+ * Add a new dependency to the head of the list:
+ */
+static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
+ struct list_head *head, unsigned long ip, int distance)
+{
+ struct lock_list *entry;
+ /*
+ * Lock not present yet - get a new dependency struct and
+ * add it to the list:
+ */
+ entry = alloc_list_entry();
+ if (!entry)
+ return 0;
+
+ if (!save_trace(&entry->trace))
+ return 0;
+
+ entry->class = this;
+ entry->distance = distance;
+ /*
+ * Since we never remove from the dependency list, the list can
+	 * be walked lockless by other CPUs; it's only allocation
+ * that must be protected by the spinlock. But this also means
+ * we must make new entries visible only once writes to the
+ * entry become visible - hence the RCU op:
+ */
+ list_add_tail_rcu(&entry->entry, head);
+
+ return 1;
+}
+
+/*
+ * Recursive, forwards-direction lock-dependency checking, used for
+ * both noncyclic checking and hardirq-unsafe/softirq-unsafe
+ * checking.
+ *
+ * (to keep the stackframe of the recursive functions small we
+ * use these global variables, and we also mark various helper
+ * functions as noinline.)
+ */
+static struct held_lock *check_source, *check_target;
+
+/*
+ * Print a dependency chain entry (this is only done when a deadlock
+ * has been detected):
+ */
+static noinline int
+print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+{
+ if (debug_locks_silent)
+ return 0;
+ printk("\n-> #%u", depth);
+ print_lock_name(target->class);
+ printk(":\n");
+ print_stack_trace(&target->trace, 6);
+
+ return 0;
+}
+
+/*
+ * When a circular dependency is detected, print the
+ * header first:
+ */
+static noinline int
+print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+{
+ struct task_struct *curr = current;
+
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n=======================================================\n");
+ printk( "[ INFO: possible circular locking dependency detected ]\n");
+ print_kernel_version();
+ printk( "-------------------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(check_source);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(check_target);
+ printk("\nwhich lock already depends on the new lock.\n\n");
+ printk("\nthe existing dependency chain (in reverse order) is:\n");
+
+ print_circular_bug_entry(entry, depth);
+
+ return 0;
+}
+
+static noinline int print_circular_bug_tail(void)
+{
+ struct task_struct *curr = current;
+ struct lock_list this;
+
+ if (debug_locks_silent)
+ return 0;
+
+ this.class = hlock_class(check_source);
+ if (!save_trace(&this.trace))
+ return 0;
+
+ print_circular_bug_entry(&this, 0);
+
+ printk("\nother info that might help us debug this:\n\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+#define RECURSION_LIMIT 40
+
+static int noinline print_infinite_recursion_bug(void)
+{
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ WARN_ON(1);
+
+ return 0;
+}
+
+unsigned long __lockdep_count_forward_deps(struct lock_class *class,
+ unsigned int depth)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ if (lockdep_dependency_visit(class, depth))
+ return 0;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_after, entry)
+ ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+
+ return ret;
+}
+
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_forward_deps(class, 0);
+ __raw_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+unsigned long __lockdep_count_backward_deps(struct lock_class *class,
+ unsigned int depth)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ if (lockdep_dependency_visit(class, depth))
+ return 0;
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_before, entry)
+ ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+
+ return ret;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_backward_deps(class, 0);
+ __raw_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/*
+ * Prove that the dependency graph starting at <entry> cannot
+ * lead to <target>. Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
+ debug_atomic_inc(&nr_cyclic_check_recursions);
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= RECURSION_LIMIT)
+ return print_infinite_recursion_bug();
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_after, entry) {
+ if (entry->class == hlock_class(check_target))
+ return print_circular_bug_header(entry, depth+1);
+ debug_atomic_inc(&nr_cyclic_checks);
+ if (!check_noncircular(entry->class, depth+1))
+ return print_circular_bug_entry(entry, depth+1);
+ }
+ return 1;
+}
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+/*
+ * Forwards and backwards subgraph searching, for the purposes of
+ * proving that two subgraphs can be connected by a new dependency
+ * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ */
+static enum lock_usage_bit find_usage_bit;
+static struct lock_class *forwards_match, *backwards_match;
+
+/*
+ * Find a node in the forwards-direction dependency sub-graph starting
+ * at <source> that matches <find_usage_bit>.
+ *
+ * Return 2 if such a node exists in the subgraph, and put that node
+ * into <forwards_match>.
+ *
+ * Return 1 otherwise and keep <forwards_match> unchanged.
+ * Return 0 on error.
+ */
+static noinline int
+find_usage_forwards(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+ int ret;
+
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= RECURSION_LIMIT)
+ return print_infinite_recursion_bug();
+
+ debug_atomic_inc(&nr_find_usage_forwards_checks);
+ if (source->usage_mask & (1 << find_usage_bit)) {
+ forwards_match = source;
+ return 2;
+ }
+
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_after, entry) {
+ debug_atomic_inc(&nr_find_usage_forwards_recursions);
+ ret = find_usage_forwards(entry->class, depth+1);
+ if (ret == 2 || ret == 0)
+ return ret;
+ }
+ return 1;
+}
+
+/*
+ * Find a node in the backwards-direction dependency sub-graph starting
+ * at <source> that matches <find_usage_bit>.
+ *
+ * Return 2 if such a node exists in the subgraph, and put that node
+ * into <backwards_match>.
+ *
+ * Return 1 otherwise and keep <backwards_match> unchanged.
+ * Return 0 on error.
+ */
+static noinline int
+find_usage_backwards(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+ int ret;
+
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
+ if (!__raw_spin_is_locked(&lockdep_lock))
+ return DEBUG_LOCKS_WARN_ON(1);
+
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= RECURSION_LIMIT)
+ return print_infinite_recursion_bug();
+
+ debug_atomic_inc(&nr_find_usage_backwards_checks);
+ if (source->usage_mask & (1 << find_usage_bit)) {
+ backwards_match = source;
+ return 2;
+ }
+
+ if (!source && debug_locks_off_graph_unlock()) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_before, entry) {
+ debug_atomic_inc(&nr_find_usage_backwards_recursions);
+ ret = find_usage_backwards(entry->class, depth+1);
+ if (ret == 2 || ret == 0)
+ return ret;
+ }
+ return 1;
+}
+
+static int
+print_bad_irq_dependency(struct task_struct *curr,
+ struct held_lock *prev,
+ struct held_lock *next,
+ enum lock_usage_bit bit1,
+ enum lock_usage_bit bit2,
+ const char *irqclass)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n======================================================\n");
+ printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ irqclass, irqclass);
+ print_kernel_version();
+ printk( "------------------------------------------------------\n");
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ curr->comm, task_pid_nr(curr),
+ curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
+ curr->hardirqs_enabled,
+ curr->softirqs_enabled);
+ print_lock(next);
+
+ printk("\nand this task is already holding:\n");
+ print_lock(prev);
+ printk("which would create a new lock dependency:\n");
+ print_lock_name(hlock_class(prev));
+ printk(" ->");
+ print_lock_name(hlock_class(next));
+ printk("\n");
+
+ printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ irqclass);
+ print_lock_name(backwards_match);
+ printk("\n... which became %s-irq-safe at:\n", irqclass);
+
+ print_stack_trace(backwards_match->usage_traces + bit1, 1);
+
+ printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+ print_lock_name(forwards_match);
+ printk("\n... which became %s-irq-unsafe at:\n", irqclass);
+ printk("...");
+
+ print_stack_trace(forwards_match->usage_traces + bit2, 1);
+
+ printk("\nother info that might help us debug this:\n\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
+ print_lock_dependencies(backwards_match, 0);
+
+ printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
+ print_lock_dependencies(forwards_match, 0);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int
+check_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit_backwards,
+ enum lock_usage_bit bit_forwards, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit_backwards;
+ /* fills in <backwards_match> */
+ ret = find_usage_backwards(hlock_class(prev), 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ find_usage_bit = bit_forwards;
+ ret = find_usage_forwards(hlock_class(next), 0);
+ if (!ret || ret == 1)
+ return ret;
+ /* ret == 2 */
+ return print_bad_irq_dependency(curr, prev, next,
+ bit_backwards, bit_forwards, irqclass);
+}
+
+static int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe-read
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
+ LOCK_ENABLED_HARDIRQS, "hard-read"))
+ return 0;
+
+ /*
+ * Prove that the new dependency does not connect a softirq-safe
+ * lock with a softirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+ /*
+ * Prove that the new dependency does not connect a softirq-safe-read
+ * lock with a softirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+
+ return 1;
+}
+
+static void inc_chains(void)
+{
+ if (current->hardirq_context)
+ nr_hardirq_chains++;
+ else {
+ if (current->softirq_context)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
+ }
+}
+
+#else
+
+static inline int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ return 1;
+}
+
+static inline void inc_chains(void)
+{
+ nr_process_chains++;
+}
+
+#endif
+
+static int
+print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n=============================================\n");
+ printk( "[ INFO: possible recursive locking detected ]\n");
+ print_kernel_version();
+ printk( "---------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(next);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(prev);
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Check whether we are holding such a class already.
+ *
+ * (Note that this has to be done separately, because the graph cannot
+ * detect such classes of deadlocks.)
+ *
+ * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ */
+static int
+check_deadlock(struct task_struct *curr, struct held_lock *next,
+ struct lockdep_map *next_instance, int read)
+{
+ struct held_lock *prev;
+ struct held_lock *nest = NULL;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ prev = curr->held_locks + i;
+
+ if (prev->instance == next->nest_lock)
+ nest = prev;
+
+ if (hlock_class(prev) != hlock_class(next))
+ continue;
+
+ /*
+ * Allow read-after-read recursion of the same
+ * lock class (i.e. read_lock(lock)+read_lock(lock)):
+ */
+ if ((read == 2) && prev->read)
+ return 2;
+
+ /*
+ * We're holding the nest_lock, which serializes this lock's
+ * nesting behaviour.
+ */
+ if (nest)
+ return 2;
+
+ return print_deadlock_bug(curr, prev, next);
+ }
+ return 1;
+}
+
+/*
+ * There was a chain-cache miss, and we are about to add a new dependency
+ * to a previous lock. We recursively validate the following rules:
+ *
+ * - would the adding of the <prev> -> <next> dependency create a
+ * circular dependency in the graph? [== circular deadlock]
+ *
+ * - does the new prev->next dependency connect any hardirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * hardirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with hardirq contexts]
+ *
+ * - does the new prev->next dependency connect any softirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * softirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with softirq contexts]
+ *
+ * any of these scenarios could lead to a deadlock.
+ *
+ * Then if all the validations pass, we add the forwards and backwards
+ * dependency.
+ */
+static int
+check_prev_add(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, int distance)
+{
+ struct lock_list *entry;
+ int ret;
+
+ /*
+ * Prove that the new <prev> -> <next> dependency would not
+ * create a circular dependency in the graph. (We do this by
+ * forward-recursing into the graph starting at <next>, and
+ * checking whether we can reach <prev>.)
+ *
+ * We are using global variables to control the recursion, to
+ * keep the stackframe size of the recursive functions low:
+ */
+ check_source = next;
+ check_target = prev;
+ if (!(check_noncircular(hlock_class(next), 0)))
+ return print_circular_bug_tail();
+
+ if (!check_prev_add_irq(curr, prev, next))
+ return 0;
+
+ /*
+ * For recursive read-locks we do all the dependency checks,
+	 * but we don't store read-triggered dependencies (only
+ * write-triggered dependencies). This ensures that only the
+ * write-side dependencies matter, and that if for example a
+ * write-lock never takes any other locks, then the reads are
+ * equivalent to a NOP.
+ */
+ if (next->read == 2 || prev->read == 2)
+ return 1;
+ /*
+ * Is the <prev> -> <next> dependency already present?
+ *
+ * (this may occur even though this is a new chain: consider
+ * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
+ * chains - the second one will be new, but L1 already has
+ * L2 added to its dependency list, due to the first chain.)
+ */
+ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+ if (entry->class == hlock_class(next)) {
+ if (distance == 1)
+ entry->distance = 1;
+ return 2;
+ }
+ }
+
+ /*
+ * Ok, all validations passed, add the new lock
+ * to the previous lock's dependency list:
+ */
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(prev)->locks_after,
+ next->acquire_ip, distance);
+
+ if (!ret)
+ return 0;
+
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(next)->locks_before,
+ next->acquire_ip, distance);
+ if (!ret)
+ return 0;
+
+ /*
+ * Debugging printouts:
+ */
+ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+ graph_unlock();
+ printk("\n new dependency: ");
+ print_lock_name(hlock_class(prev));
+ printk(" => ");
+ print_lock_name(hlock_class(next));
+ printk("\n");
+ dump_stack();
+ return graph_lock();
+ }
+ return 1;
+}
+
+/*
+ * Add the dependency to all directly-previous locks that are 'relevant'.
+ * The ones that are relevant are (in increasing distance from curr):
+ * all consecutive trylock entries and the final non-trylock entry - or
+ * the end of this context's lock-chain - whichever comes first.
+ */
+static int
+check_prevs_add(struct task_struct *curr, struct held_lock *next)
+{
+ int depth = curr->lockdep_depth;
+ struct held_lock *hlock;
+
+ /*
+ * Debugging checks.
+ *
+ * Depth must not be zero for a non-head lock:
+ */
+ if (!depth)
+ goto out_bug;
+ /*
+ * At least two relevant locks must exist for this
+ * to be a head:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ goto out_bug;
+
+ for (;;) {
+ int distance = curr->lockdep_depth - depth + 1;
+ hlock = curr->held_locks + depth-1;
+ /*
+ * Only non-recursive-read entries get new dependencies
+ * added:
+ */
+ if (hlock->read != 2) {
+ if (!check_prev_add(curr, hlock, next, distance))
+ return 0;
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
+ depth--;
+ /*
+ * End of lock-stack?
+ */
+ if (!depth)
+ break;
+ /*
+ * Stop the search if we cross into another context:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ break;
+ }
+ return 1;
+out_bug:
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ WARN_ON(1);
+
+ return 0;
+}
+
+unsigned long nr_lock_chains;
+struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+int nr_chain_hlocks;
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+
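+/* Map the i-th held-lock index recorded for a chain back to its lock class: */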
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+ return lock_classes + chain_hlocks[chain->base + i];
+}
+
+/*
+ * Look up a dependency chain. If the key is not present yet then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct list_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ struct held_lock *hlock_curr, *hlock_next;
+ int i, j, n, cn;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+cache_hit:
+ debug_atomic_inc(&chain_lookup_hits);
+ if (very_verbose(class))
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ return 0;
+ }
+ }
+ if (very_verbose(class))
+ printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
+ if (!graph_lock())
+ return 0;
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ graph_unlock();
+ goto cache_hit;
+ }
+ }
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = hlock->irq_context;
+ /* Find the first held_lock of current chain */
+ hlock_next = hlock;
+ for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+ hlock_curr = curr->held_locks + i;
+ if (hlock_curr->irq_context != hlock_next->irq_context)
+ break;
+ hlock_next = hlock;
+ }
+ i++;
+ chain->depth = curr->lockdep_depth + 1 - i;
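+	/*
+	 * Reserve chain->depth slots in chain_hlocks[] with a lock-free
+	 * cmpxchg loop; if that would overflow the array, the held-lock
+	 * indices for this chain are simply not recorded:
+	 */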
+ cn = nr_chain_hlocks;
+ while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) {
+ n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth);
+ if (n == cn)
+ break;
+ cn = n;
+ }
+ if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = cn;
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ int lock_id = curr->held_locks[i].class_idx - 1;
+ chain_hlocks[chain->base + j] = lock_id;
+ }
+ chain_hlocks[chain->base + j] = class - lock_classes;
+ }
+ list_add_tail_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(&chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
+ struct held_lock *hlock, int chain_head, u64 chain_key)
+{
+ /*
+ * Trylock needs to maintain the stack of held locks, but it
+ * does not add new dependencies, because trylock can be done
+ * in any order.
+ *
+ * We look up the chain_key and do the O(N^2) check and update of
+ * the dependencies only if this is a new dependency chain.
+ * (If lookup_chain_cache() returns with 1 it acquires
+ * graph_lock for us)
+ */
+ if (!hlock->trylock && (hlock->check == 2) &&
+ lookup_chain_cache(curr, hlock, chain_key)) {
+ /*
+ * Check whether last held lock:
+ *
+ * - is irq-safe, if this lock is irq-unsafe
+ * - is softirq-safe, if this lock is hardirq-unsafe
+ *
+ * And check whether the new lock's dependency graph
+ * could lead back to the previous lock.
+ *
+		 * any of these scenarios could lead to a deadlock. If all
+		 * validations pass, we add the new dependencies to the graph.
+ */
+ int ret = check_deadlock(curr, hlock, lock, hlock->read);
+
+ if (!ret)
+ return 0;
+ /*
+ * Mark recursive read, as we jump over it when
+ * building dependencies (just like we jump over
+ * trylock entries):
+ */
+ if (ret == 2)
+ hlock->read = 2;
+ /*
+ * Add dependency only if this lock is not the head
+ * of the chain, and if it's not a secondary read-lock:
+ */
+ if (!chain_head && ret != 2)
+ if (!check_prevs_add(curr, hlock))
+ return 0;
+ graph_unlock();
+ } else
+ /* after lookup_chain_cache(): */
+ if (unlikely(!debug_locks))
+ return 0;
+
+ return 1;
+}
+#else
+static inline int validate_chain(struct task_struct *curr,
+ struct lockdep_map *lock, struct held_lock *hlock,
+ int chain_head, u64 chain_key)
+{
+ return 1;
+}
+#endif
+
+/*
+ * We are building curr_chain_key incrementally, so double-check
+ * it from scratch, to make sure that it's done correctly:
+ */
+static void check_chain_key(struct task_struct *curr)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ struct held_lock *hlock, *prev_hlock = NULL;
+ unsigned int i, id;
+ u64 chain_key = 0;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+ if (chain_key != hlock->prev_chain_key) {
+ debug_locks_off();
+ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)hlock->prev_chain_key);
+ return;
+ }
+ id = hlock->class_idx - 1;
+ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ return;
+
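+		/* The chain key restarts whenever we cross into a different irq context: */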
+ if (prev_hlock && (prev_hlock->irq_context !=
+ hlock->irq_context))
+ chain_key = 0;
+ chain_key = iterate_chain_key(chain_key, id);
+ prev_hlock = hlock;
+ }
+ if (chain_key != curr->curr_chain_key) {
+ debug_locks_off();
+ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)curr->curr_chain_key);
+ }
+#endif
+}
+
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n=================================\n");
+ printk( "[ INFO: inconsistent lock state ]\n");
+ print_kernel_version();
+ printk( "---------------------------------\n");
+
+ printk("inconsistent {%s} -> {%s} usage.\n",
+ usage_str[prev_bit], usage_str[new_bit]);
+
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ curr->comm, task_pid_nr(curr),
+ trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ trace_hardirqs_enabled(curr),
+ trace_softirqs_enabled(curr));
+ print_lock(this);
+
+ printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+ print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+
+ print_irqtrace_events(curr);
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set:
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
+ return print_usage_bug(curr, this, bad_bit, new_bit);
+ return 1;
+}
+
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+
+/*
+ * print irq inversion bug:
+ */
+static int
+print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
+ struct held_lock *this, int forwards,
+ const char *irqclass)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n=========================================================\n");
+ printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
+ print_kernel_version();
+ printk( "---------------------------------------------------------\n");
+ printk("%s/%d just changed the state of lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(this);
+ if (forwards)
+ printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+ else
+ printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
+ print_lock_name(other);
+ printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe first lock's dependencies:\n");
+ print_lock_dependencies(hlock_class(this), 0);
+
+ printk("\nthe second lock's dependencies:\n");
+ print_lock_dependencies(other, 0);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Prove that in the forwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_forwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit;
+ /* fills in <forwards_match> */
+ ret = find_usage_forwards(hlock_class(this), 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+}
+
+/*
+ * Prove that in the backwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_backwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit;
+ /* fills in <backwards_match> */
+ ret = find_usage_backwards(hlock_class(this), 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+}
+
+void print_irqtrace_events(struct task_struct *curr)
+{
+ printk("irq event stamp: %u\n", curr->irq_events);
+ printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
+ print_ip_sym(curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
+ print_ip_sym(curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
+ print_ip_sym(curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
+ print_ip_sym(curr->softirq_disable_ip);
+}
+
+static int hardirq_verbose(struct lock_class *class)
+{
+#if HARDIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+static int softirq_verbose(struct lock_class *class)
+{
+#if SOFTIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#define STRICT_READ_CHECKS 1
+
+static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ int ret = 1;
+
+ switch(new_bit) {
+ case LOCK_USED_IN_HARDIRQ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_ENABLED_HARDIRQS_READ))
+ return 0;
+ /*
+ * just marked it hardirq-safe, check that this lock
+ * took no hardirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-safe, check that this lock
+ * took no hardirq-unsafe-read lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
+ return 0;
+#endif
+ if (hardirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_SOFTIRQ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_ENABLED_SOFTIRQS_READ))
+ return 0;
+ /*
+ * just marked it softirq-safe, check that this lock
+ * took no softirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-safe, check that this lock
+ * took no softirq-unsafe-read lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
+ return 0;
+#endif
+ if (softirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_HARDIRQ_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+ return 0;
+ /*
+ * just marked it hardirq-read-safe, check that this lock
+ * took no hardirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+ if (hardirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_SOFTIRQ_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ /*
+ * just marked it softirq-read-safe, check that this lock
+ * took no softirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+ if (softirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_HARDIRQS:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_USED_IN_HARDIRQ_READ))
+ return 0;
+ /*
+ * just marked it hardirq-unsafe, check that no hardirq-safe
+ * lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ, "hard"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-unsafe, check that no
+ * hardirq-safe-read lock in the system ever took
+ * it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
+ return 0;
+#endif
+ if (hardirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_SOFTIRQS:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_USED_IN_SOFTIRQ_READ))
+ return 0;
+ /*
+ * just marked it softirq-unsafe, check that no softirq-safe
+ * lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ, "soft"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-unsafe, check that no
+ * softirq-safe-read lock in the system ever took
+ * it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
+ return 0;
+#endif
+ if (softirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_HARDIRQS_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-read-unsafe, check that no
+ * hardirq-safe lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ, "hard"))
+ return 0;
+#endif
+ if (hardirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_SOFTIRQS_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-read-unsafe, check that no
+ * softirq-safe lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ, "soft"))
+ return 0;
+#endif
+ if (softirq_verbose(hlock_class(this)))
+ ret = 2;
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Mark all held locks with a usage bit:
+ */
+static int
+mark_held_locks(struct task_struct *curr, int hardirq)
+{
+ enum lock_usage_bit usage_bit;
+ struct held_lock *hlock;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ if (hardirq) {
+ if (hlock->read)
+ usage_bit = LOCK_ENABLED_HARDIRQS_READ;
+ else
+ usage_bit = LOCK_ENABLED_HARDIRQS;
+ } else {
+ if (hlock->read)
+ usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
+ else
+ usage_bit = LOCK_ENABLED_SOFTIRQS;
+ }
+ if (!mark_lock(curr, hlock, usage_bit))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Debugging helper: via this flag we know that we are in
+ * 'early bootup code', and will warn about any invalid irqs-on event:
+ */
+static int early_boot_irqs_enabled;
+
+void early_boot_irqs_off(void)
+{
+ early_boot_irqs_enabled = 0;
+}
+
+void early_boot_irqs_on(void)
+{
+ early_boot_irqs_enabled = 1;
+}
+
+/*
+ * Hardirqs will be enabled:
+ */
+void trace_hardirqs_on_caller(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ time_hardirqs_on(CALLER_ADDR0, ip);
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+ return;
+
+ if (unlikely(curr->hardirqs_enabled)) {
+ debug_atomic_inc(&redundant_hardirqs_on);
+ return;
+ }
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ return;
+ /*
+ * We are going to turn hardirqs on, so set the
+ * usage bit for all held locks:
+ */
+ if (!mark_held_locks(curr, 1))
+ return;
+ /*
+ * If we have softirqs enabled, then set the usage
+ * bit for all held locks. (disabled hardirqs prevented
+ * this bit from being set before)
+ */
+ if (curr->softirqs_enabled)
+ if (!mark_held_locks(curr, 0))
+ return;
+
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(&hardirqs_on_events);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_on(void)
+{
+ trace_hardirqs_on_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+/*
+ * Hardirqs were disabled:
+ */
+void trace_hardirqs_off_caller(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ time_hardirqs_off(CALLER_ADDR0, ip);
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->hardirqs_enabled = 0;
+ curr->hardirq_disable_ip = ip;
+ curr->hardirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(&hardirqs_off_events);
+ } else
+ debug_atomic_inc(&redundant_hardirqs_off);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+void trace_hardirqs_off(void)
+{
+ trace_hardirqs_off_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
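
The two tracepoints above implement a small per-task state machine: real OFF->ON and ON->OFF transitions bump the event counters, while calls that do not change state are only counted as redundant. A minimal userspace sketch of that accounting (plain globals stand in for the per-task fields and atomic counters; all names here are illustrative):

#include <stdio.h>

/* Illustrative only: per-"task" irq-trace state mirroring the OFF->ON /
 * ON->OFF accounting done by trace_hardirqs_on/off above. */
static int hardirqs_enabled;
static long on_events, off_events, redundant_on, redundant_off;

static void hardirqs_on(void)
{
	if (hardirqs_enabled) {
		redundant_on++;		/* no state change: count it only */
		return;
	}
	hardirqs_enabled = 1;		/* OFF -> ON transition */
	on_events++;
}

static void hardirqs_off(void)
{
	if (!hardirqs_enabled) {
		redundant_off++;
		return;
	}
	hardirqs_enabled = 0;		/* ON -> OFF transition */
	off_events++;
}

int main(void)
{
	hardirqs_on();
	hardirqs_on();			/* redundant: state is already ON */
	hardirqs_off();
	printf("on=%ld off=%ld redundant_on=%ld redundant_off=%ld\n",
	       on_events, off_events, redundant_on, redundant_off);
	return 0;
}
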
+
+/*
+ * Softirqs will be enabled:
+ */
+void trace_softirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ debug_atomic_inc(&redundant_softirqs_on);
+ return;
+ }
+
+ /*
+ * We'll do an OFF -> ON transition:
+ */
+ curr->softirqs_enabled = 1;
+ curr->softirq_enable_ip = ip;
+ curr->softirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(&softirqs_on_events);
+ /*
+ * We are going to turn softirqs on, so set the
+ * usage bit for all held locks, if hardirqs are
+ * enabled too:
+ */
+ if (curr->hardirqs_enabled)
+ mark_held_locks(curr, 0);
+}
+
+/*
+ * Softirqs were disabled:
+ */
+void trace_softirqs_off(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->softirqs_enabled = 0;
+ curr->softirq_disable_ip = ip;
+ curr->softirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(&softirqs_off_events);
+ DEBUG_LOCKS_WARN_ON(!softirq_count());
+ } else
+ debug_atomic_inc(&redundant_softirqs_off);
+}
+
+static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+{
+ /*
+	 * If this is a non-trylock use in a hardirq or softirq context,
+	 * then mark the lock as used in these contexts:
+ */
+ if (!hlock->trylock) {
+ if (hlock->read) {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_HARDIRQ_READ))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_SOFTIRQ_READ))
+ return 0;
+ } else {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+ }
+ }
+ if (!hlock->hardirqs_off) {
+ if (hlock->read) {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQS_READ))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQS_READ))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQS))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ unsigned int depth = curr->lockdep_depth;
+
+ /*
+ * Keep track of points where we cross into an interrupt context:
+ */
+ hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
+ curr->softirq_context;
+ if (depth) {
+ struct held_lock *prev_hlock;
+
+ prev_hlock = curr->held_locks + depth-1;
+ /*
+ * If we cross into another context, reset the
+ * hash key (this also prevents the checking and the
+ * adding of the dependency to 'prev'):
+ */
+ if (prev_hlock->irq_context != hlock->irq_context)
+ return 1;
+ }
+ return 0;
+}
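
separate_irq_context() packs the hardirq and softirq context state into one small integer, so crossing into a different context can be detected with a single comparison against the previously held lock. A standalone sketch of the same encoding, treating both contexts as simple flags (the kernel keeps per-task counters; the names here are illustrative):

#include <stdio.h>

/* Illustrative only: pack (hardirq, softirq) context state into 0..3,
 * mirroring the "2 * hardirq + softirq" encoding used above. */
static unsigned int irq_context(int in_hardirq, int in_softirq)
{
	return 2u * (in_hardirq ? 1 : 0) + (in_softirq ? 1 : 0);
}

int main(void)
{
	unsigned int prev = irq_context(0, 0);	/* plain process context */
	unsigned int cur  = irq_context(0, 1);	/* softirq context */

	/* A single comparison detects that we crossed into another context. */
	if (cur != prev)
		printf("crossed into another context (%u -> %u)\n", prev, cur);
	return 0;
}
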
+
+#else
+
+static inline
+int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ WARN_ON(1);
+ return 1;
+}
+
+static inline int mark_irqflags(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 1;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ unsigned int new_mask = 1 << new_bit, ret = 1;
+
+ /*
+ * If already set then do not dirty the cacheline,
+ * nor do any checks:
+ */
+ if (likely(hlock_class(this)->usage_mask & new_mask))
+ return 1;
+
+ if (!graph_lock())
+ return 0;
+ /*
+	 * Make sure we didn't race:
+ */
+ if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
+ graph_unlock();
+ return 1;
+ }
+
+ hlock_class(this)->usage_mask |= new_mask;
+
+ if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+ return 0;
+
+ switch (new_bit) {
+ case LOCK_USED_IN_HARDIRQ:
+ case LOCK_USED_IN_SOFTIRQ:
+ case LOCK_USED_IN_HARDIRQ_READ:
+ case LOCK_USED_IN_SOFTIRQ_READ:
+ case LOCK_ENABLED_HARDIRQS:
+ case LOCK_ENABLED_SOFTIRQS:
+ case LOCK_ENABLED_HARDIRQS_READ:
+ case LOCK_ENABLED_SOFTIRQS_READ:
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
+ return 0;
+ break;
+ case LOCK_USED:
+ debug_atomic_dec(&nr_unused_locks);
+ break;
+ default:
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+ WARN_ON(1);
+ return 0;
+ }
+
+ graph_unlock();
+
+ /*
+ * We must printk outside of the graph_lock:
+ */
+ if (ret == 2) {
+ printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+ print_lock(this);
+ print_irqtrace_events(curr);
+ dump_stack();
+ }
+
+ return ret;
+}
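
mark_lock() uses a check / lock / re-check pattern: the usage bit is tested without the graph lock first so the common already-set case stays cheap, then re-tested under the lock before being set, to avoid racing with another CPU. A minimal userspace sketch of the same pattern, with a pthread mutex standing in for graph_lock() (names are illustrative, not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Illustrative only: the check / lock / re-check / set pattern used by
 * mark_lock() above, which avoids taking the global graph lock (and
 * dirtying the cache line) when the usage bit is already set. */
static unsigned long usage_mask;
static pthread_mutex_t graph_lock = PTHREAD_MUTEX_INITIALIZER;

static void mark_bit(unsigned int bit)
{
	unsigned long new_mask = 1UL << bit;

	if (usage_mask & new_mask)		/* fast path: already set */
		return;

	pthread_mutex_lock(&graph_lock);
	if (usage_mask & new_mask) {		/* re-check: we raced and lost */
		pthread_mutex_unlock(&graph_lock);
		return;
	}
	usage_mask |= new_mask;			/* record the new usage */
	pthread_mutex_unlock(&graph_lock);
}

int main(void)
{
	mark_bit(3);
	mark_bit(3);	/* second call returns on the fast path */
	printf("usage_mask = %#lx\n", usage_mask);
	return 0;
}
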
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ */
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!key))
+ return;
+ if (DEBUG_LOCKS_WARN_ON(!name))
+ return;
+ /*
+ * Sanity check, the lock-class key must be persistent:
+ */
+ if (!static_obj(key)) {
+ printk("BUG: key %p not in .data!\n", key);
+ DEBUG_LOCKS_WARN_ON(1);
+ return;
+ }
+ lock->name = name;
+ lock->key = key;
+ lock->class_cache = NULL;
+#ifdef CONFIG_LOCK_STAT
+ lock->cpu = raw_smp_processor_id();
+#endif
+ if (subclass)
+ register_lock_class(lock, subclass, 1);
+}
+
+EXPORT_SYMBOL_GPL(lockdep_init_map);
+
+/*
+ * This gets called for every mutex_lock*()/spin_lock*() operation.
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, int hardirqs_off,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct lock_class *class = NULL;
+ struct held_lock *hlock;
+ unsigned int depth, id;
+ int chain_head = 0;
+ u64 chain_key;
+
+ if (!prove_locking)
+ check = 1;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+
+ if (!subclass)
+ class = lock->class_cache;
+ /*
+ * Not cached yet or subclass?
+ */
+ if (unlikely(!class)) {
+ class = register_lock_class(lock, subclass, 0);
+ if (!class)
+ return 0;
+ }
+ debug_atomic_inc((atomic_t *)&class->ops);
+ if (very_verbose(class)) {
+ printk("\nacquire class [%p] %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+ }
+
+ /*
+ * Add the lock to the list of currently held locks.
+	 * (we don't increase the depth just yet, not until the
+ * dependency checks are done)
+ */
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+ return 0;
+
+ hlock = curr->held_locks + depth;
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+ hlock->class_idx = class - lock_classes + 1;
+ hlock->acquire_ip = ip;
+ hlock->instance = lock;
+ hlock->nest_lock = nest_lock;
+ hlock->trylock = trylock;
+ hlock->read = read;
+ hlock->check = check;
+ hlock->hardirqs_off = !!hardirqs_off;
+#ifdef CONFIG_LOCK_STAT
+ hlock->waittime_stamp = 0;
+ hlock->holdtime_stamp = sched_clock();
+#endif
+
+ if (check == 2 && !mark_irqflags(curr, hlock))
+ return 0;
+
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
+ /*
+ * Calculate the chain hash: it's the combined hash of all the
+ * lock keys along the dependency chain. We save the hash value
+ * at every step so that we can get the current hash easily
+ * after unlock. The chain hash is then used to cache dependency
+ * results.
+ *
+	 * The 'key ID' is the most compact key value we can use to drive
+	 * the hash, rather than class->key.
+ */
+ id = class - lock_classes;
+ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ return 0;
+
+ chain_key = curr->curr_chain_key;
+ if (!depth) {
+ if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ return 0;
+ chain_head = 1;
+ }
+
+ hlock->prev_chain_key = chain_key;
+ if (separate_irq_context(curr, hlock)) {
+ chain_key = 0;
+ chain_head = 1;
+ }
+ chain_key = iterate_chain_key(chain_key, id);
+
+ if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ return 0;
+
+ curr->curr_chain_key = chain_key;
+ curr->lockdep_depth++;
+ check_chain_key(curr);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return 0;
+#endif
+ if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+ debug_locks_off();
+ printk("BUG: MAX_LOCK_DEPTH too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+
+ if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+ max_lockdep_depth = curr->lockdep_depth;
+
+ return 1;
+}
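
The chain key described in the comment inside __lock_acquire() is an incrementally folded hash of the class IDs on the current lock stack: each acquire saves the previous key in the held_lock and folds one more ID in, so an unlock can simply restore the saved value. The kernel's iterate_chain_key() is not part of this hunk, so the mixing step below is only a stand-in 64-bit mix illustrating the fold/save/restore structure:

#include <stdint.h>
#include <stdio.h>

/* Stand-in mixing step; the kernel's iterate_chain_key() differs. */
static uint64_t fold_key(uint64_t key, uint64_t class_id)
{
	key ^= class_id + 0x9e3779b97f4a7c15ULL;
	key ^= key >> 33;
	key *= 0xff51afd7ed558ccdULL;
	key ^= key >> 33;
	return key;
}

int main(void)
{
	uint64_t chain_key = 0;			/* empty lock stack */
	uint64_t prev_key[8];			/* per-held-lock saved keys */
	uint64_t class_ids[3] = { 7, 42, 13 };	/* pretend class IDs */
	int depth = 0;

	/* Acquire: save the previous key, then fold in the new class ID. */
	for (int i = 0; i < 3; i++) {
		prev_key[depth] = chain_key;
		chain_key = fold_key(chain_key, class_ids[i]);
		depth++;
		printf("depth %d: chain_key = %#llx\n", depth,
		       (unsigned long long)chain_key);
	}

	/* Nested release: restore the key saved at acquire time. */
	depth--;
	chain_key = prev_key[depth];
	printf("after unlock: chain_key = %#llx\n",
	       (unsigned long long)chain_key);
	return 0;
}
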
+
+static int
+print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=====================================\n");
+ printk( "[ BUG: bad unlock balance detected! ]\n");
+ printk( "-------------------------------------\n");
+ printk("%s/%d is trying to release lock (",
+ curr->comm, task_pid_nr(curr));
+ print_lockdep_cache(lock);
+ printk(") at:\n");
+ print_ip_sym(ip);
+ printk("but there are no more locks to release!\n");
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (unlikely(!debug_locks))
+ return 0;
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (curr->lockdep_depth <= 0)
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+ return 1;
+}
+
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+ unsigned int subclass, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class *class;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+ class = register_lock_class(lock, subclass, 0);
+ hlock->class_idx = class - lock_classes + 1;
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip))
+ return 0;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove a lock from the list of currently held locks in a
+ * potentially non-nested (out of order) manner. This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release()):
+ */
+static int
+lock_release_non_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock, *prev_hlock;
+ unsigned int depth;
+ int i;
+
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+ lock_release_holdtime(hlock);
+
+ /*
+ * We have the right lock to unlock, 'hlock' points to it.
+ * Now we remove it from the stack, and add back the other
+ * entries (if any), recalculating the hash along the way:
+ */
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (i++; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip))
+ return 0;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove a lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static int lock_release_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock;
+ unsigned int depth;
+
+ /*
+ * Pop off the top of the lock stack:
+ */
+ depth = curr->lockdep_depth - 1;
+ hlock = curr->held_locks + depth;
+
+ /*
+ * Is the unlock non-nested:
+ */
+ if (hlock->instance != lock)
+ return lock_release_non_nested(curr, lock, ip);
+ curr->lockdep_depth--;
+
+ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+ return 0;
+
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ lock_release_holdtime(hlock);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ hlock->prev_chain_key = 0;
+ hlock->class_idx = 0;
+ hlock->acquire_ip = 0;
+ hlock->irq_context = 0;
+#endif
+ return 1;
+}
+
+/*
+ * Remove a lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). The unlock may be nested (the current
+ * top of the lock stack) or non-nested; we dispatch accordingly below.
+ */
+static void
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (!check_unlock(curr, lock, ip))
+ return;
+
+ if (nested) {
+ if (!lock_release_nested(curr, lock, ip))
+ return;
+ } else {
+ if (!lock_release_non_nested(curr, lock, ip))
+ return;
+ }
+
+ check_chain_key(curr);
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+ defined(CONFIG_TRACE_IRQFLAGS)
+ if (!debug_locks)
+ return;
+
+ if (irqs_disabled_flags(flags)) {
+ if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ printk("possible reason: unannotated irqs-off.\n");
+ }
+ } else {
+ if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ printk("possible reason: unannotated irqs-on.\n");
+ }
+ }
+
+ /*
+	 * We don't accurately track softirq state in e.g.
+ * hardirq contexts (such as on 4KSTACKS), so only
+ * check if not in hardirq contexts:
+ */
+ if (!hardirq_count()) {
+ if (softirq_count())
+ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+ else
+ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+ }
+
+ if (!debug_locks)
+ print_irqtrace_events(current);
+#endif
+}
+
+void
+lock_set_subclass(struct lockdep_map *lock,
+ unsigned int subclass, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_set_subclass(lock, subclass, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_acquire(lock, subclass, trylock, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_acquire);
+
+void lock_release(struct lockdep_map *lock, int nested,
+ unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_release(lock, nested, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_release);
+
+#ifdef CONFIG_LOCK_STAT
+static int
+print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=================================\n");
+ printk( "[ BUG: bad contention detected! ]\n");
+ printk( "---------------------------------\n");
+ printk("%s/%d is trying to contend lock (",
+ curr->comm, task_pid_nr(curr));
+ print_lockdep_cache(lock);
+ printk(") at:\n");
+ print_ip_sym(ip);
+ printk("but there are no locks held!\n");
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static void
+__lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class_stats *stats;
+ unsigned int depth;
+ int i, point;
+
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ print_lock_contention_bug(curr, lock, ip);
+ return;
+
+found_it:
+ hlock->waittime_stamp = sched_clock();
+
+ point = lock_contention_point(hlock_class(hlock), ip);
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (point < ARRAY_SIZE(stats->contention_point))
+ stats->contention_point[point]++;
+ if (lock->cpu != smp_processor_id())
+ stats->bounces[bounce_contended + !!hlock->read]++;
+ put_lock_stats(stats);
+}
+
+static void
+__lock_acquired(struct lockdep_map *lock)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class_stats *stats;
+ unsigned int depth;
+ u64 now;
+ s64 waittime = 0;
+ int i, cpu;
+
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
+
+found_it:
+ cpu = smp_processor_id();
+ if (hlock->waittime_stamp) {
+ now = sched_clock();
+ waittime = now - hlock->waittime_stamp;
+ hlock->holdtime_stamp = now;
+ }
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (waittime) {
+ if (hlock->read)
+ lock_time_inc(&stats->read_waittime, waittime);
+ else
+ lock_time_inc(&stats->write_waittime, waittime);
+ }
+ if (lock->cpu != cpu)
+ stats->bounces[bounce_acquired + !!hlock->read]++;
+ put_lock_stats(stats);
+
+ lock->cpu = cpu;
+}
+
+void lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lock_stat))
+ return;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_contended(lock, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_contended);
+
+void lock_acquired(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(!lock_stat))
+ return;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_acquired(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquired);
+#endif
+
+/*
+ * Used by the testsuite, sanitize the validator state
+ * after a simulated failure:
+ */
+
+void lockdep_reset(void)
+{
+ unsigned long flags;
+ int i;
+
+ raw_local_irq_save(flags);
+ current->curr_chain_key = 0;
+ current->lockdep_depth = 0;
+ current->lockdep_recursion = 0;
+ memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
+ nr_hardirq_chains = 0;
+ nr_softirq_chains = 0;
+ nr_process_chains = 0;
+ debug_locks = 1;
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
+ raw_local_irq_restore(flags);
+}
+
+static void zap_class(struct lock_class *class)
+{
+ int i;
+
+ /*
+ * Remove all dependencies this lock is
+ * involved in:
+ */
+ for (i = 0; i < nr_list_entries; i++) {
+ if (list_entries[i].class == class)
+ list_del_rcu(&list_entries[i].entry);
+ }
+ /*
+ * Unhash the class and remove it from the all_lock_classes list:
+ */
+ list_del_rcu(&class->hash_entry);
+ list_del_rcu(&class->lock_entry);
+
+ class->key = NULL;
+}
+
+static inline int within(const void *addr, void *start, unsigned long size)
+{
+ return addr >= start && addr < start + size;
+}
+
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ struct lock_class *class, *next;
+ struct list_head *head;
+ unsigned long flags;
+ int i;
+ int locked;
+
+ raw_local_irq_save(flags);
+ locked = graph_lock();
+
+ /*
+ * Unhash all classes that were created by this module:
+ */
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(class, next, head, hash_entry) {
+ if (within(class->key, start, size))
+ zap_class(class);
+ else if (within(class->name, start, size))
+ zap_class(class);
+ }
+ }
+
+ if (locked)
+ graph_unlock();
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ struct lock_class *class, *next;
+ struct list_head *head;
+ unsigned long flags;
+ int i, j;
+ int locked;
+
+ raw_local_irq_save(flags);
+
+ /*
+ * Remove all classes this lock might have:
+ */
+ for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+ /*
+ * If the class exists we look it up and zap it:
+ */
+ class = look_up_lock_class(lock, j);
+ if (class)
+ zap_class(class);
+ }
+ /*
+ * Debug check: in the end all mapped classes should
+ * be gone.
+ */
+ locked = graph_lock();
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(class, next, head, hash_entry) {
+ if (unlikely(class == lock->class_cache)) {
+ if (debug_locks_off_graph_unlock())
+ WARN_ON(1);
+ goto out_restore;
+ }
+ }
+ }
+ if (locked)
+ graph_unlock();
+
+out_restore:
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_init(void)
+{
+ int i;
+
+ /*
+ * Some architectures have their own start_kernel()
+ * code which calls lockdep_init(), while we also
+	 * call lockdep_init() from start_kernel() itself,
+ * and we want to initialize the hashes only once:
+ */
+ if (lockdep_initialized)
+ return;
+
+ for (i = 0; i < CLASSHASH_SIZE; i++)
+ INIT_LIST_HEAD(classhash_table + i);
+
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
+
+ lockdep_initialized = 1;
+}
+
+void __init lockdep_info(void)
+{
+ printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ printk(" memory used by lock dependency info: %lu kB\n",
+ (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
+ sizeof(struct list_head) * CLASSHASH_SIZE +
+ sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
+ sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
+ sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
+
+ printk(" per task-struct memory footprint: %lu bytes\n",
+ sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (lockdep_init_error) {
+ printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
+ printk("Call stack leading to lockdep invocation was:\n");
+ print_stack_trace(&lockdep_init_trace, 0);
+ }
+#endif
+}
+
+static void
+print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
+ const void *mem_to, struct held_lock *hlock)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n=========================\n");
+ printk( "[ BUG: held lock freed! ]\n");
+ printk( "-------------------------\n");
+ printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
+ print_lock(hlock);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+static inline int not_in_range(const void* mem_from, unsigned long mem_len,
+ const void* lock_from, unsigned long lock_len)
+{
+ return lock_from + lock_len <= mem_from ||
+ mem_from + mem_len <= lock_from;
+}
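
not_in_range() is the usual disjointness test for two half-open ranges: they do not overlap exactly when one ends at or before the other begins. A tiny standalone check of the same predicate on plain addresses (a sketch; function and value names are illustrative):

#include <stdio.h>

/* Same half-open-interval disjointness test as not_in_range() above,
 * written over plain addresses. */
static int ranges_disjoint(unsigned long mem_from, unsigned long mem_len,
			   unsigned long lock_from, unsigned long lock_len)
{
	return lock_from + lock_len <= mem_from ||
	       mem_from + mem_len <= lock_from;
}

int main(void)
{
	/* A lock at 0x1050..0x108f inside freed memory 0x1000..0x10ff: overlap. */
	printf("%d\n", ranges_disjoint(0x1000, 0x100, 0x1050, 0x40));	/* 0 */
	/* A lock entirely outside the freed range: disjoint. */
	printf("%d\n", ranges_disjoint(0x1000, 0x100, 0x2000, 0x40));	/* 1 */
	return 0;
}
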
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <from> to <to>:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned long flags;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ local_irq_save(flags);
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ if (not_in_range(mem_from, mem_len, hlock->instance,
+ sizeof(*hlock->instance)))
+ continue;
+
+ print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
+ break;
+ }
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
+
+static void print_held_locks_bug(struct task_struct *curr)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n=====================================\n");
+ printk( "[ BUG: lock held at task exit time! ]\n");
+ printk( "-------------------------------------\n");
+ printk("%s/%d is exiting with locks still held!\n",
+ curr->comm, task_pid_nr(curr));
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+void debug_check_no_locks_held(struct task_struct *task)
+{
+ if (unlikely(task->lockdep_depth > 0))
+ print_held_locks_bug(task);
+}
+
+void debug_show_all_locks(void)
+{
+ struct task_struct *g, *p;
+ int count = 10;
+ int unlock = 1;
+
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
+ printk("\nShowing all locks held in the system:\n");
+
+ /*
+	 * Here we try to get the tasklist_lock as hard as possible;
+	 * if we are not successful after 2 seconds we ignore it (but keep
+ * trying). This is to enable a debug printout even if a
+ * tasklist_lock-holding task deadlocks or crashes.
+ */
+retry:
+ if (!read_trylock(&tasklist_lock)) {
+ if (count == 10)
+ printk("hm, tasklist_lock locked, retrying... ");
+ if (count) {
+ count--;
+ printk(" #%d", 10-count);
+ mdelay(200);
+ goto retry;
+ }
+ printk(" ignoring it.\n");
+ unlock = 0;
+ } else {
+ if (count != 10)
+ printk(KERN_CONT " locked it.\n");
+ }
+
+ do_each_thread(g, p) {
+ /*
+ * It's not reliable to print a task's held locks
+ * if it's not sleeping (or if it's not the current
+ * task):
+ */
+ if (p->state == TASK_RUNNING && p != current)
+ continue;
+ if (p->lockdep_depth)
+ lockdep_print_held_locks(p);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+ } while_each_thread(g, p);
+
+ printk("\n");
+ printk("=============================================\n\n");
+
+ if (unlock)
+ read_unlock(&tasklist_lock);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
+{
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
+ lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
+ __debug_show_held_locks(task);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+
+void lockdep_sys_exit(void)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(curr->lockdep_depth)) {
+ if (!debug_locks_off())
+ return;
+ printk("\n================================================\n");
+ printk( "[ BUG: lock held when returning to user space! ]\n");
+ printk( "------------------------------------------------\n");
+ printk("%s/%d is leaving the kernel with locks still held!\n",
+ curr->comm, curr->pid);
+ lockdep_print_held_locks(curr);
+ }
+}
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 0000000..56b1969
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,97 @@
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */
+
+/*
+ * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
+ * we track.
+ *
+ * We use the per-lock dependency maps in two ways: we grow them by adding
+ * every to-be-taken lock to each currently held lock's own dependency
+ * table (if it's not there yet), and we check them for lock-order
+ * conflicts and deadlocks.
+ */
+#define MAX_LOCKDEP_ENTRIES 8192UL
+
+#define MAX_LOCKDEP_CHAINS_BITS 14
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+
+extern struct list_head all_lock_classes;
+extern struct lock_chain lock_chains[];
+
+extern void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+
+extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
+
+extern unsigned long nr_lock_classes;
+extern unsigned long nr_list_entries;
+extern unsigned long nr_lock_chains;
+extern int nr_chain_hlocks;
+extern unsigned long nr_stack_trace_entries;
+
+extern unsigned int nr_hardirq_chains;
+extern unsigned int nr_softirq_chains;
+extern unsigned int nr_process_chains;
+extern unsigned int max_lockdep_depth;
+extern unsigned int max_recursion_depth;
+
+#ifdef CONFIG_PROVE_LOCKING
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+ return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Various lockdep statistics:
+ */
+extern atomic_t chain_lookup_hits;
+extern atomic_t chain_lookup_misses;
+extern atomic_t hardirqs_on_events;
+extern atomic_t hardirqs_off_events;
+extern atomic_t redundant_hardirqs_on;
+extern atomic_t redundant_hardirqs_off;
+extern atomic_t softirqs_on_events;
+extern atomic_t softirqs_off_events;
+extern atomic_t redundant_softirqs_on;
+extern atomic_t redundant_softirqs_off;
+extern atomic_t nr_unused_locks;
+extern atomic_t nr_cyclic_checks;
+extern atomic_t nr_cyclic_check_recursions;
+extern atomic_t nr_find_usage_forwards_checks;
+extern atomic_t nr_find_usage_forwards_recursions;
+extern atomic_t nr_find_usage_backwards_checks;
+extern atomic_t nr_find_usage_backwards_recursions;
+# define debug_atomic_inc(ptr) atomic_inc(ptr)
+# define debug_atomic_dec(ptr) atomic_dec(ptr)
+# define debug_atomic_read(ptr) atomic_read(ptr)
+#else
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
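
The debug_atomic_*() wrappers above turn into real atomic operations only under CONFIG_DEBUG_LOCKDEP and compile away to nothing otherwise, so the statistics calls sprinkled through lockdep.c cost nothing in production builds. A standalone sketch of the same conditional-macro pattern (the config symbol and counter name here are made up, not kernel symbols):

#include <stdio.h>

/* Illustrative only: statistics counters that compile away when the
 * debug option is off, mirroring the debug_atomic_*() macros above.
 * MY_DEBUG_STATS is a made-up switch, not a kernel config symbol. */
#define MY_DEBUG_STATS 1

#if MY_DEBUG_STATS
static long nr_events;
# define stat_inc(var)		((var)++)
# define stat_read(var)		(var)
#else
# define stat_inc(var)		do { } while (0)
# define stat_read(var)		0
#endif

int main(void)
{
	stat_inc(nr_events);
	stat_inc(nr_events);
	printf("events: %ld\n", (long)stat_read(nr_events));
	return 0;
}
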
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
new file mode 100644
index 0000000..20dbcbf
--- /dev/null
+++ b/kernel/lockdep_proc.c
@@ -0,0 +1,746 @@
+/*
+ * kernel/lockdep_proc.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Code for /proc/lockdep and /proc/lockdep_stats:
+ *
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+
+#include "lockdep_internals.h"
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct lock_class *class;
+
+ (*pos)++;
+
+ if (v == SEQ_START_TOKEN)
+ class = m->private;
+ else {
+ class = v;
+
+ if (class->lock_entry.next != &all_lock_classes)
+ class = list_entry(class->lock_entry.next,
+ struct lock_class, lock_entry);
+ else
+ class = NULL;
+ }
+
+ return class;
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ struct lock_class *class;
+ loff_t i = 0;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ if (++i == *pos)
+ return class;
+ }
+ return NULL;
+}
+
+static void l_stop(struct seq_file *m, void *v)
+{
+}
+
+static void print_name(struct seq_file *m, struct lock_class *class)
+{
+ char str[128];
+ const char *name = class->name;
+
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ seq_printf(m, "%s", name);
+	} else {
+ seq_printf(m, "%s", name);
+ if (class->name_version > 1)
+ seq_printf(m, "#%d", class->name_version);
+ if (class->subclass)
+ seq_printf(m, "/%d", class->subclass);
+ }
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class = v;
+ struct lock_list *entry;
+ char c1, c2, c3, c4;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, "all lock classes:\n");
+ return 0;
+ }
+
+ seq_printf(m, "%p", class->key);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ seq_printf(m, " OPS:%8ld", class->ops);
+#endif
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+#endif
+
+ get_usage_chars(class, &c1, &c2, &c3, &c4);
+ seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
+
+ seq_printf(m, ": ");
+ print_name(m, class);
+ seq_puts(m, "\n");
+
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations lockdep_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int lockdep_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &lockdep_ops);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+
+ if (!list_empty(&all_lock_classes))
+ m->private = list_entry(all_lock_classes.next,
+ struct lock_class, lock_entry);
+ else
+ m->private = NULL;
+ }
+ return res;
+}
+
+static const struct file_operations proc_lockdep_operations = {
+ .open = lockdep_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
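
/proc/lockdep is built on the seq_file iterator protocol: l_start() positions the cursor (returning SEQ_START_TOKEN for the header record), l_next() advances it, l_show() renders one record, and l_stop() releases any state. A userspace analog of that protocol over a fixed array, only to illustrate the calling convention (none of the names below are kernel API):

#include <stdio.h>

#define SEQ_START_TOKEN ((void *)1)

static const char *classes[] = { "mutex_a", "spinlock_b", "rwlock_c" };
static const int nr_classes = 3;

/* start(): return the record at *pos; *pos == 0 yields the header token. */
static void *demo_start(long *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;
	return (*pos <= nr_classes) ? (void *)&classes[*pos - 1] : NULL;
}

/* next(): advance the position and return the new record (or NULL). */
static void *demo_next(void *v, long *pos)
{
	(void)v;
	(*pos)++;
	return (*pos <= nr_classes) ? (void *)&classes[*pos - 1] : NULL;
}

/* show(): render one record; the start token produces the header line. */
static void demo_show(void *v)
{
	if (v == SEQ_START_TOKEN)
		printf("all lock classes:\n");
	else
		printf("  %s\n", *(const char **)v);
}

int main(void)
{
	long pos = 0;
	void *v = demo_start(&pos);

	while (v) {
		demo_show(v);
		v = demo_next(v, &pos);
	}
	return 0;
}
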
+
+#ifdef CONFIG_PROVE_LOCKING
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct lock_chain *chain;
+
+ (*pos)++;
+
+ if (v == SEQ_START_TOKEN)
+ chain = m->private;
+ else {
+ chain = v;
+
+ if (*pos < nr_lock_chains)
+ chain = lock_chains + *pos;
+ else
+ chain = NULL;
+ }
+
+ return chain;
+}
+
+static void *lc_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ if (*pos < nr_lock_chains)
+ return lock_chains + *pos;
+
+ return NULL;
+}
+
+static void lc_stop(struct seq_file *m, void *v)
+{
+}
+
+static int lc_show(struct seq_file *m, void *v)
+{
+ struct lock_chain *chain = v;
+ struct lock_class *class;
+ int i;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, "all lock chains:\n");
+ return 0;
+ }
+
+ seq_printf(m, "irq_context: %d\n", chain->irq_context);
+
+ for (i = 0; i < chain->depth; i++) {
+ class = lock_chain_get_class(chain, i);
+ if (!class->key)
+ continue;
+
+ seq_printf(m, "[%p] ", class->key);
+ print_name(m, class);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations lockdep_chains_ops = {
+ .start = lc_start,
+ .next = lc_next,
+ .stop = lc_stop,
+ .show = lc_show,
+};
+
+static int lockdep_chains_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &lockdep_chains_ops);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+
+ if (nr_lock_chains)
+ m->private = lock_chains;
+ else
+ m->private = NULL;
+ }
+ return res;
+}
+
+static const struct file_operations proc_lockdep_chains_operations = {
+ .open = lockdep_chains_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROVE_LOCKING */
+
+static void lockdep_stats_debug_show(struct seq_file *m)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
+ hi2 = debug_atomic_read(&hardirqs_off_events),
+ hr1 = debug_atomic_read(&redundant_hardirqs_on),
+ hr2 = debug_atomic_read(&redundant_hardirqs_off),
+ si1 = debug_atomic_read(&softirqs_on_events),
+ si2 = debug_atomic_read(&softirqs_off_events),
+ sr1 = debug_atomic_read(&redundant_softirqs_on),
+ sr2 = debug_atomic_read(&redundant_softirqs_off);
+
+ seq_printf(m, " chain lookup misses: %11u\n",
+ debug_atomic_read(&chain_lookup_misses));
+ seq_printf(m, " chain lookup hits: %11u\n",
+ debug_atomic_read(&chain_lookup_hits));
+ seq_printf(m, " cyclic checks: %11u\n",
+ debug_atomic_read(&nr_cyclic_checks));
+ seq_printf(m, " cyclic-check recursions: %11u\n",
+ debug_atomic_read(&nr_cyclic_check_recursions));
+ seq_printf(m, " find-mask forwards checks: %11u\n",
+ debug_atomic_read(&nr_find_usage_forwards_checks));
+ seq_printf(m, " find-mask forwards recursions: %11u\n",
+ debug_atomic_read(&nr_find_usage_forwards_recursions));
+ seq_printf(m, " find-mask backwards checks: %11u\n",
+ debug_atomic_read(&nr_find_usage_backwards_checks));
+ seq_printf(m, " find-mask backwards recursions:%11u\n",
+ debug_atomic_read(&nr_find_usage_backwards_recursions));
+
+ seq_printf(m, " hardirq on events: %11u\n", hi1);
+ seq_printf(m, " hardirq off events: %11u\n", hi2);
+ seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
+ seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
+ seq_printf(m, " softirq on events: %11u\n", si1);
+ seq_printf(m, " softirq off events: %11u\n", si2);
+ seq_printf(m, " redundant softirq ons: %11u\n", sr1);
+ seq_printf(m, " redundant softirq offs: %11u\n", sr2);
+#endif
+}
+
+static int lockdep_stats_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class;
+ unsigned long nr_unused = 0, nr_uncategorized = 0,
+ nr_irq_safe = 0, nr_irq_unsafe = 0,
+ nr_softirq_safe = 0, nr_softirq_unsafe = 0,
+ nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
+ nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
+ nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
+ nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
+ sum_forward_deps = 0, factor = 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+
+ if (class->usage_mask == 0)
+ nr_unused++;
+ if (class->usage_mask == LOCKF_USED)
+ nr_uncategorized++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ)
+ nr_irq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQS)
+ nr_irq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ nr_softirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ nr_softirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ nr_hardirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ nr_hardirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
+ nr_irq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
+ nr_irq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
+ nr_softirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ nr_softirq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
+ nr_hardirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ nr_hardirq_read_unsafe++;
+
+#ifdef CONFIG_PROVE_LOCKING
+ sum_forward_deps += lockdep_count_forward_deps(class);
+#endif
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
+ DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
+#endif
+ seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
+ nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
+ nr_list_entries, MAX_LOCKDEP_ENTRIES);
+ seq_printf(m, " indirect dependencies: %11lu\n",
+ sum_forward_deps);
+
+ /*
+ * Total number of dependencies:
+ *
+ * All irq-safe locks may nest inside irq-unsafe locks,
+ * plus all the other known dependencies:
+ */
+ seq_printf(m, " all direct dependencies: %11lu\n",
+ nr_irq_unsafe * nr_irq_safe +
+ nr_hardirq_unsafe * nr_hardirq_safe +
+ nr_list_entries);
+
+ /*
+ * Estimated factor between direct and indirect
+ * dependencies:
+ */
+ if (nr_list_entries)
+ factor = sum_forward_deps / nr_list_entries;
+
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
+ nr_lock_chains, MAX_LOCKDEP_CHAINS);
+ seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
+ nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ seq_printf(m, " in-hardirq chains: %11u\n",
+ nr_hardirq_chains);
+ seq_printf(m, " in-softirq chains: %11u\n",
+ nr_softirq_chains);
+#endif
+ seq_printf(m, " in-process chains: %11u\n",
+ nr_process_chains);
+ seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
+ nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+ seq_printf(m, " combined max dependencies: %11u\n",
+ (nr_hardirq_chains + 1) *
+ (nr_softirq_chains + 1) *
+ (nr_process_chains + 1)
+ );
+ seq_printf(m, " hardirq-safe locks: %11lu\n",
+ nr_hardirq_safe);
+ seq_printf(m, " hardirq-unsafe locks: %11lu\n",
+ nr_hardirq_unsafe);
+ seq_printf(m, " softirq-safe locks: %11lu\n",
+ nr_softirq_safe);
+ seq_printf(m, " softirq-unsafe locks: %11lu\n",
+ nr_softirq_unsafe);
+ seq_printf(m, " irq-safe locks: %11lu\n",
+ nr_irq_safe);
+ seq_printf(m, " irq-unsafe locks: %11lu\n",
+ nr_irq_unsafe);
+
+ seq_printf(m, " hardirq-read-safe locks: %11lu\n",
+ nr_hardirq_read_safe);
+ seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
+ nr_hardirq_read_unsafe);
+ seq_printf(m, " softirq-read-safe locks: %11lu\n",
+ nr_softirq_read_safe);
+ seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
+ nr_softirq_read_unsafe);
+ seq_printf(m, " irq-read-safe locks: %11lu\n",
+ nr_irq_read_safe);
+ seq_printf(m, " irq-read-unsafe locks: %11lu\n",
+ nr_irq_read_unsafe);
+
+ seq_printf(m, " uncategorized locks: %11lu\n",
+ nr_uncategorized);
+ seq_printf(m, " unused locks: %11lu\n",
+ nr_unused);
+ seq_printf(m, " max locking depth: %11u\n",
+ max_lockdep_depth);
+ seq_printf(m, " max recursion depth: %11u\n",
+ max_recursion_depth);
+ lockdep_stats_debug_show(m);
+ seq_printf(m, " debug_locks: %11u\n",
+ debug_locks);
+
+ return 0;
+}
+
+static int lockdep_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lockdep_stats_show, NULL);
+}
+
+static const struct file_operations proc_lockdep_stats_operations = {
+ .open = lockdep_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_LOCK_STAT
+
+struct lock_stat_data {
+ struct lock_class *class;
+ struct lock_class_stats stats;
+};
+
+struct lock_stat_seq {
+ struct lock_stat_data *iter;
+ struct lock_stat_data *iter_end;
+ struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
+};
+
+/*
+ * sort on absolute number of contentions
+ */
+static int lock_stat_cmp(const void *l, const void *r)
+{
+ const struct lock_stat_data *dl = l, *dr = r;
+ unsigned long nl, nr;
+
+ nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
+ nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
+
+ return nr - nl;
+}
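
lock_stat_open() below sorts the per-class statistics with this comparator, ordering classes by their total number of contentions (read plus write), largest first. The same idea with the C library's qsort() and a made-up statistics struct (a sketch, not the kernel data structures):

#include <stdio.h>
#include <stdlib.h>

/* Made-up per-class statistics; only the comparator idea matches above. */
struct stat_entry {
	const char *name;
	unsigned long read_contentions;
	unsigned long write_contentions;
};

/* Sort descending on total contentions, like lock_stat_cmp() above. */
static int stat_cmp(const void *l, const void *r)
{
	const struct stat_entry *dl = l, *dr = r;
	unsigned long nl = dl->read_contentions + dl->write_contentions;
	unsigned long nr = dr->read_contentions + dr->write_contentions;

	if (nr > nl)
		return 1;
	if (nr < nl)
		return -1;
	return 0;
}

int main(void)
{
	struct stat_entry stats[] = {
		{ "dcache_lock",	10,   3 },
		{ "runqueue lock",	200,  0 },
		{ "inode->i_lock",	42,  17 },
	};
	int i, n = sizeof(stats) / sizeof(stats[0]);

	qsort(stats, n, sizeof(stats[0]), stat_cmp);
	for (i = 0; i < n; i++)
		printf("%-16s %lu\n", stats[i].name,
		       stats[i].read_contentions + stats[i].write_contentions);
	return 0;
}

Comparing the two totals rather than returning their difference keeps the result within the comparator's int range; the sketch deliberately differs from the kernel code in that one respect.
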
+
+static void seq_line(struct seq_file *m, char c, int offset, int length)
+{
+ int i;
+
+ for (i = 0; i < offset; i++)
+ seq_puts(m, " ");
+ for (i = 0; i < length; i++)
+ seq_printf(m, "%c", c);
+ seq_puts(m, "\n");
+}
+
+static void snprint_time(char *buf, size_t bufsiz, s64 nr)
+{
+ unsigned long rem;
+
+ nr += 5; /* for display rounding */
+ rem = do_div(nr, 1000); /* XXX: do_div_signed */
+ snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
+}
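
snprint_time() converts a nanosecond count into a "microseconds.hundredths" string: it adds 5 ns so the hundredths digit rounds rather than truncates, divides by 1000, and prints the first two digits of the remainder. The same arithmetic as a standalone function (assuming a non-negative input, which the do_div_signed note above hints is not always guaranteed in the kernel):

#include <stdint.h>
#include <stdio.h>

/* Same rounding and formatting as snprint_time() above: nanoseconds in,
 * "microseconds.hundredths" out. Assumes a non-negative input. */
static void format_time(char *buf, size_t bufsiz, int64_t ns)
{
	int64_t rem;

	ns += 5;			/* round the hundredths digit */
	rem = ns % 1000;
	ns /= 1000;
	snprintf(buf, bufsiz, "%lld.%02d", (long long)ns, (int)(rem / 10));
}

int main(void)
{
	char buf[32];

	format_time(buf, sizeof(buf), 123456);	/* 123456 ns */
	printf("%s\n", buf);			/* prints "123.46" */
	return 0;
}
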
+
+static void seq_time(struct seq_file *m, s64 time)
+{
+ char num[15];
+
+ snprint_time(num, sizeof(num), time);
+ seq_printf(m, " %14s", num);
+}
+
+static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
+{
+ seq_printf(m, "%14lu", lt->nr);
+ seq_time(m, lt->min);
+ seq_time(m, lt->max);
+ seq_time(m, lt->total);
+}
+
+static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
+{
+ char name[39];
+ struct lock_class *class;
+ struct lock_class_stats *stats;
+ int i, namelen;
+
+ class = data->class;
+ stats = &data->stats;
+
+ namelen = 38;
+ if (class->name_version > 1)
+ namelen -= 2; /* XXX truncates versions > 9 */
+ if (class->subclass)
+ namelen -= 2;
+
+ if (!class->name) {
+ char str[KSYM_NAME_LEN];
+ const char *key_name;
+
+ key_name = __get_key_name(class->key, str);
+ snprintf(name, namelen, "%s", key_name);
+ } else {
+ snprintf(name, namelen, "%s", class->name);
+ }
+ namelen = strlen(name);
+ if (class->name_version > 1) {
+ snprintf(name+namelen, 3, "#%d", class->name_version);
+ namelen += 2;
+ }
+ if (class->subclass) {
+ snprintf(name+namelen, 3, "/%d", class->subclass);
+ namelen += 2;
+ }
+
+ if (stats->write_holdtime.nr) {
+ if (stats->read_holdtime.nr)
+ seq_printf(m, "%38s-W:", name);
+ else
+ seq_printf(m, "%40s:", name);
+
+ seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
+ seq_lock_time(m, &stats->write_waittime);
+ seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
+ seq_lock_time(m, &stats->write_holdtime);
+ seq_puts(m, "\n");
+ }
+
+ if (stats->read_holdtime.nr) {
+ seq_printf(m, "%38s-R:", name);
+ seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
+ seq_lock_time(m, &stats->read_waittime);
+ seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
+ seq_lock_time(m, &stats->read_holdtime);
+ seq_puts(m, "\n");
+ }
+
+ if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
+ return;
+
+ if (stats->read_holdtime.nr)
+ namelen += 2;
+
+ for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
+ char sym[KSYM_SYMBOL_LEN];
+ char ip[32];
+
+ if (class->contention_point[i] == 0)
+ break;
+
+ if (!i)
+ seq_line(m, '-', 40-namelen, namelen);
+
+ sprint_symbol(sym, class->contention_point[i]);
+ snprintf(ip, sizeof(ip), "[<%p>]",
+ (void *)class->contention_point[i]);
+ seq_printf(m, "%40s %14lu %29s %s\n", name,
+ stats->contention_point[i],
+ ip, sym);
+ }
+ if (i) {
+ seq_puts(m, "\n");
+ seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
+ seq_puts(m, "\n");
+ }
+}
+
+static void seq_header(struct seq_file *m)
+{
+ seq_printf(m, "lock_stat version 0.2\n");
+ seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
+ seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
+ "%14s %14s\n",
+ "class name",
+ "con-bounces",
+ "contentions",
+ "waittime-min",
+ "waittime-max",
+ "waittime-total",
+ "acq-bounces",
+ "acquisitions",
+ "holdtime-min",
+ "holdtime-max",
+ "holdtime-total");
+ seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
+ seq_printf(m, "\n");
+}
+
+static void *ls_start(struct seq_file *m, loff_t *pos)
+{
+ struct lock_stat_seq *data = m->private;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ data->iter = data->stats + *pos;
+ if (data->iter >= data->iter_end)
+ data->iter = NULL;
+
+ return data->iter;
+}
+
+static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct lock_stat_seq *data = m->private;
+
+ (*pos)++;
+
+ if (v == SEQ_START_TOKEN)
+ data->iter = data->stats;
+ else {
+ data->iter = v;
+ data->iter++;
+ }
+
+ if (data->iter == data->iter_end)
+ data->iter = NULL;
+
+ return data->iter;
+}
+
+static void ls_stop(struct seq_file *m, void *v)
+{
+}
+
+static int ls_show(struct seq_file *m, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_header(m);
+ else
+ seq_stats(m, v);
+
+ return 0;
+}
+
+static struct seq_operations lockstat_ops = {
+ .start = ls_start,
+ .next = ls_next,
+ .stop = ls_stop,
+ .show = ls_show,
+};
+
+static int lock_stat_open(struct inode *inode, struct file *file)
+{
+ int res;
+ struct lock_class *class;
+ struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
+
+ if (!data)
+ return -ENOMEM;
+
+ res = seq_open(file, &lockstat_ops);
+ if (!res) {
+ struct lock_stat_data *iter = data->stats;
+ struct seq_file *m = file->private_data;
+
+ data->iter = iter;
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iter->class = class;
+ iter->stats = lock_stats(class);
+ iter++;
+ }
+ data->iter_end = iter;
+
+ sort(data->stats, data->iter_end - data->iter,
+ sizeof(struct lock_stat_data),
+ lock_stat_cmp, NULL);
+
+ m->private = data;
+ } else
+ vfree(data);
+
+ return res;
+}
+
+static ssize_t lock_stat_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct lock_class *class;
+ char c;
+
+ if (count) {
+ if (get_user(c, buf))
+ return -EFAULT;
+
+ if (c != '0')
+ return count;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry)
+ clear_lock_stats(class);
+ }
+ return count;
+}
+
+static int lock_stat_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+
+ vfree(seq->private);
+ seq->private = NULL;
+ return seq_release(inode, file);
+}
+
+static const struct file_operations proc_lock_stat_operations = {
+ .open = lock_stat_open,
+ .write = lock_stat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = lock_stat_release,
+};
+#endif /* CONFIG_LOCK_STAT */
+
+static int __init lockdep_proc_init(void)
+{
+ proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+#ifdef CONFIG_PROVE_LOCKING
+ proc_create("lockdep_chains", S_IRUSR, NULL,
+ &proc_lockdep_chains_operations);
+#endif
+ proc_create("lockdep_stats", S_IRUSR, NULL,
+ &proc_lockdep_stats_operations);
+
+#ifdef CONFIG_LOCK_STAT
+ proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
+#endif
+
+ return 0;
+}
+
+__initcall(lockdep_proc_init);
+
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 0000000..e9c6b2b
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,868 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/marker.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+extern struct marker __start___markers[];
+extern struct marker __stop___markers[];
+
+/* Set to 1 to enable marker debug output */
+static const int marker_debug;
+
+/*
+ * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
+ * and module markers and the hash table.
+ */
+static DEFINE_MUTEX(markers_mutex);
+
+/*
+ * Marker hash table, containing the active markers.
+ * Protected by module_mutex.
+ */
+#define MARKER_HASH_BITS 6
+#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operations (add or remove) on a given marker. It is
+ * also used to delay the freeing of a multiple-probes array until a quiescent
+ * state is reached.
+ * Marker entry modifications are protected by markers_mutex.
+ */
+struct marker_entry {
+ struct hlist_node hlist;
+ char *format;
+ /* Probe wrapper */
+ void (*call)(const struct marker *mdata, void *call_private, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_closure *multi;
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ struct rcu_head rcu;
+ void *oldptr;
+ int rcu_pending;
+ unsigned char ptype:1;
+ char name[0]; /* Contains name'\0'format'\0' */
+};
+
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
+
+/**
+ * __mark_empty_function - Empty probe callback
+ * @probe_private: probe private data
+ * @call_private: call site private data
+ * @fmt: format string
+ * @...: variable argument list
+ *
+ * Empty callback provided as a probe to the markers. By providing this to a
+ * disabled marker, we make sure the execution flow is always valid even
+ * though the function pointer change and the marker enabling are two distinct
+ * operations that modify the execution flow of preemptible code.
+ */
+void __mark_empty_function(void *probe_private, void *call_private,
+ const char *fmt, va_list *args)
+{
+}
+EXPORT_SYMBOL_GPL(__mark_empty_function);
+
+/*
+ * marker_probe_cb - Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
+{
+ va_list args;
+ char ptype;
+
+ /*
+ * rcu_read_lock_sched does two things: it disables preemption to make
+ * sure the teardown of the callbacks can be done correctly when they
+ * are in modules, and it ensures RCU read coherency.
+ */
+ rcu_read_lock_sched();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ va_start(args, call_private);
+ func(mdata->single.probe_private, call_private, mdata->format,
+ &args);
+ va_end(args);
+ } else {
+ struct marker_probe_closure *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi[i].func; i++) {
+ va_start(args, call_private);
+ multi[i].func(multi[i].probe_private, call_private,
+ mdata->format, &args);
+ va_end(args);
+ }
+ }
+ rcu_read_unlock_sched();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
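+
+/*
+ * Illustrative instrumentation site (hypothetical subsystem code) that is
+ * routed through marker_probe_cb() once a probe is registered and the
+ * marker is armed:
+ *
+ *   trace_mark(subsystem_event, "integer %d string %s", someint, somestring);
+ *
+ * The format string recorded at the site is checked against the one supplied
+ * at probe registration time (see set_marker() below).
+ */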
+
+/*
+ * marker_probe_cb_noarg - Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+{
+ va_list args; /* not initialized */
+ char ptype;
+
+ rcu_read_lock_sched();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func(mdata->single.probe_private, call_private, mdata->format,
+ &args);
+ } else {
+ struct marker_probe_closure *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi[i].func; i++)
+ multi[i].func(multi[i].probe_private, call_private,
+ mdata->format, &args);
+ }
+ rcu_read_unlock_sched();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct marker_entry *entry = container_of(head,
+ struct marker_entry, rcu);
+ kfree(entry->oldptr);
+ /* Make sure we free the data before setting the pending flag to 0 */
+ smp_wmb();
+ entry->rcu_pending = 0;
+}
+
+static void debug_print_probes(struct marker_entry *entry)
+{
+ int i;
+
+ if (!marker_debug)
+ return;
+
+ if (!entry->ptype) {
+ printk(KERN_DEBUG "Single probe : %p %p\n",
+ entry->single.func,
+ entry->single.probe_private);
+ } else {
+ for (i = 0; entry->multi[i].func; i++)
+ printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+ entry->multi[i].func,
+ entry->multi[i].probe_private);
+ }
+}
+
+static struct marker_probe_closure *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0;
+ struct marker_probe_closure *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ if (entry->single.func == probe &&
+ entry->single.probe_private == probe_private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.probe_private = probe_private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe
+ && old[nr_probes].probe_private
+ == probe_private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new[0] = entry->single;
+ else
+ memcpy(new, old,
+ nr_probes * sizeof(struct marker_probe_closure));
+ new[nr_probes].func = probe;
+ new[nr_probes].probe_private = probe_private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+static struct marker_probe_closure *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_closure *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.probe_private != probe_private);
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if ((!probe || old[nr_probes].func == probe)
+ && old[nr_probes].probe_private
+ == probe_private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].probe_private != probe_private)
+ entry->single = old[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].probe_private != probe_private)
+ new[j++] = old[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
+ * Get marker if the marker is present in the marker hash table.
+ * Must be called with markers_mutex held.
+ * Returns NULL if not present.
+ */
+static struct marker_entry *get_marker(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ u32 hash = jhash(name, strlen(name), 0);
+
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name))
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * Add the marker to the marker hash table. Must be called with markers_mutex
+ * held.
+ */
+static struct marker_entry *add_marker(const char *name, const char *format)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t name_len = strlen(name) + 1;
+ size_t format_len = 0;
+ u32 hash = jhash(name, name_len-1, 0);
+
+ if (format)
+ format_len = strlen(format) + 1;
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ printk(KERN_NOTICE
+ "Marker %s busy\n", name);
+ return ERR_PTR(-EBUSY); /* Already there */
+ }
+ }
+ /*
+ * Using kmalloc here to allocate a variable length element. Could
+ * cause some memory fragmentation if overused.
+ */
+ e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+ memcpy(&e->name[0], name, name_len);
+ if (format) {
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ trace_mark(core_marker_format, "name %s format %s",
+ e->name, e->format);
+ } else {
+ e->format = NULL;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.probe_private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
+ e->refcount = 0;
+ e->rcu_pending = 0;
+ hlist_add_head(&e->hlist, head);
+ return e;
+}
+
+/*
+ * Remove the marker from the marker hash table. Must be called with mutex_lock
+ * held.
+ */
+static int remove_marker(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ int found = 0;
+ size_t len = strlen(name) + 1;
+ u32 hash = jhash(name, len-1, 0);
+
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+ hlist_del(&e->hlist);
+ /* Make sure the call_rcu has been executed */
+ if (e->rcu_pending)
+ rcu_barrier_sched();
+ kfree(e);
+ return 0;
+}
+
+/*
+ * Set the marker_entry format to the format found in the element.
+ */
+static int marker_set_format(struct marker_entry **entry, const char *format)
+{
+ struct marker_entry *e;
+ size_t name_len = strlen((*entry)->name) + 1;
+ size_t format_len = strlen(format) + 1;
+
+
+ e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+ memcpy(&e->name[0], (*entry)->name, name_len);
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ e->single = (*entry)->single;
+ e->multi = (*entry)->multi;
+ e->ptype = (*entry)->ptype;
+ e->refcount = (*entry)->refcount;
+ e->rcu_pending = 0;
+ hlist_add_before(&e->hlist, &(*entry)->hlist);
+ hlist_del(&(*entry)->hlist);
+ /* Make sure the call_rcu has been executed */
+ if ((*entry)->rcu_pending)
+ rcu_barrier_sched();
+ kfree(*entry);
+ *entry = e;
+ trace_mark(core_marker_format, "name %s format %s",
+ e->name, e->format);
+ return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one marker.
+ */
+static int set_marker(struct marker_entry **entry, struct marker *elem,
+ int active)
+{
+ int ret;
+ WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+
+ if ((*entry)->format) {
+ if (strcmp((*entry)->format, elem->format) != 0) {
+ printk(KERN_NOTICE
+ "Format mismatch for probe %s "
+ "(%s), marker (%s)\n",
+ (*entry)->name,
+ (*entry)->format,
+ elem->format);
+ return -EPERM;
+ }
+ } else {
+ ret = marker_set_format(entry, elem->format);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = (*entry)->call;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.probe_private
+ != (*entry)->single.probe_private &&
+ !elem->ptype);
+ elem->single.probe_private = (*entry)->single.probe_private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = (*entry)->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, (*entry)->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = (*entry)->ptype;
+ elem->state = active;
+
+ return 0;
+}
+
+/*
+ * Disable a marker and its probe callback.
+ * Note: only waiting one RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is
+ * ensured by the rcu_read_lock_sched around the call site.
+ */
+static void disable_marker(struct marker *elem)
+{
+ /* leave "call" as is. It is known statically. */
+ elem->state = 0;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
+ /*
+ * Leave the private data and id there, because removal is racy and
+ * should be done only after an RCU period. These are never used until
+ * the next initialization anyway.
+ */
+}
+
+/**
+ * marker_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of markers.
+ */
+void marker_update_probe_range(struct marker *begin,
+ struct marker *end)
+{
+ struct marker *iter;
+ struct marker_entry *mark_entry;
+
+ mutex_lock(&markers_mutex);
+ for (iter = begin; iter < end; iter++) {
+ mark_entry = get_marker(iter->name);
+ if (mark_entry) {
+ set_marker(&mark_entry, iter,
+ !!mark_entry->refcount);
+ /*
+ * ignore error, continue
+ */
+ } else {
+ disable_marker(iter);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ *
+ * The internal callback is only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Side effect: marker_set_format may delete the marker entry (creating a
+ * replacement).
+ */
+static void marker_update_probes(void)
+{
+ /* Core kernel markers */
+ marker_update_probe_range(__start___markers, __stop___markers);
+ /* Markers in modules. */
+ module_update_markers();
+}
+
+/**
+ * marker_probe_register - Connect a probe to a marker
+ * @name: marker name
+ * @format: format string
+ * @probe: probe handler
+ * @probe_private: probe private data
+ *
+ * private data must be a valid allocated memory address, or NULL.
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int marker_probe_register(const char *name, const char *format,
+ marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0;
+ struct marker_probe_closure *old;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry) {
+ entry = add_marker(name, format);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ } else if (format) {
+ if (!entry->format)
+ ret = marker_set_format(&entry, format);
+ else if (strcmp(entry->format, format))
+ ret = -EPERM;
+ }
+ if (ret)
+ goto end;
+
+ /*
+ * If we detect that a call_rcu is pending for this marker,
+ * make sure it's executed now.
+ */
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ old = marker_entry_add_probe(entry, probe, probe_private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
+ goto end;
+ }
+ mutex_unlock(&markers_mutex);
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ WARN_ON(!entry);
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu_sched(&entry->rcu, free_old_closure);
+end:
+ mutex_unlock(&markers_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_register);
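+
+/*
+ * Minimal registration sketch (hypothetical probe; the signature matches
+ * marker_probe_func, as used by __mark_empty_function above):
+ *
+ *   static void my_probe(void *probe_private, void *call_private,
+ *                        const char *fmt, va_list *args)
+ *   {
+ *           int someint = va_arg(*args, int);
+ *           char *somestring = va_arg(*args, char *);
+ *
+ *           printk(KERN_DEBUG "subsystem_event: %d %s\n",
+ *                  someint, somestring);
+ *   }
+ *
+ *   err = marker_probe_register("subsystem_event", "integer %d string %s",
+ *                               my_probe, NULL);
+ */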
+
+/**
+ * marker_probe_unregister - Disconnect a probe from a marker
+ * @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
+ *
+ * Returns 0 on success, or an error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister(const char *name,
+ marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ struct marker_probe_closure *old;
+ int ret = -ENOENT;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry)
+ goto end;
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ old = marker_entry_remove_probe(entry, probe, probe_private);
+ mutex_unlock(&markers_mutex);
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry)
+ goto end;
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu_sched(&entry->rcu, free_old_closure);
+ remove_marker(name); /* Ignore busy error message */
+ ret = 0;
+end:
+ mutex_unlock(&markers_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister);
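+
+/*
+ * The matching teardown for the registration sketch above would be:
+ *
+ *   err = marker_probe_unregister("subsystem_event", my_probe, NULL);
+ */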
+
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.probe_private
+ == probe_private)
+ return entry;
+ } else {
+ struct marker_probe_closure *closure;
+ closure = entry->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func == probe &&
+ closure[i].probe_private
+ == probe_private)
+ return entry;
+ }
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
+ *
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Returns 0 on success, or an error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0;
+ struct marker_probe_closure *old;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker_from_private_data(probe, probe_private);
+ if (!entry) {
+ ret = -ENOENT;
+ goto end;
+ }
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ old = marker_entry_remove_probe(entry, NULL, probe_private);
+ mutex_unlock(&markers_mutex);
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker_from_private_data(probe, probe_private);
+ WARN_ON(!entry);
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu_sched(&entry->rcu, free_old_closure);
+ remove_marker(entry->name); /* Ignore busy error message */
+end:
+ mutex_unlock(&markers_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
+
+/**
+ * marker_get_private_data - Get a marker's probe private data
+ * @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
+ *
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR().
+ * The private data pointer should _only_ be dereferenced if the caller is the
+ * owner of the data, or its content could vanish. This is mostly used to
+ * confirm that a caller is the owner of a registered probe.
+ */
+void *marker_get_private_data(const char *name, marker_probe_func *probe,
+ int num)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t name_len = strlen(name) + 1;
+ u32 hash = jhash(name, name_len-1, 0);
+ int i;
+
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.probe_private;
+ else
+ break;
+ } else {
+ struct marker_probe_closure *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure[i].probe_private;
+ }
+ }
+ }
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(marker_get_private_data);
diff --git a/kernel/module.c b/kernel/module.c
new file mode 100644
index 0000000..79fd4d2
--- /dev/null
+++ b/kernel/module.c
@@ -0,0 +1,2821 @@
+/*
+ Copyright (C) 2002 Richard Henderson
+ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/module.h>
+#include <linux/moduleloader.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/stop_machine.h>
+#include <linux/device.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+#include <linux/unwind.h>
+#include <linux/rculist.h>
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <linux/license.h>
+#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , a...)
+#endif
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
+
+/* List of modules, protected by module_mutex or preempt_disable
+ * (delete uses stop_machine/add uses RCU list operations). */
+static DEFINE_MUTEX(module_mutex);
+static LIST_HEAD(modules);
+
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+
+/* Bounds of module allocation, for speeding __module_text_address */
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
+int register_module_notifier(struct notifier_block * nb)
+{
+ return blocking_notifier_chain_register(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block * nb)
+{
+ return blocking_notifier_chain_unregister(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/* We require a truly strong try_module_get(): 0 means success; any other
+ value means failure due to ongoing or failed initialization etc. */
+static inline int strong_try_module_get(struct module *mod)
+{
+ if (mod && mod->state == MODULE_STATE_COMING)
+ return -EBUSY;
+ if (try_module_get(mod))
+ return 0;
+ else
+ return -ENOENT;
+}
+
+static inline void add_taint_module(struct module *mod, unsigned flag)
+{
+ add_taint(flag);
+ mod->taints |= (1U << flag);
+}
+
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit. nfsd and lockd use this.
+ */
+void __module_put_and_exit(struct module *mod, long code)
+{
+ module_put(mod);
+ do_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < hdr->e_shnum; i++)
+ /* Alloc bit cleared means "ignore it." */
+ if ((sechdrs[i].sh_flags & SHF_ALLOC)
+ && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+ return i;
+ return 0;
+}
+
+/* Find a module section, or NULL. */
+static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
+ const char *secstrings, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = sechdrs[sec].sh_size / object_size;
+ return (void *)sechdrs[sec].sh_addr;
+}
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
+extern const unsigned long __start___kcrctab[];
+extern const unsigned long __start___kcrctab_gpl[];
+extern const unsigned long __start___kcrctab_gpl_future[];
+#ifdef CONFIG_UNUSED_SYMBOLS
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
+extern const unsigned long __start___kcrctab_unused[];
+extern const unsigned long __start___kcrctab_unused_gpl[];
+#endif
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+#endif
+
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const unsigned long *crcs;
+ enum {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+ WILL_BE_GPL_ONLY,
+ } licence;
+ bool unused;
+};
+
+static bool each_symbol_in_section(const struct symsearch *arr,
+ unsigned int arrsize,
+ struct module *owner,
+ bool (*fn)(const struct symsearch *syms,
+ struct module *owner,
+ unsigned int symnum, void *data),
+ void *data)
+{
+ unsigned int i, j;
+
+ for (j = 0; j < arrsize; j++) {
+ for (i = 0; i < arr[j].stop - arr[j].start; i++)
+ if (fn(&arr[j], owner, i, data))
+ return true;
+ }
+
+ return false;
+}
+
+/* Returns true as soon as fn returns true, otherwise false. */
+static bool each_symbol(bool (*fn)(const struct symsearch *arr,
+ struct module *owner,
+ unsigned int symnum, void *data),
+ void *data)
+{
+ struct module *mod;
+ const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY, false },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY, false },
+ { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
+ __start___kcrctab_gpl_future,
+ WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
+ { __start___ksymtab_unused, __stop___ksymtab_unused,
+ __start___kcrctab_unused,
+ NOT_GPL_ONLY, true },
+ { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
+ __start___kcrctab_unused_gpl,
+ GPL_ONLY, true },
+#endif
+ };
+
+ if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY, false },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY, false },
+ { mod->gpl_future_syms,
+ mod->gpl_future_syms + mod->num_gpl_future_syms,
+ mod->gpl_future_crcs,
+ WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
+ { mod->unused_syms,
+ mod->unused_syms + mod->num_unused_syms,
+ mod->unused_crcs,
+ NOT_GPL_ONLY, true },
+ { mod->unused_gpl_syms,
+ mod->unused_gpl_syms + mod->num_unused_gpl_syms,
+ mod->unused_gpl_crcs,
+ GPL_ONLY, true },
+#endif
+ };
+
+ if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
+ return true;
+ }
+ return false;
+}
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const unsigned long *crc;
+ unsigned long value;
+};
+
+static bool find_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ unsigned int symnum, void *data)
+{
+ struct find_symbol_arg *fsa = data;
+
+ if (strcmp(syms->start[symnum].name, fsa->name) != 0)
+ return false;
+
+ if (!fsa->gplok) {
+ if (syms->licence == GPL_ONLY)
+ return false;
+ if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ printk(KERN_WARNING "Symbol %s is being used "
+ "by a non-GPL module, which will not "
+ "be allowed in the future\n", fsa->name);
+ printk(KERN_WARNING "Please see the file "
+ "Documentation/feature-removal-schedule.txt "
+ "in the kernel source tree for more details.\n");
+ }
+ }
+
+#ifdef CONFIG_UNUSED_SYMBOLS
+ if (syms->unused && fsa->warn) {
+ printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+ "however this module is using it.\n", fsa->name);
+ printk(KERN_WARNING
+ "This symbol will go away in the future.\n");
+ printk(KERN_WARNING
+ "Please evalute if this is the right api to use and if "
+ "it really is, submit a report the linux kernel "
+ "mailinglist together with submitting your code for "
+ "inclusion.\n");
+ }
+#endif
+
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, symnum);
+ fsa->value = syms->start[symnum].value;
+ return true;
+}
+
+/* Find a symbol, return value, (optional) crc and (optional) module
+ * which owns it */
+static unsigned long find_symbol(const char *name,
+ struct module **owner,
+ const unsigned long **crc,
+ bool gplok,
+ bool warn)
+{
+ struct find_symbol_arg fsa;
+
+ fsa.name = name;
+ fsa.gplok = gplok;
+ fsa.warn = warn;
+
+ if (each_symbol(find_symbol_in_section, &fsa)) {
+ if (owner)
+ *owner = fsa.owner;
+ if (crc)
+ *crc = fsa.crc;
+ return fsa.value;
+ }
+
+ DEBUGP("Failed to find symbol %s\n", name);
+ return -ENOENT;
+}
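+
+/*
+ * Illustrative call (hypothetical): looking up an exported symbol such as
+ *
+ *   addr = find_symbol("printk", &owner, &crc, true, true);
+ *
+ * returns the symbol's address, with *owner set to NULL for symbols built
+ * into the kernel image or to the exporting module otherwise, and *crc set
+ * to its version checksum when CONFIG_MODVERSIONS provides one.
+ */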
+
+/* Search for module by name: must hold module_mutex. */
+static struct module *find_module(const char *name)
+{
+ struct module *mod;
+
+ list_for_each_entry(mod, &modules, list) {
+ if (strcmp(mod->name, name) == 0)
+ return mod;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_SMP
+/* Number of blocks used and allocated. */
+static unsigned int pcpu_num_used, pcpu_num_allocated;
+/* Size of each block. -ve means used. */
+static int *pcpu_size;
+
+static int split_block(unsigned int i, unsigned short size)
+{
+ /* Reallocation required? */
+ if (pcpu_num_used + 1 > pcpu_num_allocated) {
+ int *new;
+
+ new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
+ GFP_KERNEL);
+ if (!new)
+ return 0;
+
+ pcpu_num_allocated *= 2;
+ pcpu_size = new;
+ }
+
+ /* Insert a new subblock */
+ memmove(&pcpu_size[i+1], &pcpu_size[i],
+ sizeof(pcpu_size[0]) * (pcpu_num_used - i));
+ pcpu_num_used++;
+
+ pcpu_size[i+1] -= size;
+ pcpu_size[i] = size;
+ return 1;
+}
+
+static inline unsigned int block_size(int val)
+{
+ if (val < 0)
+ return -val;
+ return val;
+}
+
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+ const char *name)
+{
+ unsigned long extra;
+ unsigned int i;
+ void *ptr;
+
+ if (align > PAGE_SIZE) {
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+ name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ ptr = __per_cpu_start;
+ for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+ /* Extra for alignment requirement. */
+ extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+ BUG_ON(i == 0 && extra != 0);
+
+ if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
+ continue;
+
+ /* Transfer extra to previous block. */
+ if (pcpu_size[i-1] < 0)
+ pcpu_size[i-1] -= extra;
+ else
+ pcpu_size[i-1] += extra;
+ pcpu_size[i] -= extra;
+ ptr += extra;
+
+ /* Split block if warranted */
+ if (pcpu_size[i] - size > sizeof(unsigned long))
+ if (!split_block(i, size))
+ return NULL;
+
+ /* Mark allocated */
+ pcpu_size[i] = -pcpu_size[i];
+ return ptr;
+ }
+
+ printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
+ size);
+ return NULL;
+}
+
+static void percpu_modfree(void *freeme)
+{
+ unsigned int i;
+ void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+
+ /* First entry is core kernel percpu data. */
+ for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+ if (ptr == freeme) {
+ pcpu_size[i] = -pcpu_size[i];
+ goto free;
+ }
+ }
+ BUG();
+
+ free:
+ /* Merge with previous? */
+ if (pcpu_size[i-1] >= 0) {
+ pcpu_size[i-1] += pcpu_size[i];
+ pcpu_num_used--;
+ memmove(&pcpu_size[i], &pcpu_size[i+1],
+ (pcpu_num_used - i) * sizeof(pcpu_size[0]));
+ i--;
+ }
+ /* Merge with next? */
+ if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
+ pcpu_size[i] += pcpu_size[i+1];
+ pcpu_num_used--;
+ memmove(&pcpu_size[i+1], &pcpu_size[i+2],
+ (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
+ }
+}
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
+static int percpu_modinit(void)
+{
+ pcpu_num_used = 2;
+ pcpu_num_allocated = 2;
+ pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
+ GFP_KERNEL);
+ /* Static in-kernel percpu data (used). */
+ pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
+ /* Free room. */
+ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
+ if (pcpu_size[1] < 0) {
+ printk(KERN_ERR "No per-cpu room for modules.\n");
+ pcpu_num_used = 1;
+ }
+
+ return 0;
+}
+__initcall(percpu_modinit);
+#else /* ... !CONFIG_SMP */
+static inline void *percpu_modalloc(unsigned long size, unsigned long align,
+ const char *name)
+{
+ return NULL;
+}
+static inline void percpu_modfree(void *pcpuptr)
+{
+ BUG();
+}
+static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ return 0;
+}
+static inline void percpu_modcopy(void *pcpudst, const void *src,
+ unsigned long size)
+{
+ /* pcpusec should be 0, and size of that section should be 0. */
+ BUG_ON(size != 0);
+}
+#endif /* CONFIG_SMP */
+
+#define MODINFO_ATTR(field) \
+static void setup_modinfo_##field(struct module *mod, const char *s) \
+{ \
+ mod->field = kstrdup(s, GFP_KERNEL); \
+} \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
+ struct module *mod, char *buffer) \
+{ \
+ return sprintf(buffer, "%s\n", mod->field); \
+} \
+static int modinfo_##field##_exists(struct module *mod) \
+{ \
+ return mod->field != NULL; \
+} \
+static void free_modinfo_##field(struct module *mod) \
+{ \
+ kfree(mod->field); \
+ mod->field = NULL; \
+} \
+static struct module_attribute modinfo_##field = { \
+ .attr = { .name = __stringify(field), .mode = 0444 }, \
+ .show = show_modinfo_##field, \
+ .setup = setup_modinfo_##field, \
+ .test = modinfo_##field##_exists, \
+ .free = free_modinfo_##field, \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
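+/*
+ * For illustration, MODINFO_ATTR(version) above expands (roughly) into a
+ * setup_modinfo_version() / show_modinfo_version() / modinfo_version_exists()
+ * / free_modinfo_version() quartet plus a modinfo_version attribute, which
+ * module_add_modinfo_attrs() below exposes as /sys/module/<name>/version
+ * with mode 0444.
+ */
+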
+static char last_unloaded_module[MODULE_NAME_LEN+1];
+
+#ifdef CONFIG_MODULE_UNLOAD
+/* Init the unload section of the module. */
+static void module_unload_init(struct module *mod)
+{
+ unsigned int i;
+
+ INIT_LIST_HEAD(&mod->modules_which_use_me);
+ for (i = 0; i < NR_CPUS; i++)
+ local_set(&mod->ref[i].count, 0);
+ /* Hold reference count during initialization. */
+ local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ /* Backwards compatibility macros put refcount during init. */
+ mod->waiter = current;
+}
+
+/* modules using other modules */
+struct module_use
+{
+ struct list_head list;
+ struct module *module_which_uses;
+};
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ list_for_each_entry(use, &b->modules_which_use_me, list) {
+ if (use->module_which_uses == a) {
+ DEBUGP("%s uses %s!\n", a->name, b->name);
+ return 1;
+ }
+ }
+ DEBUGP("%s does not use %s!\n", a->name, b->name);
+ return 0;
+}
+
+/* Module a uses b */
+static int use_module(struct module *a, struct module *b)
+{
+ struct module_use *use;
+ int no_warn, err;
+
+ if (b == NULL || already_uses(a, b)) return 1;
+
+ /* If we're interrupted or time out, we fail. */
+ if (wait_event_interruptible_timeout(
+ module_wq, (err = strong_try_module_get(b)) != -EBUSY,
+ 30 * HZ) <= 0) {
+ printk("%s: gave up waiting for init of module %s.\n",
+ a->name, b->name);
+ return 0;
+ }
+
+ /* If strong_try_module_get() returned a different error, we fail. */
+ if (err)
+ return 0;
+
+ DEBUGP("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use) {
+ printk("%s: out of memory loading\n", a->name);
+ module_put(b);
+ return 0;
+ }
+
+ use->module_which_uses = a;
+ list_add(&use->list, &b->modules_which_use_me);
+ no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
+ return 1;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+ struct module *i;
+
+ list_for_each_entry(i, &modules, list) {
+ struct module_use *use;
+
+ list_for_each_entry(use, &i->modules_which_use_me, list) {
+ if (use->module_which_uses == mod) {
+ DEBUGP("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->list);
+ kfree(use);
+ sysfs_remove_link(i->holders_dir, mod->name);
+ /* There can be at most one match. */
+ break;
+ }
+ }
+ }
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force_unload(unsigned int flags)
+{
+ int ret = (flags & O_TRUNC);
+ if (ret)
+ add_taint(TAINT_FORCED_RMMOD);
+ return ret;
+}
+#else
+static inline int try_force_unload(unsigned int flags)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+struct stopref
+{
+ struct module *mod;
+ int flags;
+ int *forced;
+};
+
+/* Whole machine is stopped with interrupts off when this runs. */
+static int __try_stop_module(void *_sref)
+{
+ struct stopref *sref = _sref;
+
+ /* If it's not unused, quit unless we're forcing. */
+ if (module_refcount(sref->mod) != 0) {
+ if (!(*sref->forced = try_force_unload(sref->flags)))
+ return -EWOULDBLOCK;
+ }
+
+ /* Mark it as dying. */
+ sref->mod->state = MODULE_STATE_GOING;
+ return 0;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+ if (flags & O_NONBLOCK) {
+ struct stopref sref = { mod, flags, forced };
+
+ return stop_machine(__try_stop_module, &sref, NULL);
+ } else {
+ /* We don't need to stop the machine for this. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ return 0;
+ }
+}
+
+unsigned int module_refcount(struct module *mod)
+{
+ unsigned int i, total = 0;
+
+ for (i = 0; i < NR_CPUS; i++)
+ total += local_read(&mod->ref[i].count);
+ return total;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+static void wait_for_zero_refcount(struct module *mod)
+{
+ /* Since we might sleep for some time, release the mutex first */
+ mutex_unlock(&module_mutex);
+ for (;;) {
+ DEBUGP("Looking at refcount...\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (module_refcount(mod) == 0)
+ break;
+ schedule();
+ }
+ current->state = TASK_RUNNING;
+ mutex_lock(&module_mutex);
+}
+
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+ unsigned int, flags)
+{
+ struct module *mod;
+ char name[MODULE_NAME_LEN];
+ int ret, forced = 0;
+
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+
+ if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+ return -EFAULT;
+ name[MODULE_NAME_LEN-1] = '\0';
+
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ mod = find_module(name);
+ if (!mod) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!list_empty(&mod->modules_which_use_me)) {
+ /* Other modules depend on us: get rid of them first. */
+ ret = -EWOULDBLOCK;
+ goto out;
+ }
+
+ /* Doing init or already dying? */
+ if (mod->state != MODULE_STATE_LIVE) {
+ /* FIXME: if (force), slam module count and wake up
+ waiter --RR */
+ DEBUGP("%s already dying\n", mod->name);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* If it has an init func, it must have an exit func to unload */
+ if (mod->init && !mod->exit) {
+ forced = try_force_unload(flags);
+ if (!forced) {
+ /* This module can't be removed */
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /* Set this up before setting mod->state */
+ mod->waiter = current;
+
+ /* Stop the machine so refcounts can't move and disable module. */
+ ret = try_stop_module(mod, flags, &forced);
+ if (ret != 0)
+ goto out;
+
+ /* Never wait if forced. */
+ if (!forced && module_refcount(mod) != 0)
+ wait_for_zero_refcount(mod);
+
+ mutex_unlock(&module_mutex);
+ /* Final destruction, now that no one is using it. */
+ if (mod->exit != NULL)
+ mod->exit();
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ mutex_lock(&module_mutex);
+ /* Store the name of the last unloaded module for diagnostic purposes */
+ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
+ unregister_dynamic_debug_module(mod->name);
+ free_module(mod);
+
+ out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+
+static void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %u ", module_refcount(mod));
+
+ /* Always include a trailing , so userspace can differentiate
+ between this and the old multi-field proc format. */
+ list_for_each_entry(use, &mod->modules_which_use_me, list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->module_which_uses->name);
+ }
+
+ if (mod->init != NULL && mod->exit == NULL) {
+ printed_something = 1;
+ seq_printf(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_printf(m, "-");
+}
+
+void __symbol_put(const char *symbol)
+{
+ struct module *owner;
+
+ preempt_disable();
+ if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
+ BUG();
+ module_put(owner);
+ preempt_enable();
+}
+EXPORT_SYMBOL(__symbol_put);
+
+void symbol_put_addr(void *addr)
+{
+ struct module *modaddr;
+
+ if (core_kernel_text((unsigned long)addr))
+ return;
+
+ if (!(modaddr = module_text_address((unsigned long)addr)))
+ BUG();
+ module_put(modaddr);
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(struct module_attribute *mattr,
+ struct module *mod, char *buffer)
+{
+ return sprintf(buffer, "%u\n", module_refcount(mod));
+}
+
+static struct module_attribute refcnt = {
+ .attr = { .name = "refcnt", .mode = 0444 },
+ .show = show_refcnt,
+};
+
+void module_put(struct module *module)
+{
+ if (module) {
+ unsigned int cpu = get_cpu();
+ local_dec(&module->ref[cpu].count);
+ /* Maybe they're waiting for us to drop reference? */
+ if (unlikely(!module_is_live(module)))
+ wake_up_process(module->waiter);
+ put_cpu();
+ }
+}
+EXPORT_SYMBOL(module_put);
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using it. */
+ seq_printf(m, " - -");
+}
+
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static inline int use_module(struct module *a, struct module *b)
+{
+ return strong_try_module_get(b) == 0;
+}
+
+static inline void module_unload_init(struct module *mod)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+static ssize_t show_initstate(struct module_attribute *mattr,
+ struct module *mod, char *buffer)
+{
+ const char *state = "unknown";
+
+ switch (mod->state) {
+ case MODULE_STATE_LIVE:
+ state = "live";
+ break;
+ case MODULE_STATE_COMING:
+ state = "coming";
+ break;
+ case MODULE_STATE_GOING:
+ state = "going";
+ break;
+ }
+ return sprintf(buffer, "%s\n", state);
+}
+
+static struct module_attribute initstate = {
+ .attr = { .name = "initstate", .mode = 0444 },
+ .show = show_initstate,
+};
+
+static struct module_attribute *modinfo_attrs[] = {
+ &modinfo_version,
+ &modinfo_srcversion,
+ &initstate,
+#ifdef CONFIG_MODULE_UNLOAD
+ &refcnt,
+#endif
+ NULL,
+};
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+static int try_to_force_load(struct module *mod, const char *symname)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+ if (!test_taint(TAINT_FORCED_MODULE))
+ printk("%s: no version for \"%s\" found: kernel tainted.\n",
+ mod->name, symname);
+ add_taint_module(mod, TAINT_FORCED_MODULE);
+ return 0;
+#else
+ return -ENOEXEC;
+#endif
+}
+
+#ifdef CONFIG_MODVERSIONS
+static int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *symname,
+ struct module *mod,
+ const unsigned long *crc)
+{
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *) sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ if (versions[i].crc == *crc)
+ return 1;
+ DEBUGP("Found checksum %lX vs module %lX\n",
+ *crc, versions[i].crc);
+ goto bad_version;
+ }
+
+ printk(KERN_WARNING "%s: no symbol version for %s\n",
+ mod->name, symname);
+ return 0;
+
+bad_version:
+ printk("%s: disagrees about version of symbol %s\n",
+ mod->name, symname);
+ return 0;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ struct module *mod)
+{
+ const unsigned long *crc;
+
+ if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
+ BUG();
+ return check_version(sechdrs, versindex, "struct_module", mod, crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+static inline int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
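+
+/*
+ * Example with hypothetical vermagic strings: when CRCs are available the
+ * leading kernel release is skipped, so "2.6.28 SMP mod_unload " and
+ * "2.6.28.1 SMP mod_unload " compare equal, while a module missing the
+ * "SMP " flag would still be rejected.
+ */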
+#else
+static inline int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *symname,
+ struct module *mod,
+ const unsigned long *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
+
+/* Resolve a symbol for this module. I.e. if we find one, record usage.
+ Must be holding module_mutex. */
+static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *name,
+ struct module *mod)
+{
+ struct module *owner;
+ unsigned long ret;
+ const unsigned long *crc;
+
+ ret = find_symbol(name, &owner, &crc,
+ !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
+ if (!IS_ERR_VALUE(ret)) {
+ /* use_module can fail due to OOM,
+ or module initialization or unloading */
+ if (!check_version(sechdrs, versindex, name, mod, crc) ||
+ !use_module(mod, owner))
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
+struct module_sect_attr
+{
+ struct module_attribute mattr;
+ char *name;
+ unsigned long address;
+};
+
+struct module_sect_attrs
+{
+ struct attribute_group grp;
+ unsigned int nsections;
+ struct module_sect_attr attrs[0];
+};
+
+static ssize_t module_sect_show(struct module_attribute *mattr,
+ struct module *mod, char *buf)
+{
+ struct module_sect_attr *sattr =
+ container_of(mattr, struct module_sect_attr, mattr);
+ return sprintf(buf, "0x%lx\n", sattr->address);
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ unsigned int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, unsigned int nsect,
+ char *secstrings, Elf_Shdr *sechdrs)
+{
+ unsigned int nloaded = 0, i, size[2];
+ struct module_sect_attrs *sect_attrs;
+ struct module_sect_attr *sattr;
+ struct attribute **gattr;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < nsect; i++)
+ if (sechdrs[i].sh_flags & SHF_ALLOC)
+ nloaded++;
+ size[0] = ALIGN(sizeof(*sect_attrs)
+ + nloaded * sizeof(sect_attrs->attrs[0]),
+ sizeof(sect_attrs->grp.attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (sect_attrs == NULL)
+ return;
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+
+ sect_attrs->nsections = 0;
+ sattr = &sect_attrs->attrs[0];
+ gattr = &sect_attrs->grp.attrs[0];
+ for (i = 0; i < nsect; i++) {
+ if (! (sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+ sattr->address = sechdrs[i].sh_addr;
+ sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+ GFP_KERNEL);
+ if (sattr->name == NULL)
+ goto out;
+ sect_attrs->nsections++;
+ sattr->mattr.show = module_sect_show;
+ sattr->mattr.store = NULL;
+ sattr->mattr.attr.name = sattr->name;
+ sattr->mattr.attr.mode = S_IRUGO;
+ *(gattr++) = &(sattr++)->mattr.attr;
+ }
+ *gattr = NULL;
+
+ if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return;
+ out:
+ free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /* We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately. */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct kobject *dir;
+ unsigned int notes;
+ struct bin_attribute attrs[0];
+};
+
+static ssize_t module_notes_read(struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ /*
+ * The caller checked the pos and count against our size.
+ */
+ memcpy(buf, bin_attr->private + pos, count);
+ return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+ unsigned int i)
+{
+ if (notes_attrs->dir) {
+ while (i-- > 0)
+ sysfs_remove_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]);
+ kobject_put(notes_attrs->dir);
+ }
+ kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, unsigned int nsect,
+ char *secstrings, Elf_Shdr *sechdrs)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ struct bin_attribute *nattr;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < nsect; i++)
+ if ((sechdrs[i].sh_flags & SHF_ALLOC) &&
+ (sechdrs[i].sh_type == SHT_NOTE))
+ ++notes;
+
+ if (notes == 0)
+ return;
+
+ notes_attrs = kzalloc(sizeof(*notes_attrs)
+ + notes * sizeof(notes_attrs->attrs[0]),
+ GFP_KERNEL);
+ if (notes_attrs == NULL)
+ return;
+
+ notes_attrs->notes = notes;
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < nsect; ++i) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+ if (sechdrs[i].sh_type == SHT_NOTE) {
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
+ nattr->attr.mode = S_IRUGO;
+ nattr->size = sechdrs[i].sh_size;
+ nattr->private = (void *) sechdrs[i].sh_addr;
+ nattr->read = module_notes_read;
+ ++nattr;
+ }
+ ++loaded;
+ }
+
+ notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+ if (!notes_attrs->dir)
+ goto out;
+
+ for (i = 0; i < notes; ++i)
+ if (sysfs_create_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]))
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return;
+
+ out:
+ free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs)
+ free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else
+
+static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
+ char *sectstrings, Elf_Shdr *sechdrs)
+{
+}
+
+static inline void remove_sect_attrs(struct module *mod)
+{
+}
+
+static inline void add_notes_attrs(struct module *mod, unsigned int nsect,
+ char *sectstrings, Elf_Shdr *sechdrs)
+{
+}
+
+static inline void remove_notes_attrs(struct module *mod)
+{
+}
+#endif
+
+#ifdef CONFIG_SYSFS
+int module_add_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (ARRAY_SIZE(modinfo_attrs) + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+ if (!attr->test ||
+ (attr->test && attr->test(mod))) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+ ++temp_attr;
+ }
+ }
+ return error;
+}
+
+void module_remove_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_sysfs_initialized) {
+ printk(KERN_ERR "%s: module sysfs not initialized\n",
+ mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ printk(KERN_ERR "%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ kobject_put(&mod->mkobj.kobj);
+
+ /* delay uevent until full sysfs population */
+out:
+ return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+ return 0;
+
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ kobject_put(&mod->mkobj.kobj);
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ kobject_put(&mod->mkobj.kobj);
+}
+
+#else /* CONFIG_SYSFS */
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+#endif /* CONFIG_SYSFS */
+
+static void mod_kobject_remove(struct module *mod)
+{
+ module_remove_modinfo_attrs(mod);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+/*
+ * unlink the module while the whole machine is stopped with interrupts off
+ * - this defends against kallsyms not taking locks
+ */
+static int __unlink_module(void *_mod)
+{
+ struct module *mod = _mod;
+ list_del(&mod->list);
+ return 0;
+}
+
+/* Free a module, remove from lists, etc (must hold module_mutex). */
+static void free_module(struct module *mod)
+{
+ /* Delete from various lists */
+ stop_machine(__unlink_module, mod, NULL);
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_remove(mod);
+
+ unwind_remove_table(mod->unwind_info, 0);
+
+ /* Arch-specific cleanup. */
+ module_arch_cleanup(mod);
+
+ /* Module unload stuff */
+ module_unload_free(mod);
+
+ /* release any pointers to mcount in this module */
+ ftrace_release(mod->module_core, mod->core_size);
+
+ /* This may be NULL, but that's OK */
+ module_free(mod, mod->module_init);
+ kfree(mod->args);
+ if (mod->percpu)
+ percpu_modfree(mod->percpu);
+
+ /* Free lock-classes: */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
+ /* Finally, free the core (containing the module structure) */
+ module_free(mod, mod->module_core);
+}
+
+void *__symbol_get(const char *symbol)
+{
+ struct module *owner;
+ unsigned long value;
+
+ preempt_disable();
+ value = find_symbol(symbol, &owner, NULL, true, true);
+ if (IS_ERR_VALUE(value))
+ value = 0;
+ else if (strong_try_module_get(owner))
+ value = 0;
+ preempt_enable();
+
+ return (void *)value;
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/*
+ * Ensure that an exported symbol [global namespace] does not already exist
+ * in the kernel or in some other module's exported symbol table.
+ */
+static int verify_export_symbols(struct module *mod)
+{
+ unsigned int i;
+ struct module *owner;
+ const struct kernel_symbol *s;
+ struct {
+ const struct kernel_symbol *sym;
+ unsigned int num;
+ } arr[] = {
+ { mod->syms, mod->num_syms },
+ { mod->gpl_syms, mod->num_gpl_syms },
+ { mod->gpl_future_syms, mod->num_gpl_future_syms },
+#ifdef CONFIG_UNUSED_SYMBOLS
+ { mod->unused_syms, mod->num_unused_syms },
+ { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+ if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
+ NULL, true, false))) {
+ printk(KERN_ERR
+ "%s: exports duplicate symbol %s"
+ " (owned by %s)\n",
+ mod->name, s->name, module_name(owner));
+ return -ENOEXEC;
+ }
+ }
+ }
+ return 0;
+}
+
+/* Change all symbols so that st_value encodes the pointer directly. */
+static int simplify_symbols(Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ const char *strtab,
+ unsigned int versindex,
+ unsigned int pcpuindex,
+ struct module *mod)
+{
+ Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+ unsigned long secbase;
+ unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+ int ret = 0;
+
+ for (i = 1; i < n; i++) {
+ switch (sym[i].st_shndx) {
+ case SHN_COMMON:
+ /* We compiled with -fno-common. These are not
+ supposed to happen. */
+ DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
+ printk("%s: please compile with -fno-common\n",
+ mod->name);
+ ret = -ENOEXEC;
+ break;
+
+ case SHN_ABS:
+ /* Don't need to do anything */
+ DEBUGP("Absolute symbol: 0x%08lx\n",
+ (long)sym[i].st_value);
+ break;
+
+ case SHN_UNDEF:
+ sym[i].st_value
+ = resolve_symbol(sechdrs, versindex,
+ strtab + sym[i].st_name, mod);
+
+ /* Ok if resolved. */
+ if (!IS_ERR_VALUE(sym[i].st_value))
+ break;
+ /* Ok if weak. */
+ if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ break;
+
+ printk(KERN_WARNING "%s: Unknown symbol %s\n",
+ mod->name, strtab + sym[i].st_name);
+ ret = -ENOENT;
+ break;
+
+ default:
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == pcpuindex)
+ secbase = (unsigned long)mod->percpu;
+ else
+ secbase = sechdrs[sym[i].st_shndx].sh_addr;
+ sym[i].st_value += secbase;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Update size with this section: return offset. */
+static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
+{
+ long ret;
+
+ ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
+ *size = ret + sechdr->sh_size;
+ return ret;
+}
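+
+/*
+ * Worked example (values invented for illustration): if *size is
+ * currently 100 and the next section has sh_addralign == 32 and
+ * sh_size == 60, get_offset() returns ALIGN(100, 32) == 128 and
+ * leaves *size at 128 + 60 == 188.
+ */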
+
+/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ might -- code, read-only data, read-write data, small data. Tally
+ sizes, and place the offsets into sh_entsize fields: high bit means it
+ belongs in init. */
+static void layout_sections(struct module *mod,
+ const Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ static unsigned long const masks[][2] = {
+ /* NOTE: all executable code must be the first section
+ * in this array; otherwise modify the text_size
+ * finder in the two loops below */
+ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ };
+ unsigned int m, i;
+
+ for (i = 0; i < hdr->e_shnum; i++)
+ sechdrs[i].sh_entsize = ~0UL;
+
+ DEBUGP("Core section allocation order:\n");
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ for (i = 0; i < hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &sechdrs[i];
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || strncmp(secstrings + s->sh_name,
+ ".init", 5) == 0)
+ continue;
+ s->sh_entsize = get_offset(&mod->core_size, s);
+ DEBUGP("\t%s\n", secstrings + s->sh_name);
+ }
+ if (m == 0)
+ mod->core_text_size = mod->core_size;
+ }
+
+ DEBUGP("Init section allocation order:\n");
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ for (i = 0; i < hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &sechdrs[i];
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || strncmp(secstrings + s->sh_name,
+ ".init", 5) != 0)
+ continue;
+ s->sh_entsize = (get_offset(&mod->init_size, s)
+ | INIT_OFFSET_MASK);
+ DEBUGP("\t%s\n", secstrings + s->sh_name);
+ }
+ if (m == 0)
+ mod->init_text_size = mod->init_size;
+ }
+}
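+
+/*
+ * Concretely (section names are just the usual examples): a module with
+ * .text, .rodata, .data and .bss gets .text laid out first, so that
+ * core_text_size covers the executable part, followed by the read-only
+ * and then the writable/NOBITS data, while the .init.* counterparts are
+ * tallied into init_size with INIT_OFFSET_MASK set in their sh_entsize.
+ */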
+
+static void set_license(struct module *mod, const char *license)
+{
+ if (!license)
+ license = "unspecified";
+
+ if (!license_is_gpl_compatible(license)) {
+ if (!test_taint(TAINT_PROPRIETARY_MODULE))
+ printk(KERN_WARNING "%s: module license '%s' taints "
+ "kernel.\n", mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ }
+}
+
+/* Parse tag=value strings from .modinfo section */
+static char *next_string(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
+static char *get_modinfo(Elf_Shdr *sechdrs,
+ unsigned int info,
+ const char *tag)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ unsigned long size = sechdrs[info].sh_size;
+
+ for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
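+
+/*
+ * Illustration (hypothetical .modinfo contents): if the section holds
+ * the NUL-terminated strings
+ *
+ *   license=GPL\0author=Jane Doe\0description=example driver\0
+ *
+ * then get_modinfo(sechdrs, infoindex, "license") returns a pointer to
+ * "GPL", with next_string() stepping over each terminator and any
+ * padding to reach the following tag=value pair.
+ */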
+
+static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
+ unsigned int infoindex)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->setup)
+ attr->setup(mod,
+ get_modinfo(sechdrs,
+ infoindex,
+ attr->attr.name));
+ }
+}
+
+#ifdef CONFIG_KALLSYMS
+
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ const struct kernel_symbol *ks = start;
+ for (; ks < stop; ks++)
+ if (strcmp(ks->name, name) == 0)
+ return ks;
+ return NULL;
+}
+
+static int is_exported(const char *name, const struct module *mod)
+{
+ if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
+ return 1;
+ else
+ if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
+ return 1;
+ else
+ return 0;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ struct module *mod)
+{
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
+ && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug", strlen(".debug")) == 0)
+ return 'n';
+ return '?';
+}
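+
+/*
+ * For example (following the checks above): a symbol in an
+ * SHF_EXECINSTR section is reported as 't', one in writable SHF_ALLOC
+ * data as 'd', one in a SHT_NOBITS section (.bss) as 'b', and an
+ * unresolved symbol as 'U' -- matching the single-letter codes printed
+ * by nm(1).
+ */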
+
+static void add_kallsyms(struct module *mod,
+ Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ unsigned int strindex,
+ const char *secstrings)
+{
+ unsigned int i;
+
+ mod->symtab = (void *)sechdrs[symindex].sh_addr;
+ mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+ mod->strtab = (void *)sechdrs[strindex].sh_addr;
+
+ /* Set types up while we still have access to sections. */
+ for (i = 0; i < mod->num_symtab; i++)
+ mod->symtab[i].st_info
+ = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+}
+#else
+static inline void add_kallsyms(struct module *mod,
+ Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ unsigned int strindex,
+ const char *secstrings)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
+{
+#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+ unsigned int i;
+
+ for (i = 0; i < num; i++) {
+ register_dynamic_debug_module(debug[i].modname,
+ debug[i].type,
+ debug[i].logical_modname,
+ debug[i].flag_names,
+ debug[i].hash, debug[i].hash2);
+ }
+#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+}
+
+static void *module_alloc_update_bounds(unsigned long size)
+{
+ void *ret = module_alloc(size);
+
+ if (ret) {
+ /* Update module bounds. */
+ if ((unsigned long)ret < module_addr_min)
+ module_addr_min = (unsigned long)ret;
+ if ((unsigned long)ret + size > module_addr_max)
+ module_addr_max = (unsigned long)ret + size;
+ }
+ return ret;
+}
+
+/* Allocate and load the module: note that size of section 0 is always
+ zero, and we rely on this for optional sections. */
+static noinline struct module *load_module(void __user *umod,
+ unsigned long len,
+ const char __user *uargs)
+{
+ Elf_Ehdr *hdr;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *args, *modmagic, *strtab = NULL;
+ char *staging;
+ unsigned int i;
+ unsigned int symindex = 0;
+ unsigned int strindex = 0;
+ unsigned int modindex, versindex, infoindex, pcpuindex;
+ unsigned int unwindex = 0;
+ unsigned int num_kp, num_mcount;
+ struct kernel_param *kp;
+ struct module *mod;
+ long err = 0;
+ void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+ unsigned long *mseg;
+ mm_segment_t old_fs;
+
+ DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+ if (len < sizeof(*hdr))
+ return ERR_PTR(-ENOEXEC);
+
+ /* Suck in entire file: we'll want most of it. */
+ /* vmalloc barfs on "unusual" numbers. Check here */
+ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (copy_from_user(hdr, umod, len) != 0) {
+ err = -EFAULT;
+ goto free_hdr;
+ }
+
+ /* Sanity checks against insmoding binaries or wrong arch,
+ weird ELF version */
+ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
+ || hdr->e_type != ET_REL
+ || !elf_check_arch(hdr)
+ || hdr->e_shentsize != sizeof(*sechdrs)) {
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
+ goto truncated;
+
+ /* Convenience variables */
+ sechdrs = (void *)hdr + hdr->e_shoff;
+ secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+ sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_NOBITS
+ && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
+ goto truncated;
+
+ /* Record each section's address within the temporary
+ image in its sh_addr field. */
+ sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
+
+ /* Internal symbols and strings. */
+ if (sechdrs[i].sh_type == SHT_SYMTAB) {
+ symindex = i;
+ strindex = sechdrs[i].sh_link;
+ strtab = (char *)hdr + sechdrs[strindex].sh_offset;
+ }
+#ifndef CONFIG_MODULE_UNLOAD
+ /* Don't load .exit sections */
+ if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
+ sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
+#endif
+ }
+
+ modindex = find_sec(hdr, sechdrs, secstrings,
+ ".gnu.linkonce.this_module");
+ if (!modindex) {
+ printk(KERN_WARNING "No module found in object\n");
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+ /* This is temporary: point mod into copy of data. */
+ mod = (void *)sechdrs[modindex].sh_addr;
+
+ if (symindex == 0) {
+ printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
+ mod->name);
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
+ infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
+ pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
+#ifdef ARCH_UNWIND_SECTION_NAME
+ unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
+#endif
+
+ /* Don't keep modinfo and version sections. */
+ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+#ifdef CONFIG_KALLSYMS
+ /* Keep symbol and string tables for decoding later. */
+ sechdrs[symindex].sh_flags |= SHF_ALLOC;
+ sechdrs[strindex].sh_flags |= SHF_ALLOC;
+#endif
+ if (unwindex)
+ sechdrs[unwindex].sh_flags |= SHF_ALLOC;
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(sechdrs, versindex, mod)) {
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+ err = try_to_force_load(mod, "magic");
+ if (err)
+ goto free_hdr;
+ } else if (!same_magic(modmagic, vermagic, versindex)) {
+ printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
+ mod->name, modmagic, vermagic);
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ staging = get_modinfo(sechdrs, infoindex, "staging");
+ if (staging) {
+ add_taint_module(mod, TAINT_CRAP);
+ printk(KERN_WARNING "%s: module is from the staging directory,"
+ " the quality is unknown, you have been warned.\n",
+ mod->name);
+ }
+
+ /* Now copy in args */
+ args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(args)) {
+ err = PTR_ERR(args);
+ goto free_hdr;
+ }
+
+ if (find_module(mod->name)) {
+ err = -EEXIST;
+ goto free_mod;
+ }
+
+ mod->state = MODULE_STATE_COMING;
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
+ if (err < 0)
+ goto free_mod;
+
+ if (pcpuindex) {
+ /* We have a special allocation for this section. */
+ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
+ sechdrs[pcpuindex].sh_addralign,
+ mod->name);
+ if (!percpu) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+ sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ mod->percpu = percpu;
+ }
+
+ /* Determine total sizes, and put offsets in sh_entsize. For now
+ this is done generically; there don't appear to be any
+ special cases for the architectures. */
+ layout_sections(mod, hdr, sechdrs, secstrings);
+
+ /* Do the allocs. */
+ ptr = module_alloc_update_bounds(mod->core_size);
+ if (!ptr) {
+ err = -ENOMEM;
+ goto free_percpu;
+ }
+ memset(ptr, 0, mod->core_size);
+ mod->module_core = ptr;
+
+ ptr = module_alloc_update_bounds(mod->init_size);
+ if (!ptr && mod->init_size) {
+ err = -ENOMEM;
+ goto free_core;
+ }
+ memset(ptr, 0, mod->init_size);
+ mod->module_init = ptr;
+
+ /* Transfer each section which specifies SHF_ALLOC */
+ DEBUGP("final section addresses:\n");
+ for (i = 0; i < hdr->e_shnum; i++) {
+ void *dest;
+
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
+ dest = mod->module_init
+ + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
+ else
+ dest = mod->module_core + sechdrs[i].sh_entsize;
+
+ if (sechdrs[i].sh_type != SHT_NOBITS)
+ memcpy(dest, (void *)sechdrs[i].sh_addr,
+ sechdrs[i].sh_size);
+ /* Update sh_addr to point to copy in image. */
+ sechdrs[i].sh_addr = (unsigned long)dest;
+ DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
+ }
+ /* Module has been moved. */
+ mod = (void *)sechdrs[modindex].sh_addr;
+
+ /* Now we've moved module, initialize linked lists, etc. */
+ module_unload_init(mod);
+
+ /* add kobject, so we can reference it. */
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up license info based on the info section */
+ set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE);
+
+ /* driverloader was caught wrongly pretending to be under GPL */
+ if (strcmp(mod->name, "driverloader") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+
+ /* Set up MODINFO_ATTR fields */
+ setup_modinfo(mod, sechdrs, infoindex);
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
+ mod);
+ if (err < 0)
+ goto cleanup;
+
+ /* Now we've got everything in the final locations, we can
+ * find optional sections. */
+ kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
+ &num_kp);
+ mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
+ mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+ mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_gpl_future",
+ sizeof(*mod->gpl_future_syms),
+ &mod->num_gpl_future_syms);
+ mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_gpl_future");
+
+#ifdef CONFIG_UNUSED_SYMBOLS
+ mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_unused",
+ sizeof(*mod->unused_syms),
+ &mod->num_unused_syms);
+ mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_unused");
+ mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_unused_gpl",
+ sizeof(*mod->unused_gpl_syms),
+ &mod->num_unused_gpl_syms);
+ mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_unused_gpl");
+#endif
+
+#ifdef CONFIG_MARKERS
+ mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
+ sizeof(*mod->markers), &mod->num_markers);
+#endif
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
+ "__tracepoints",
+ sizeof(*mod->tracepoints),
+ &mod->num_tracepoints);
+#endif
+
+#ifdef CONFIG_MODVERSIONS
+ if ((mod->num_syms && !mod->crcs)
+ || (mod->num_gpl_syms && !mod->gpl_crcs)
+ || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
+#ifdef CONFIG_UNUSED_SYMBOLS
+ || (mod->num_unused_syms && !mod->unused_crcs)
+ || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
+#endif
+ ) {
+ printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
+ err = try_to_force_load(mod, "nocrc");
+ if (err)
+ goto cleanup;
+ }
+#endif
+
+ /* Now do relocations. */
+ for (i = 1; i < hdr->e_shnum; i++) {
+ const char *strtab = (char *)sechdrs[strindex].sh_addr;
+ unsigned int info = sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (info >= hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(sechdrs[info].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(sechdrs, strtab, symindex, i,mod);
+ else if (sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(sechdrs, strtab, symindex, i,
+ mod);
+ if (err < 0)
+ goto cleanup;
+ }
+
+ /* Find duplicate symbols */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto cleanup;
+
+ /* Set up and sort exception table */
+ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Finally, copy percpu area over. */
+ percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+ sechdrs[pcpuindex].sh_size);
+
+ add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+
+ if (!mod->taints) {
+ struct mod_debug *debug;
+ unsigned int num_debug;
+
+#ifdef CONFIG_MARKERS
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+#endif
+ debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
+ sizeof(*debug), &num_debug);
+ dynamic_printk_setup(debug, num_debug);
+
+#ifdef CONFIG_TRACEPOINTS
+ tracepoint_update_probe_range(mod->tracepoints,
+ mod->tracepoints + mod->num_tracepoints);
+#endif
+ }
+
+ /* sechdrs[0].sh_size is always zero */
+ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
+ sizeof(*mseg), &num_mcount);
+ ftrace_init_module(mseg, mseg + num_mcount);
+
+ err = module_finalize(hdr, sechdrs, mod);
+ if (err < 0)
+ goto cleanup;
+
+ /* flush the icache in correct context */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ /*
+ * Flush the instruction cache, since we've played with text.
+ * Do it before processing of module parameters, so the module
+ * can provide parameter accessor functions of its own.
+ */
+ if (mod->module_init)
+ flush_icache_range((unsigned long)mod->module_init,
+ (unsigned long)mod->module_init
+ + mod->init_size);
+ flush_icache_range((unsigned long)mod->module_core,
+ (unsigned long)mod->module_core + mod->core_size);
+
+ set_fs(old_fs);
+
+ mod->args = args;
+ if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
+ printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+ mod->name);
+
+ /* Now sew it into the lists so we can get lockdep and oops
+ * info during argument parsing. No one should access us, since
+ * strong_try_module_get() will fail.
+ * lockdep/oops can run asynchronously, so use the RCU list insertion
+ * function to insert in a way that is safe for concurrent readers.
+ * The mutex protects against concurrent writers.
+ */
+ list_add_rcu(&mod->list, &modules);
+
+ err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
+ if (err < 0)
+ goto unlink;
+
+ err = mod_sysfs_setup(mod, kp, num_kp);
+ if (err < 0)
+ goto unlink;
+ add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
+ add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
+
+ /* Size of section 0 is 0, so this works well if no unwind info. */
+ mod->unwind_info = unwind_add_table(mod,
+ (void *)sechdrs[unwindex].sh_addr,
+ sechdrs[unwindex].sh_size);
+
+ /* Get rid of temporary copy */
+ vfree(hdr);
+
+ /* Done! */
+ return mod;
+
+ unlink:
+ stop_machine(__unlink_module, mod, NULL);
+ module_arch_cleanup(mod);
+ cleanup:
+ kobject_del(&mod->mkobj.kobj);
+ kobject_put(&mod->mkobj.kobj);
+ ftrace_release(mod->module_core, mod->core_size);
+ free_unload:
+ module_unload_free(mod);
+ module_free(mod, mod->module_init);
+ free_core:
+ module_free(mod, mod->module_core);
+ free_percpu:
+ if (percpu)
+ percpu_modfree(percpu);
+ free_mod:
+ kfree(args);
+ free_hdr:
+ vfree(hdr);
+ return ERR_PTR(err);
+
+ truncated:
+ printk(KERN_ERR "Module len %lu truncated\n", len);
+ err = -ENOEXEC;
+ goto free_hdr;
+}
+
+/* This is where the real work happens */
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+ unsigned long, len, const char __user *, uargs)
+{
+ struct module *mod;
+ int ret = 0;
+
+ /* Must have permission */
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+
+ /* Only one module load at a time, please */
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ /* Do all the hard work */
+ mod = load_module(umod, len, uargs);
+ if (IS_ERR(mod)) {
+ mutex_unlock(&module_mutex);
+ return PTR_ERR(mod);
+ }
+
+ /* Drop lock so they can recurse */
+ mutex_unlock(&module_mutex);
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ /* Init routine failed: abort. Try to protect us from
+ buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ mutex_lock(&module_mutex);
+ free_module(mod);
+ mutex_unlock(&module_mutex);
+ wake_up(&module_wq);
+ return ret;
+ }
+ if (ret > 0) {
+ printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+ "it should follow 0/-E convention\n"
+ KERN_WARNING "%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! Wake up anyone waiting for it. */
+ mod->state = MODULE_STATE_LIVE;
+ wake_up(&module_wq);
+
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ unwind_remove_table(mod->unwind_info, 1);
+ module_free(mod, mod->module_init);
+ mod->module_init = NULL;
+ mod->init_size = 0;
+ mod->init_text_size = 0;
+ mutex_unlock(&module_mutex);
+
+ return 0;
+}
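+
+/*
+ * For reference, a minimal user-space sketch of driving this syscall
+ * (error handling trimmed; the "debug=1" parameter string is made up):
+ *
+ *   image = <whole .ko file read into memory>;
+ *   len   = <its size in bytes>;
+ *   if (syscall(__NR_init_module, image, len, "debug=1") != 0)
+ *           perror("init_module");
+ *
+ * The third argument arrives here as 'uargs' and is parsed by
+ * parse_args() inside load_module().
+ */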
+
+static inline int within(unsigned long addr, void *start, unsigned long size)
+{
+ return ((void *)addr >= start && (void *)addr < start + size);
+}
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+ return str[0] == '$' && strchr("atd", str[1])
+ && (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *get_ksymbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval;
+
+ /* At worst, the next value is at the end of the module */
+ if (within(addr, mod->module_init, mod->init_size))
+ nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ else
+ nextval = (unsigned long)mod->module_core+mod->core_text_size;
+
+ /* Scan for the closest preceding symbol, and the next symbol. (ELF
+ starts real symbols at 1). */
+ for (i = 1; i < mod->num_symtab; i++) {
+ if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ continue;
+
+ /* We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim. */
+ if (mod->symtab[i].st_value <= addr
+ && mod->symtab[i].st_value > mod->symtab[best].st_value
+ && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+ && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ best = i;
+ if (mod->symtab[i].st_value > addr
+ && mod->symtab[i].st_value < nextval
+ && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+ && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ nextval = mod->symtab[i].st_value;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - mod->symtab[best].st_value;
+ if (offset)
+ *offset = addr - mod->symtab[best].st_value;
+ return mod->strtab + mod->symtab[best].st_name;
+}
+
+/* For kallsyms to ask for address resolution. NULL means not found. We must
+ * be careful not to take locks (to avoid deadlocking on oopses), so simply
+ * disable preemption. */
+const char *module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ char *namebuf)
+{
+ struct module *mod;
+ const char *ret = NULL;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size)
+ || within(addr, mod->module_core, mod->core_size)) {
+ if (modname)
+ *modname = mod->name;
+ ret = get_ksymbol(mod, addr, size, offset);
+ break;
+ }
+ }
+ /* Make a copy in here where it's safe */
+ if (ret) {
+ strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+ ret = namebuf;
+ }
+ preempt_enable();
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+ strlcpy(symname, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strlcpy(modname, mod->name, MODULE_NAME_LEN);
+ if (name)
+ strlcpy(name, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (symnum < mod->num_symtab) {
+ *value = mod->symtab[symnum].st_value;
+ *type = mod->symtab[symnum].st_info;
+ strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+ KSYM_NAME_LEN);
+ strlcpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, mod);
+ preempt_enable();
+ return 0;
+ }
+ symnum -= mod->num_symtab;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+static unsigned long mod_find_symname(struct module *mod, const char *name)
+{
+ unsigned int i;
+
+ for (i = 0; i < mod->num_symtab; i++)
+ if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
+ mod->symtab[i].st_info != 'U')
+ return mod->symtab[i].st_value;
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+ unsigned long ret = 0;
+
+ /* Don't lock: we're in enough trouble already. */
+ preempt_disable();
+ if ((colon = strchr(name, ':')) != NULL) {
+ *colon = '\0';
+ if ((mod = find_module(name)) != NULL)
+ ret = mod_find_symname(mod, colon+1);
+ *colon = ':';
+ } else {
+ list_for_each_entry_rcu(mod, &modules, list)
+ if ((ret = mod_find_symname(mod, name)) != 0)
+ break;
+ }
+ preempt_enable();
+ return ret;
+}
+#endif /* CONFIG_KALLSYMS */
+
+static char *module_flags(struct module *mod, char *buf)
+{
+ int bx = 0;
+
+ if (mod->taints ||
+ mod->state == MODULE_STATE_GOING ||
+ mod->state == MODULE_STATE_COMING) {
+ buf[bx++] = '(';
+ if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
+ buf[bx++] = 'P';
+ if (mod->taints & (1 << TAINT_FORCED_MODULE))
+ buf[bx++] = 'F';
+ if (mod->taints & (1 << TAINT_CRAP))
+ buf[bx++] = 'C';
+ /*
+ * TAINT_FORCED_RMMOD: could be added.
+ * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * apply to modules.
+ */
+
+ /* Show a - for module-is-being-unloaded */
+ if (mod->state == MODULE_STATE_GOING)
+ buf[bx++] = '-';
+ /* Show a + for module-is-being-loaded */
+ if (mod->state == MODULE_STATE_COMING)
+ buf[bx++] = '+';
+ buf[bx++] = ')';
+ }
+ buf[bx] = '\0';
+
+ return buf;
+}
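+
+/*
+ * Example (made-up module state): a force-loaded proprietary module
+ * that is currently being removed comes out of module_flags() as
+ * "(PF-)".
+ */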
+
+#ifdef CONFIG_PROC_FS
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[8];
+
+ seq_printf(m, "%s %u",
+ mod->name, mod->init_size + mod->core_size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading":
+ mod->state == MODULE_STATE_COMING ? "Loading":
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ seq_printf(m, " 0x%p", mod->module_core);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf));
+
+ seq_printf(m, "\n");
+ return 0;
+}
+
+/* Format: modulename size refcount deps address
+
+ Where refcount is a number or -, and deps is a comma-separated list
+ of depends or -.
+*/
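+
+/*
+ * An illustrative line (all values made up) looks roughly like:
+ *
+ *   usbcore 146784 4 ehci_hcd,uhci_hcd, Live 0xf8a4e000
+ *
+ * i.e. name, size, use count, the modules using it (or -), state and
+ * load address, with the module_flags() taint string appended when
+ * present.
+ */
+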
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+static int modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &modules_op);
+}
+
+static const struct file_operations proc_modules_operations = {
+ .open = modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &proc_modules_operations);
+ return 0;
+}
+module_init(proc_modules_init);
+#endif
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+ const struct exception_table_entry *e = NULL;
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->num_exentries == 0)
+ continue;
+
+ e = search_extable(mod->extable,
+ mod->extable + mod->num_exentries - 1,
+ addr);
+ if (e)
+ break;
+ }
+ preempt_enable();
+
+ /* If we found one, we are currently running inside it, so the
+ module cannot be unloaded and no refcount is needed. */
+ return e;
+}
+
+/*
+ * Is this a valid module address?
+ */
+int is_module_address(unsigned long addr)
+{
+ struct module *mod;
+
+ preempt_disable();
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (within(addr, mod->module_core, mod->core_size)) {
+ preempt_enable();
+ return 1;
+ }
+ }
+
+ preempt_enable();
+
+ return 0;
+}
+
+
+/* Is this a valid kernel address? */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod;
+
+ if (addr < module_addr_min || addr > module_addr_max)
+ return NULL;
+
+ list_for_each_entry_rcu(mod, &modules, list)
+ if (within(addr, mod->module_init, mod->init_text_size)
+ || within(addr, mod->module_core, mod->core_text_size))
+ return mod;
+ return NULL;
+}
+
+struct module *module_text_address(unsigned long addr)
+{
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_text_address(addr);
+ preempt_enable();
+
+ return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+ struct module *mod;
+ char buf[8];
+
+ printk("Modules linked in:");
+ /* Most callers should already have preempt disabled, but make sure */
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list)
+ printk(" %s%s", mod->name, module_flags(mod, buf));
+ preempt_enable();
+ if (last_unloaded_module[0])
+ printk(" [last unloaded: %s]", last_unloaded_module);
+ printk("\n");
+}
+
+#ifdef CONFIG_MODVERSIONS
+/* Generate the signature for struct module here, too, for modversions. */
+void struct_module(struct module *mod) { return; }
+EXPORT_SYMBOL(struct_module);
+#endif
+
+#ifdef CONFIG_MARKERS
+void module_update_markers(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!mod->taints)
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ mutex_unlock(&module_mutex);
+}
+#endif
+
+#ifdef CONFIG_TRACEPOINTS
+void module_update_tracepoints(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!mod->taints)
+ tracepoint_update_probe_range(mod->tracepoints,
+ mod->tracepoints + mod->num_tracepoints);
+ mutex_unlock(&module_mutex);
+}
+
+/*
+ * Returns 0 if current not found.
+ * Returns 1 if current found.
+ */
+int module_get_iter_tracepoints(struct tracepoint_iter *iter)
+{
+ struct module *iter_mod;
+ int found = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(iter_mod, &modules, list) {
+ if (!iter_mod->taints) {
+ /*
+ * Sorted module list
+ */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->tracepoint = NULL;
+ found = tracepoint_get_iter_range(&iter->tracepoint,
+ iter_mod->tracepoints,
+ iter_mod->tracepoints
+ + iter_mod->num_tracepoints);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&module_mutex);
+ return found;
+}
+#endif
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
new file mode 100644
index 0000000..1d94160
--- /dev/null
+++ b/kernel/mutex-debug.c
@@ -0,0 +1,116 @@
+/*
+ * kernel/mutex-debug.c
+ *
+ * Debugging code for mutexes
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * lock debugging, locking tree, deadlock detection started by:
+ *
+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Released under the General Public License (GPL).
+ */
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/poison.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+
+#include "mutex-debug.h"
+
+/*
+ * Must be called with lock->wait_lock held.
+ */
+void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
+{
+ lock->owner = new_owner;
+}
+
+void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
+ waiter->magic = waiter;
+ INIT_LIST_HEAD(&waiter->list);
+}
+
+void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+ DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+}
+
+void debug_mutex_free_waiter(struct mutex_waiter *waiter)
+{
+ DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
+ memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
+}
+
+void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct thread_info *ti)
+{
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+
+ /* Mark the current thread as blocked on the lock: */
+ ti->task->blocked_on = waiter;
+ waiter->lock = lock;
+}
+
+void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct thread_info *ti)
+{
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+ DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
+ DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
+ ti->task->blocked_on = NULL;
+
+ list_del_init(&waiter->list);
+ waiter->task = NULL;
+}
+
+void debug_mutex_unlock(struct mutex *lock)
+{
+ if (unlikely(!debug_locks))
+ return;
+
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+ DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+}
+
+void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->owner = NULL;
+ lock->magic = lock;
+}
+
+/***
+ * mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void mutex_destroy(struct mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
+ lock->magic = NULL;
+}
+
+EXPORT_SYMBOL_GPL(mutex_destroy);
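+
+/*
+ * Usage sketch (illustration only -- 'example_obj' and its teardown are
+ * hypothetical): destroy the mutex once nothing can take it any more,
+ * typically just before the memory containing it is released.
+ */
+struct example_obj {
+ struct mutex lock;
+};
+
+static void example_obj_teardown(struct example_obj *obj)
+{
+ /* no path may hold or acquire obj->lock beyond this point */
+ mutex_destroy(&obj->lock);
+}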
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
new file mode 100644
index 0000000..babfbdf
--- /dev/null
+++ b/kernel/mutex-debug.h
@@ -0,0 +1,53 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal declarations,
+ * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
+ * More details are in kernel/mutex-debug.c.
+ */
+
+/*
+ * This must be called with lock->wait_lock held.
+ */
+extern void
+debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
+
+static inline void debug_mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct thread_info *ti);
+extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct thread_info *ti);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+
+#define spin_lock_mutex(lock, flags) \
+ do { \
+ struct mutex *l = container_of(lock, struct mutex, wait_lock); \
+ \
+ DEBUG_LOCKS_WARN_ON(in_interrupt()); \
+ local_irq_save(flags); \
+ __raw_spin_lock(&(lock)->raw_lock); \
+ DEBUG_LOCKS_WARN_ON(l->magic != l); \
+ } while (0)
+
+#define spin_unlock_mutex(lock, flags) \
+ do { \
+ __raw_spin_unlock(&(lock)->raw_lock); \
+ local_irq_restore(flags); \
+ preempt_check_resched(); \
+ } while (0)
diff --git a/kernel/mutex.c b/kernel/mutex.c
new file mode 100644
index 0000000..12c779d
--- /dev/null
+++ b/kernel/mutex.c
@@ -0,0 +1,387 @@
+/*
+ * kernel/mutex.c
+ *
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
+ * David Howells for suggestions and improvements.
+ *
+ * Also see Documentation/mutex-design.txt.
+ */
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+
+/*
+ * In the DEBUG case we are using the "NULL fastpath" for mutexes,
+ * which forces all calls into the slowpath:
+ */
+#ifdef CONFIG_DEBUG_MUTEXES
+# include "mutex-debug.h"
+# include <asm-generic/mutex-null.h>
+#else
+# include "mutex.h"
+# include <asm/mutex.h>
+#endif
+
+/***
+ * mutex_init - initialize the mutex
+ * @lock: the mutex to be initialized
+ * @key: the lock_class_key for the class; used by mutex lock debugging
+ *
+ * Initialize the mutex to unlocked state.
+ *
+ * It is not allowed to initialize an already locked mutex.
+ */
+void
+__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ atomic_set(&lock->count, 1);
+ spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+
+ debug_mutex_init(lock, name, key);
+}
+
+EXPORT_SYMBOL(__mutex_init);
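+
+/*
+ * Initialization sketch ('example_ctx' is invented for illustration): a
+ * mutex embedded in a dynamically allocated object is set up with
+ * mutex_init(), which expands to a call of __mutex_init() above;
+ * statically allocated mutexes use DEFINE_MUTEX() instead.
+ */
+struct example_ctx {
+ struct mutex lock;
+};
+
+static void example_ctx_setup(struct example_ctx *ctx)
+{
+ mutex_init(&ctx->lock);
+}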
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * We split the mutex lock/unlock logic into separate fastpath and
+ * slowpath functions, to reduce the register pressure on the fastpath.
+ * We also put the fastpath first in the kernel image, to make sure the
+ * branch is predicted by the CPU as default-untaken.
+ */
+static void noinline __sched
+__mutex_lock_slowpath(atomic_t *lock_count);
+
+/***
+ * mutex_lock - acquire the mutex
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex exclusively for this task. If the mutex is not
+ * available right now, it will sleep until it can get it.
+ *
+ * The mutex must later be released by the same task that
+ * acquired it. Recursive locking is not allowed. The task
+ * may not exit without first unlocking the mutex. Also, the kernel
+ * memory where the mutex resides must not be freed with
+ * the mutex still locked. The mutex must first be initialized
+ * (or statically defined) before it can be locked. memset()-ing
+ * the mutex to 0 is not allowed.
+ *
+ * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging. )
+ *
+ * This function is similar to (but not equivalent to) down().
+ */
+void inline __sched mutex_lock(struct mutex *lock)
+{
+ might_sleep();
+ /*
+ * The locking fastpath is the 1->0 transition from
+ * 'unlocked' into 'locked' state.
+ */
+ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_lock);
+#endif
+
+static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+
+/***
+ * mutex_unlock - release the mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a not locked mutex is not allowed.
+ *
+ * This function is similar to (but not equivalent to) up().
+ */
+void __sched mutex_unlock(struct mutex *lock)
+{
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+ __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_unlock);
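+
+/*
+ * Usage sketch (illustration only; the function and its arguments are
+ * hypothetical): hold the mutex only across the update of the shared
+ * state it protects, and release it on every path out.
+ */
+static void example_counted_inc(struct mutex *guard, unsigned long *counter)
+{
+ mutex_lock(guard);
+ (*counter)++;
+ mutex_unlock(guard);
+}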
+
+/*
+ * Lock a mutex (possibly interruptible), slowpath:
+ */
+static inline int __sched
+__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+ unsigned long ip)
+{
+ struct task_struct *task = current;
+ struct mutex_waiter waiter;
+ unsigned int old_val;
+ unsigned long flags;
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+
+ debug_mutex_lock_common(lock, &waiter);
+ mutex_acquire(&lock->dep_map, subclass, 0, ip);
+ debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+
+ /* add waiting tasks to the end of the waitqueue (FIFO): */
+ list_add_tail(&waiter.list, &lock->wait_list);
+ waiter.task = task;
+
+ old_val = atomic_xchg(&lock->count, -1);
+ if (old_val == 1)
+ goto done;
+
+ lock_contended(&lock->dep_map, ip);
+
+ for (;;) {
+ /*
+ * Let's try to take the lock again - this is needed even if
+ * we get here for the first time (shortly after failing to
+ * acquire the lock), to make sure that we get a wakeup once
+ * it's unlocked. Later on, if we sleep, this is the
+ * operation that gives us the lock. We xchg it to -1, so
+ * that when we release the lock, we properly wake up the
+ * other waiters:
+ */
+ old_val = atomic_xchg(&lock->count, -1);
+ if (old_val == 1)
+ break;
+
+ /*
+ * got a signal? (This code gets eliminated in the
+ * TASK_UNINTERRUPTIBLE case.)
+ */
+ if (unlikely(signal_pending_state(state, task))) {
+ mutex_remove_waiter(lock, &waiter,
+ task_thread_info(task));
+ mutex_release(&lock->dep_map, 1, ip);
+ spin_unlock_mutex(&lock->wait_lock, flags);
+
+ debug_mutex_free_waiter(&waiter);
+ return -EINTR;
+ }
+ __set_task_state(task, state);
+
+ /* didn't get the lock, go to sleep: */
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ schedule();
+ spin_lock_mutex(&lock->wait_lock, flags);
+ }
+
+done:
+ lock_acquired(&lock->dep_map);
+ /* got the lock - rejoice! */
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_set_owner(lock, task_thread_info(task));
+
+ /* set it to 0 if there are no waiters left: */
+ if (likely(list_empty(&lock->wait_list)))
+ atomic_set(&lock->count, 0);
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+
+ debug_mutex_free_waiter(&waiter);
+
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched
+mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+int __sched
+mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+#endif
+
+/*
+ * Release the lock, slowpath:
+ */
+static inline void
+__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
+
+ /*
+ * some architectures leave the lock unlocked in the fastpath failure
+ * case, others need to leave it locked. In the latter case we have to
+ * unlock it here.
+ */
+ if (__mutex_slowpath_needs_to_unlock())
+ atomic_set(&lock->count, 1);
+
+ if (!list_empty(&lock->wait_list)) {
+ /* get the first entry from the wait-list: */
+ struct mutex_waiter *waiter =
+ list_entry(lock->wait_list.next,
+ struct mutex_waiter, list);
+
+ debug_mutex_wake_waiter(lock, waiter);
+
+ wake_up_process(waiter->task);
+ }
+
+ debug_mutex_clear_owner(lock);
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+}
+
+/*
+ * Release the lock, slowpath:
+ */
+static noinline void
+__mutex_unlock_slowpath(atomic_t *lock_count)
+{
+ __mutex_unlock_common_slowpath(lock_count, 1);
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Here come the less common (and hence less performance-critical) APIs:
+ * mutex_lock_interruptible() and mutex_trylock().
+ */
+static noinline int __sched
+__mutex_lock_killable_slowpath(atomic_t *lock_count);
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
+
+/***
+ * mutex_lock_interruptible - acquire the mutex, interruptible
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex like mutex_lock(), and return 0 if the mutex has
+ * been acquired or sleep until the mutex becomes available. If a
+ * signal arrives while waiting for the lock then this function
+ * returns -EINTR.
+ *
+ * This function is similar to (but not equivalent to) down_interruptible().
+ */
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ might_sleep();
+ return __mutex_fastpath_lock_retval
+ (&lock->count, __mutex_lock_interruptible_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ might_sleep();
+ return __mutex_fastpath_lock_retval
+ (&lock->count, __mutex_lock_killable_slowpath);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
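+
+/*
+ * Usage sketch (illustration only; 'example_cfg_lock' and the value it
+ * protects are hypothetical): a sleepable path that gives up and
+ * propagates the error when a signal arrives while waiting for the
+ * lock.
+ */
+static DEFINE_MUTEX(example_cfg_lock);
+static unsigned long example_cfg_value;
+
+static int example_cfg_set(unsigned long val)
+{
+ int err = mutex_lock_interruptible(&example_cfg_lock);
+
+ if (err)
+ return err; /* -EINTR: interrupted while sleeping */
+ example_cfg_value = val;
+ mutex_unlock(&example_cfg_lock);
+ return 0;
+}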
+
+static noinline void __sched
+__mutex_lock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
+}
+
+static noinline int __sched
+__mutex_lock_killable_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
+}
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
+}
+#endif
+
+/*
+ * Spinlock-based trylock: we take the spinlock and check whether we
+ * can get the lock:
+ */
+static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
+ int prev;
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+
+ prev = atomic_xchg(&lock->count, -1);
+ if (likely(prev == 1)) {
+ debug_mutex_set_owner(lock, current_thread_info());
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
+ /* Set it back to 0 if there are no waiters: */
+ if (likely(list_empty(&lock->wait_list)))
+ atomic_set(&lock->count, 0);
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+
+ return prev == 1;
+}
+
+/***
+ * mutex_trylock - try to acquire the mutex, without waiting
+ * @lock: the mutex to be acquired
+ *
+ * Try to acquire the mutex atomically. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * its return value is the opposite of down_trylock()'s! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+int __sched mutex_trylock(struct mutex *lock)
+{
+ return __mutex_fastpath_trylock(&lock->count,
+ __mutex_trylock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_trylock);
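+
+/*
+ * Usage sketch (illustration only; the lock and the work are made up):
+ * opportunistic work that is simply skipped when someone else already
+ * holds the lock. Note the spin_trylock()-style return value: non-zero
+ * means the mutex was acquired.
+ */
+static DEFINE_MUTEX(example_scan_lock);
+
+static void example_scan_kick(void)
+{
+ if (!mutex_trylock(&example_scan_lock))
+ return; /* somebody else is already at it */
+ /* ... do the optional work here ... */
+ mutex_unlock(&example_scan_lock);
+}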
diff --git a/kernel/mutex.h b/kernel/mutex.h
new file mode 100644
index 0000000..a075daf
--- /dev/null
+++ b/kernel/mutex.h
@@ -0,0 +1,30 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal prototypes, for the
+ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
+ */
+
+#define spin_lock_mutex(lock, flags) \
+ do { spin_lock(lock); (void)(flags); } while (0)
+#define spin_unlock_mutex(lock, flags) \
+ do { spin_unlock(lock); (void)(flags); } while (0)
+#define mutex_remove_waiter(lock, waiter, ti) \
+ __list_del((waiter)->list.prev, (waiter)->list.next)
+
+#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
+#define debug_mutex_clear_owner(lock) do { } while (0)
+#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+#define debug_mutex_free_waiter(waiter) do { } while (0)
+#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+#define debug_mutex_unlock(lock) do { } while (0)
+#define debug_mutex_init(lock, name, key) do { } while (0)
+
+static inline void
+debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+}
diff --git a/kernel/notifier.c b/kernel/notifier.c
new file mode 100644
index 0000000..4282c0a
--- /dev/null
+++ b/kernel/notifier.c
@@ -0,0 +1,578 @@
+#include <linux/kdebug.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * at shutdown. This is used to stop any idling DMA operations
+ * and the like.
+ */
+BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
+/*
+ * Notifier chain core routines. The exported routines below
+ * are layered on top of these, with appropriate locking added.
+ */
+
+static int notifier_chain_register(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if (n->priority > (*nl)->priority)
+ break;
+ nl = &((*nl)->next);
+ }
+ n->next = *nl;
+ rcu_assign_pointer(*nl, n);
+ return 0;
+}
+
+static int notifier_chain_cond_register(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if ((*nl) == n)
+ return 0;
+ if (n->priority > (*nl)->priority)
+ break;
+ nl = &((*nl)->next);
+ }
+ n->next = *nl;
+ rcu_assign_pointer(*nl, n);
+ return 0;
+}
+
+static int notifier_chain_unregister(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if ((*nl) == n) {
+ rcu_assign_pointer(*nl, n->next);
+ return 0;
+ }
+ nl = &((*nl)->next);
+ }
+ return -ENOENT;
+}
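Illustrative sketch (hypothetical names): because notifier_chain_register() inserts a block in front of the first entry with a lower priority, a larger .priority value makes a callback run earlier on the chain.

#include <linux/notifier.h>

static int my_early_event(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block my_early_nb = {
	.notifier_call	= my_early_event,
	.priority	= 100,	/* runs before default-priority (0) entries */
};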
+
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ * @nl: Pointer to head of the notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: Number of notifier functions to be called. Pass -1 to
+ * call all of them.
+ * @nr_calls: Records the number of notifications sent. Pass NULL if
+ * the count is not needed.
+ * @returns: notifier_call_chain returns the value returned by the
+ * last notifier function called.
+ */
+static int __kprobes notifier_call_chain(struct notifier_block **nl,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret = NOTIFY_DONE;
+ struct notifier_block *nb, *next_nb;
+
+ nb = rcu_dereference(*nl);
+
+ while (nb && nr_to_call) {
+ next_nb = rcu_dereference(nb->next);
+ ret = nb->notifier_call(nb, val, v);
+
+ if (nr_calls)
+ (*nr_calls)++;
+
+ if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+ break;
+ nb = next_nb;
+ nr_to_call--;
+ }
+ return ret;
+}
+
+/*
+ * Atomic notifier chain routines. Registration and unregistration
+ * use a spinlock, and call_chain is synchronized by RCU (no locks).
+ */
+
+/**
+ * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an atomic notifier chain.
+ *
+ * Currently always returns zero.
+ */
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an atomic notifier chain.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_unregister(&nh->head, n);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ synchronize_rcu();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+
+/**
+ * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See the comment for notifier_call_chain.
+ * @nr_calls: See the comment for notifier_call_chain.
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in an atomic context, so they must not block.
+ * This routine uses RCU to synchronize with changes to the chain.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
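Illustrative sketch with a hypothetical chain and callback: registration takes the chain's spinlock, while the call path runs the callbacks under rcu_read_lock(), so they must not sleep.

#include <linux/notifier.h>

static ATOMIC_NOTIFIER_HEAD(my_event_chain);	/* hypothetical chain */

static int my_event_cb(struct notifier_block *nb, unsigned long event,
		       void *data)
{
	/* atomic context: no sleeping locks or GFP_KERNEL allocations here */
	return NOTIFY_DONE;
}

static struct notifier_block my_event_nb = {
	.notifier_call = my_event_cb,
};

/* registration (e.g. in init code):
 *	atomic_notifier_chain_register(&my_event_chain, &my_event_nb);
 * firing the chain:
 *	atomic_notifier_call_chain(&my_event_chain, 0, NULL);
 */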
+
+/*
+ * Blocking notifier chain routines. All access to the chain is
+ * synchronized by an rwsem.
+ */
+
+/**
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call down_write().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_register(&nh->head, n);
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_register(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
+
+/**
+ * blocking_notifier_chain_cond_register - Conditionally add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain, only if not already
+ * present in the chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_cond_register(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
+
+/**
+ * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from a blocking notifier chain.
+ * Must be called from process context.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call down_write().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_unregister(&nh->head, n);
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_unregister(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+
+/**
+ * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain.
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in a process context, so they are allowed to block.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret = NOTIFY_DONE;
+
+ /*
+ * We check the head outside the lock, but if this access is
+ * racy then it does not matter what the result of the test
+ * is, we re-check the list after having taken the lock anyway:
+ */
+ if (rcu_dereference(nh->head)) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
+ nr_calls);
+ up_read(&nh->rwsem);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
+
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
+
+/*
+ * Raw notifier chain routines. There is no protection;
+ * the caller must provide it. Use at your own risk!
+ */
+
+/**
+ * raw_notifier_chain_register - Add notifier to a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a raw notifier chain.
+ * All locking must be provided by the caller.
+ *
+ * Currently always returns zero.
+ */
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return notifier_chain_register(&nh->head, n);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
+
+/**
+ * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from a raw notifier chain.
+ * All locking must be provided by the caller.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return notifier_chain_unregister(&nh->head, n);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+
+/**
+ * __raw_notifier_call_chain - Call functions in a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in an undefined context.
+ * All locking must be provided by the caller.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+}
+EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
+
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __raw_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+
+/*
+ * SRCU notifier chain routines. Registration and unregistration
+ * use a mutex, and call_chain is synchronized by SRCU (no locks).
+ */
+
+/**
+ * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an SRCU notifier chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call mutex_lock().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_register(&nh->head, n);
+
+ mutex_lock(&nh->mutex);
+ ret = notifier_chain_register(&nh->head, n);
+ mutex_unlock(&nh->mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
+
+/**
+ * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an SRCU notifier chain.
+ * Must be called from process context.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call mutex_lock().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_unregister(&nh->head, n);
+
+ mutex_lock(&nh->mutex);
+ ret = notifier_chain_unregister(&nh->head, n);
+ mutex_unlock(&nh->mutex);
+ synchronize_srcu(&nh->srcu);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
+
+/**
+ * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in a process context, so they are allowed to block.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret;
+ int idx;
+
+ idx = srcu_read_lock(&nh->srcu);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ srcu_read_unlock(&nh->srcu, idx);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
+
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
+
+/**
+ * srcu_init_notifier_head - Initialize an SRCU notifier head
+ * @nh: Pointer to head of the srcu notifier chain
+ *
+ * Unlike other sorts of notifier heads, SRCU notifier heads require
+ * dynamic initialization. Be sure to call this routine before
+ * calling any of the other SRCU notifier routines for this head.
+ *
+ * If an SRCU notifier head is deallocated, it must first be cleaned
+ * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
+ * per-cpu data (used by the SRCU mechanism) will leak.
+ */
+void srcu_init_notifier_head(struct srcu_notifier_head *nh)
+{
+ mutex_init(&nh->mutex);
+ if (init_srcu_struct(&nh->srcu) < 0)
+ BUG();
+ nh->head = NULL;
+}
+EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
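Illustrative sketch (hypothetical head and block): as the comment above notes, an SRCU head must be initialized at run time before any register or call_chain operation.

#include <linux/notifier.h>

static struct srcu_notifier_head my_srcu_chain;		/* hypothetical */

static int my_srcu_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	return NOTIFY_DONE;
}

static struct notifier_block my_srcu_nb = { .notifier_call = my_srcu_cb };

static int __init my_srcu_setup(void)
{
	srcu_init_notifier_head(&my_srcu_chain);
	/* only now is registering or calling the chain safe */
	return srcu_notifier_chain_register(&my_srcu_chain, &my_srcu_nb);
}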
+
+/**
+ * register_reboot_notifier - Register function to be called at reboot time
+ * @nb: Info about notifier function to be called
+ *
+ * Registers a function with the list of functions
+ * to be called at reboot time.
+ *
+ * Currently always returns zero, as blocking_notifier_chain_register()
+ * always returns zero.
+ */
+int register_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_reboot_notifier);
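Illustrative driver sketch (hypothetical names): the reboot chain is a blocking chain, so the callback may sleep; the code argument is one of the SYS_* reboot codes from <linux/reboot.h>.

#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_cb(struct notifier_block *nb, unsigned long code,
			void *unused)
{
	/* code is SYS_RESTART, SYS_HALT or SYS_POWER_OFF: quiesce hardware here */
	return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
	.notifier_call = my_reboot_cb,
};

/* module init:	register_reboot_notifier(&my_reboot_nb);
 * module exit:	unregister_reboot_notifier(&my_reboot_nb);
 */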
+
+/**
+ * unregister_reboot_notifier - Unregister previously registered reboot notifier
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered reboot
+ * notifier function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+static ATOMIC_NOTIFIER_HEAD(die_chain);
+
+int notrace notify_die(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+ return atomic_notifier_call_chain(&die_chain, val, &args);
+}
+
+int register_die_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_die_notifier);
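Illustrative sketch (hypothetical callback), e.g. for a kernel debugger: the die chain is atomic, so the handler must not sleep; val is an arch-specific enum die_val code, and data points to the struct die_args built in notify_die() above.

#include <linux/kernel.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>

static int my_die_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	struct die_args *args = data;

	printk(KERN_DEBUG "die event %lu, trap %d, signal %d\n",
	       val, args->trapnr, args->signr);
	return NOTIFY_DONE;	/* let other handlers and the oops proceed */
}

static struct notifier_block my_die_nb = { .notifier_call = my_die_cb };

/* register_die_notifier(&my_die_nb); */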
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
new file mode 100644
index 0000000..43c2111
--- /dev/null
+++ b/kernel/ns_cgroup.c
@@ -0,0 +1,106 @@
+/*
+ * ns_cgroup.c - namespace cgroup subsystem
+ *
+ * Copyright 2006, 2007 IBM Corp
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+
+struct ns_cgroup {
+ struct cgroup_subsys_state css;
+ spinlock_t lock;
+};
+
+struct cgroup_subsys ns_subsys;
+
+static inline struct ns_cgroup *cgroup_to_ns(
+ struct cgroup *cgroup)
+{
+ return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
+ struct ns_cgroup, css);
+}
+
+int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
+{
+ char name[PROC_NUMBUF];
+
+ snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
+ return cgroup_clone(task, &ns_subsys, name);
+}
+
+/*
+ * Rules:
+ * 1. you can only enter a cgroup which is a child of your current
+ * cgroup
+ * 2. you can only place another process into a cgroup if
+ * a. you have CAP_SYS_ADMIN
+ * b. your cgroup is an ancestor of task's destination cgroup
+ * (hence either you are in the same cgroup as task, or in an
+ * ancestor cgroup thereof)
+ */
+static int ns_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *new_cgroup, struct task_struct *task)
+{
+ struct cgroup *orig;
+
+ if (current != task) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!cgroup_is_descendant(new_cgroup))
+ return -EPERM;
+ }
+
+ if (atomic_read(&new_cgroup->count) != 0)
+ return -EPERM;
+
+ orig = task_cgroup(task, ns_subsys_id);
+ if (orig && orig != new_cgroup->parent)
+ return -EPERM;
+
+ return 0;
+}
+
+/*
+ * Rules: you can only create a cgroup if
+ * 1. you are capable(CAP_SYS_ADMIN)
+ * 2. the target cgroup is a descendant of your own cgroup
+ */
+static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ struct ns_cgroup *ns_cgroup;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (!cgroup_is_descendant(cgroup))
+ return ERR_PTR(-EPERM);
+
+ ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
+ if (!ns_cgroup)
+ return ERR_PTR(-ENOMEM);
+ spin_lock_init(&ns_cgroup->lock);
+ return &ns_cgroup->css;
+}
+
+static void ns_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ struct ns_cgroup *ns_cgroup;
+
+ ns_cgroup = cgroup_to_ns(cgroup);
+ kfree(ns_cgroup);
+}
+
+struct cgroup_subsys ns_subsys = {
+ .name = "ns",
+ .can_attach = ns_can_attach,
+ .create = ns_create,
+ .destroy = ns_destroy,
+ .subsys_id = ns_subsys_id,
+};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
new file mode 100644
index 0000000..1d3ef29
--- /dev/null
+++ b/kernel/nsproxy.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) 2006 IBM Corporation
+ *
+ * Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * Jun 2006 - namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ */
+
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/init_task.h>
+#include <linux/mnt_namespace.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <net/net_namespace.h>
+#include <linux/ipc_namespace.h>
+
+static struct kmem_cache *nsproxy_cachep;
+
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+
+/*
+ * creates a copy of "orig" with refcount 1.
+ */
+static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
+{
+ struct nsproxy *ns;
+
+ ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
+ if (ns) {
+ memcpy(ns, orig, sizeof(struct nsproxy));
+ atomic_set(&ns->count, 1);
+ }
+ return ns;
+}
+
+/*
+ * Create a new nsproxy and all of its associated namespaces.
+ * Return the newly created nsproxy. Do not attach this to the task;
+ * leave it to the caller to do proper locking and attach it to the task.
+ */
+static struct nsproxy *create_new_namespaces(unsigned long flags,
+ struct task_struct *tsk, struct fs_struct *new_fs)
+{
+ struct nsproxy *new_nsp;
+ int err;
+
+ new_nsp = clone_nsproxy(tsk->nsproxy);
+ if (!new_nsp)
+ return ERR_PTR(-ENOMEM);
+
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ if (IS_ERR(new_nsp->mnt_ns)) {
+ err = PTR_ERR(new_nsp->mnt_ns);
+ goto out_ns;
+ }
+
+ new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ if (IS_ERR(new_nsp->uts_ns)) {
+ err = PTR_ERR(new_nsp->uts_ns);
+ goto out_uts;
+ }
+
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ if (IS_ERR(new_nsp->ipc_ns)) {
+ err = PTR_ERR(new_nsp->ipc_ns);
+ goto out_ipc;
+ }
+
+ new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+ if (IS_ERR(new_nsp->pid_ns)) {
+ err = PTR_ERR(new_nsp->pid_ns);
+ goto out_pid;
+ }
+
+ new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
+ if (IS_ERR(new_nsp->user_ns)) {
+ err = PTR_ERR(new_nsp->user_ns);
+ goto out_user;
+ }
+
+ new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
+ if (IS_ERR(new_nsp->net_ns)) {
+ err = PTR_ERR(new_nsp->net_ns);
+ goto out_net;
+ }
+
+ return new_nsp;
+
+out_net:
+ if (new_nsp->user_ns)
+ put_user_ns(new_nsp->user_ns);
+out_user:
+ if (new_nsp->pid_ns)
+ put_pid_ns(new_nsp->pid_ns);
+out_pid:
+ if (new_nsp->ipc_ns)
+ put_ipc_ns(new_nsp->ipc_ns);
+out_ipc:
+ if (new_nsp->uts_ns)
+ put_uts_ns(new_nsp->uts_ns);
+out_uts:
+ if (new_nsp->mnt_ns)
+ put_mnt_ns(new_nsp->mnt_ns);
+out_ns:
+ kmem_cache_free(nsproxy_cachep, new_nsp);
+ return ERR_PTR(err);
+}
+
+/*
+ * Called from clone(). This now handles copying the nsproxy and all
+ * namespaces therein.
+ */
+int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+{
+ struct nsproxy *old_ns = tsk->nsproxy;
+ struct nsproxy *new_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_nsproxy(old_ns);
+
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ /*
+ * CLONE_NEWIPC must detach from the undolist: after switching
+ * to a new ipc namespace, the semaphore arrays from the old
+ * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
+ * means share undolist with parent, so we must forbid using
+ * it along with CLONE_NEWIPC.
+ */
+ if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+ if (IS_ERR(new_ns)) {
+ err = PTR_ERR(new_ns);
+ goto out;
+ }
+
+ tsk->nsproxy = new_ns;
+
+out:
+ put_nsproxy(old_ns);
+ return err;
+}
+
+void free_nsproxy(struct nsproxy *ns)
+{
+ if (ns->mnt_ns)
+ put_mnt_ns(ns->mnt_ns);
+ if (ns->uts_ns)
+ put_uts_ns(ns->uts_ns);
+ if (ns->ipc_ns)
+ put_ipc_ns(ns->ipc_ns);
+ if (ns->pid_ns)
+ put_pid_ns(ns->pid_ns);
+ if (ns->user_ns)
+ put_user_ns(ns->user_ns);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+/*
+ * Called from unshare. Unshare all the namespaces that are part of the nsproxy.
+ * On success, returns 0 and stores the new nsproxy in *new_nsp.
+ */
+int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+ struct nsproxy **new_nsp, struct fs_struct *new_fs)
+{
+ int err = 0;
+
+ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER | CLONE_NEWNET)))
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_nsp = create_new_namespaces(unshare_flags, current,
+ new_fs ? new_fs : current->fs);
+ if (IS_ERR(*new_nsp)) {
+ err = PTR_ERR(*new_nsp);
+ goto out;
+ }
+
+ err = ns_cgroup_clone(current, task_pid(current));
+ if (err)
+ put_nsproxy(*new_nsp);
+
+out:
+ return err;
+}
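For orientation, a minimal user-space sketch of the path that lands here: sys_unshare() hands the namespace flags to unshare_nsproxy_namespaces(), so the caller needs CAP_SYS_ADMIN (typically root).

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* requires CAP_SYS_ADMIN; gives this process a private UTS namespace */
	if (unshare(CLONE_NEWUTS) != 0) {
		perror("unshare(CLONE_NEWUTS)");
		return 1;
	}
	/* sethostname() from here on only affects this namespace */
	return 0;
}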
+
+void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
+{
+ struct nsproxy *ns;
+
+ might_sleep();
+
+ ns = p->nsproxy;
+
+ rcu_assign_pointer(p->nsproxy, new);
+
+ if (ns && atomic_dec_and_test(&ns->count)) {
+ /*
+ * wait for others to get what they want from this nsproxy.
+ *
+ * cannot release this nsproxy via the call_rcu() since
+ * put_mnt_ns() will want to sleep
+ */
+ synchronize_rcu();
+ free_nsproxy(ns);
+ }
+}
+
+void exit_task_namespaces(struct task_struct *p)
+{
+ switch_task_namespaces(p, NULL);
+}
+
+static int __init nsproxy_cache_init(void)
+{
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ return 0;
+}
+
+module_init(nsproxy_cache_init);
diff --git a/kernel/panic.c b/kernel/panic.c
new file mode 100644
index 0000000..4d50883
--- /dev/null
+++ b/kernel/panic.c
@@ -0,0 +1,376 @@
+/*
+ * linux/kernel/panic.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * This function is used through-out the kernel (including mm and fs)
+ * to indicate a major problem.
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+#include <linux/kexec.h>
+#include <linux/debug_locks.h>
+#include <linux/random.h>
+#include <linux/kallsyms.h>
+
+int panic_on_oops;
+static unsigned long tainted_mask;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
+
+int panic_timeout;
+
+ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
+
+EXPORT_SYMBOL(panic_notifier_list);
+
+static long no_blink(long time)
+{
+ return 0;
+}
+
+/* Returns how long it waited in ms */
+long (*panic_blink)(long time);
+EXPORT_SYMBOL(panic_blink);
+
+/**
+ * panic - halt the system
+ * @fmt: The text string to print
+ *
+ * Display a message, then perform cleanups.
+ *
+ * This function never returns.
+ */
+
+NORET_TYPE void panic(const char * fmt, ...)
+{
+ long i;
+ static char buf[1024];
+ va_list args;
+#if defined(CONFIG_S390)
+ unsigned long caller = (unsigned long) __builtin_return_address(0);
+#endif
+
+ /*
+ * It's possible to come here directly from a panic-assertion and not
+ * have preempt disabled. Some functions called from here want
+ * preempt to be disabled. No point enabling it later though...
+ */
+ preempt_disable();
+
+ bust_spinlocks(1);
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+ bust_spinlocks(0);
+
+ /*
+ * If we have crashed and we have a crash kernel loaded let it handle
+ * everything else.
+ * Do we want to call this before we try to display a message?
+ */
+ crash_kexec(NULL);
+
+#ifdef CONFIG_SMP
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a panic
+ * situation.
+ */
+ smp_send_stop();
+#endif
+
+ atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+
+ if (!panic_blink)
+ panic_blink = no_blink;
+
+ if (panic_timeout > 0) {
+ /*
+ * Delay timeout seconds before rebooting the machine.
+ * We can't use the "normal" timers since we just panicked..
+ */
+ printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+ for (i = 0; i < panic_timeout*1000; ) {
+ touch_nmi_watchdog();
+ i += panic_blink(i);
+ mdelay(1);
+ i++;
+ }
+ /* This will not be a clean reboot, with everything
+ * shutting down. But if there is a chance of
+ * rebooting the system it will be rebooted.
+ */
+ emergency_restart();
+ }
+#ifdef __sparc__
+ {
+ extern int stop_a_enabled;
+ /* Make sure the user can actually press Stop-A (L1-A) */
+ stop_a_enabled = 1;
+ printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
+ }
+#endif
+#if defined(CONFIG_S390)
+ disabled_wait(caller);
+#endif
+ local_irq_enable();
+ for (i = 0;;) {
+ touch_softlockup_watchdog();
+ i += panic_blink(i);
+ mdelay(1);
+ i++;
+ }
+}
+
+EXPORT_SYMBOL(panic);
+
+
+struct tnt {
+ u8 bit;
+ char true;
+ char false;
+};
+
+static const struct tnt tnts[] = {
+ { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
+ { TAINT_FORCED_MODULE, 'F', ' ' },
+ { TAINT_UNSAFE_SMP, 'S', ' ' },
+ { TAINT_FORCED_RMMOD, 'R', ' ' },
+ { TAINT_MACHINE_CHECK, 'M', ' ' },
+ { TAINT_BAD_PAGE, 'B', ' ' },
+ { TAINT_USER, 'U', ' ' },
+ { TAINT_DIE, 'D', ' ' },
+ { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
+ { TAINT_WARN, 'W', ' ' },
+ { TAINT_CRAP, 'C', ' ' },
+};
+
+/**
+ * print_tainted - return a string to represent the kernel taint state.
+ *
+ * 'P' - Proprietary module has been loaded.
+ * 'F' - Module has been forcibly loaded.
+ * 'S' - SMP with CPUs not designed for SMP.
+ * 'R' - User forced a module unload.
+ * 'M' - System experienced a machine check exception.
+ * 'B' - System has hit bad_page.
+ * 'U' - Userspace-defined naughtiness.
+ * 'D' - Kernel has oopsed before.
+ * 'A' - ACPI table overridden.
+ * 'W' - Taint on warning.
+ * 'C' - Modules from drivers/staging are loaded.
+ *
+ * The string is overwritten by the next call to print_tainted().
+ */
+const char *print_tainted(void)
+{
+ static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
+
+ if (tainted_mask) {
+ char *s;
+ int i;
+
+ s = buf + sprintf(buf, "Tainted: ");
+ for (i = 0; i < ARRAY_SIZE(tnts); i++) {
+ const struct tnt *t = &tnts[i];
+ *s++ = test_bit(t->bit, &tainted_mask) ?
+ t->true : t->false;
+ }
+ *s = 0;
+ } else
+ snprintf(buf, sizeof(buf), "Not tainted");
+ return(buf);
+}
+
+int test_taint(unsigned flag)
+{
+ return test_bit(flag, &tainted_mask);
+}
+EXPORT_SYMBOL(test_taint);
+
+unsigned long get_taint(void)
+{
+ return tainted_mask;
+}
+
+void add_taint(unsigned flag)
+{
+ debug_locks = 0; /* can't trust the integrity of the kernel anymore */
+ set_bit(flag, &tainted_mask);
+}
+EXPORT_SYMBOL(add_taint);
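Illustrative sketch (hypothetical handler): a subsystem that detects a serious problem records a taint flag, and that flag then shows up as the corresponding letter in print_tainted() output on later oopses and warnings.

#include <linux/kernel.h>

static void my_machine_check_seen(void)		/* hypothetical */
{
	add_taint(TAINT_MACHINE_CHECK);		/* reported as 'M' */
	printk(KERN_WARNING "taint state now: %s\n", print_tainted());
}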
+
+static void spin_msec(int msecs)
+{
+ int i;
+
+ for (i = 0; i < msecs; i++) {
+ touch_nmi_watchdog();
+ mdelay(1);
+ }
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+ unsigned long flags;
+ static int spin_counter;
+
+ if (!pause_on_oops)
+ return;
+
+ spin_lock_irqsave(&pause_on_oops_lock, flags);
+ if (pause_on_oops_flag == 0) {
+ /* This CPU may now print the oops message */
+ pause_on_oops_flag = 1;
+ } else {
+ /* We need to stall this CPU */
+ if (!spin_counter) {
+ /* This CPU gets to do the counting */
+ spin_counter = pause_on_oops;
+ do {
+ spin_unlock(&pause_on_oops_lock);
+ spin_msec(MSEC_PER_SEC);
+ spin_lock(&pause_on_oops_lock);
+ } while (--spin_counter);
+ pause_on_oops_flag = 0;
+ } else {
+ /* This CPU waits for a different one */
+ while (spin_counter) {
+ spin_unlock(&pause_on_oops_lock);
+ spin_msec(1);
+ spin_lock(&pause_on_oops_lock);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info. This
+ * is a bit racy..
+ */
+int oops_may_print(void)
+{
+ return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything. If this is the first CPU to oops, and it's oopsing the first time
+ * then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option. We do all this
+ * to ensure that oopses don't scroll off the screen. It has the side-effect
+ * of preventing later-oopsing CPUs from mucking up the display, too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for the
+ * right duration, whereas all the other CPUs pause for twice as long: once in
+ * oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+ debug_locks_off(); /* can't trust the integrity of the kernel anymore */
+ do_oops_enter_exit();
+}
+
+/*
+ * 64-bit random ID for oopses:
+ */
+static u64 oops_id;
+
+static int init_oops_id(void)
+{
+ if (!oops_id)
+ get_random_bytes(&oops_id, sizeof(oops_id));
+
+ return 0;
+}
+late_initcall(init_oops_id);
+
+static void print_oops_end_marker(void)
+{
+ init_oops_id();
+ printk(KERN_WARNING "---[ end trace %016llx ]---\n",
+ (unsigned long long)oops_id);
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+ do_oops_enter_exit();
+ print_oops_end_marker();
+}
+
+#ifdef WANT_WARN_ON_SLOWPATH
+void warn_on_slowpath(const char *file, int line)
+{
+ char function[KSYM_SYMBOL_LEN];
+ unsigned long caller = (unsigned long) __builtin_return_address(0);
+ sprint_symbol(function, caller);
+
+ printk(KERN_WARNING "------------[ cut here ]------------\n");
+ printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
+ line, function);
+ print_modules();
+ dump_stack();
+ print_oops_end_marker();
+ add_taint(TAINT_WARN);
+}
+EXPORT_SYMBOL(warn_on_slowpath);
+
+
+void warn_slowpath(const char *file, int line, const char *fmt, ...)
+{
+ va_list args;
+ char function[KSYM_SYMBOL_LEN];
+ unsigned long caller = (unsigned long)__builtin_return_address(0);
+ sprint_symbol(function, caller);
+
+ printk(KERN_WARNING "------------[ cut here ]------------\n");
+ printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
+ line, function);
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ print_modules();
+ dump_stack();
+ print_oops_end_marker();
+ add_taint(TAINT_WARN);
+}
+EXPORT_SYMBOL(warn_slowpath);
+#endif
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+void __stack_chk_fail(void)
+{
+ panic("stack-protector: Kernel stack is corrupted");
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+#endif
+
+core_param(panic, panic_timeout, int, 0644);
+core_param(pause_on_oops, pause_on_oops, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
new file mode 100644
index 0000000..a1e3025
--- /dev/null
+++ b/kernel/params.c
@@ -0,0 +1,761 @@
+/* Helpers for initial module or kernel cmdline parsing
+ Copyright (C) 2001 Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt, a...)
+#endif
+
+static inline char dash2underscore(char c)
+{
+ if (c == '-')
+ return '_';
+ return c;
+}
+
+static inline int parameq(const char *input, const char *paramname)
+{
+ unsigned int i;
+ for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
+ if (input[i] == '\0')
+ return 1;
+ return 0;
+}
+
+static int parse_one(char *param,
+ char *val,
+ struct kernel_param *params,
+ unsigned num_params,
+ int (*handle_unknown)(char *param, char *val))
+{
+ unsigned int i;
+
+ /* Find parameter */
+ for (i = 0; i < num_params; i++) {
+ if (parameq(param, params[i].name)) {
+ DEBUGP("They are equal! Calling %p\n",
+ params[i].set);
+ return params[i].set(val, &params[i]);
+ }
+ }
+
+ if (handle_unknown) {
+ DEBUGP("Unknown argument: calling %p\n", handle_unknown);
+ return handle_unknown(param, val);
+ }
+
+ DEBUGP("Unknown argument `%s'\n", param);
+ return -ENOENT;
+}
+
+/* You can use " around spaces, but can't escape ". */
+/* Hyphens and underscores equivalent in parameter names. */
+static char *next_arg(char *args, char **param, char **val)
+{
+ unsigned int i, equals = 0;
+ int in_quote = 0, quoted = 0;
+ char *next;
+
+ if (*args == '"') {
+ args++;
+ in_quote = 1;
+ quoted = 1;
+ }
+
+ for (i = 0; args[i]; i++) {
+ if (args[i] == ' ' && !in_quote)
+ break;
+ if (equals == 0) {
+ if (args[i] == '=')
+ equals = i;
+ }
+ if (args[i] == '"')
+ in_quote = !in_quote;
+ }
+
+ *param = args;
+ if (!equals)
+ *val = NULL;
+ else {
+ args[equals] = '\0';
+ *val = args + equals + 1;
+
+ /* Don't include quotes in value. */
+ if (**val == '"') {
+ (*val)++;
+ if (args[i-1] == '"')
+ args[i-1] = '\0';
+ }
+ if (quoted && args[i-1] == '"')
+ args[i-1] = '\0';
+ }
+
+ if (args[i]) {
+ args[i] = '\0';
+ next = args + i + 1;
+ } else
+ next = args + i;
+
+ /* Chew up trailing spaces. */
+ while (*next == ' ')
+ next++;
+ return next;
+}
+
+/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
+int parse_args(const char *name,
+ char *args,
+ struct kernel_param *params,
+ unsigned num,
+ int (*unknown)(char *param, char *val))
+{
+ char *param, *val;
+
+ DEBUGP("Parsing ARGS: %s\n", args);
+
+ /* Chew leading spaces */
+ while (*args == ' ')
+ args++;
+
+ while (*args) {
+ int ret;
+ int irq_was_disabled;
+
+ args = next_arg(args, &param, &val);
+ irq_was_disabled = irqs_disabled();
+ ret = parse_one(param, val, params, num, unknown);
+ if (irq_was_disabled && !irqs_disabled()) {
+ printk(KERN_WARNING "parse_args(): option '%s' enabled "
+ "irq's!\n", param);
+ }
+ switch (ret) {
+ case -ENOENT:
+ printk(KERN_ERR "%s: Unknown parameter `%s'\n",
+ name, param);
+ return ret;
+ case -ENOSPC:
+ printk(KERN_ERR
+ "%s: `%s' too large for parameter `%s'\n",
+ name, val ?: "", param);
+ return ret;
+ case 0:
+ break;
+ default:
+ printk(KERN_ERR
+ "%s: `%s' invalid for parameter `%s'\n",
+ name, val ?: "", param);
+ return ret;
+ }
+ }
+
+ /* All parsed OK. */
+ return 0;
+}
+
+/* Lazy bastard, eh? */
+#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
+ int param_set_##name(const char *val, struct kernel_param *kp) \
+ { \
+ tmptype l; \
+ int ret; \
+ \
+ if (!val) return -EINVAL; \
+ ret = strtolfn(val, 0, &l); \
+ if (ret == -EINVAL || ((type)l != l)) \
+ return -EINVAL; \
+ *((type *)kp->arg) = l; \
+ return 0; \
+ } \
+ int param_get_##name(char *buffer, struct kernel_param *kp) \
+ { \
+ return sprintf(buffer, format, *((type *)kp->arg)); \
+ }
+
+STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
+
+int param_set_charp(const char *val, struct kernel_param *kp)
+{
+ if (!val) {
+ printk(KERN_ERR "%s: string parameter expected\n",
+ kp->name);
+ return -EINVAL;
+ }
+
+ if (strlen(val) > 1024) {
+ printk(KERN_ERR "%s: string parameter too long\n",
+ kp->name);
+ return -ENOSPC;
+ }
+
+ *(char **)kp->arg = (char *)val;
+ return 0;
+}
+
+int param_get_charp(char *buffer, struct kernel_param *kp)
+{
+ return sprintf(buffer, "%s", *((char **)kp->arg));
+}
+
+int param_set_bool(const char *val, struct kernel_param *kp)
+{
+ /* No equals means "set"... */
+ if (!val) val = "1";
+
+ /* One of =[yYnN01] */
+ switch (val[0]) {
+ case 'y': case 'Y': case '1':
+ *(int *)kp->arg = 1;
+ return 0;
+ case 'n': case 'N': case '0':
+ *(int *)kp->arg = 0;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+int param_get_bool(char *buffer, struct kernel_param *kp)
+{
+ /* Y and N chosen as being relatively non-coder friendly */
+ return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+}
+
+int param_set_invbool(const char *val, struct kernel_param *kp)
+{
+ int boolval, ret;
+ struct kernel_param dummy;
+
+ dummy.arg = &boolval;
+ ret = param_set_bool(val, &dummy);
+ if (ret == 0)
+ *(int *)kp->arg = !boolval;
+ return ret;
+}
+
+int param_get_invbool(char *buffer, struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y');
+}
+
+/* We break the rule and mangle the string. */
+static int param_array(const char *name,
+ const char *val,
+ unsigned int min, unsigned int max,
+ void *elem, int elemsize,
+ int (*set)(const char *, struct kernel_param *kp),
+ unsigned int *num)
+{
+ int ret;
+ struct kernel_param kp;
+ char save;
+
+ /* Get the name right for errors. */
+ kp.name = name;
+ kp.arg = elem;
+
+ /* No equals sign? */
+ if (!val) {
+ printk(KERN_ERR "%s: expects arguments\n", name);
+ return -EINVAL;
+ }
+
+ *num = 0;
+ /* We expect a comma-separated list of values. */
+ do {
+ int len;
+
+ if (*num == max) {
+ printk(KERN_ERR "%s: can only take %i arguments\n",
+ name, max);
+ return -EINVAL;
+ }
+ len = strcspn(val, ",");
+
+ /* nul-terminate and parse */
+ save = val[len];
+ ((char *)val)[len] = '\0';
+ ret = set(val, &kp);
+
+ if (ret != 0)
+ return ret;
+ kp.arg += elemsize;
+ val += len+1;
+ (*num)++;
+ } while (save == ',');
+
+ if (*num < min) {
+ printk(KERN_ERR "%s: needs at least %i arguments\n",
+ name, min);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int param_array_set(const char *val, struct kernel_param *kp)
+{
+ const struct kparam_array *arr = kp->arr;
+ unsigned int temp_num;
+
+ return param_array(kp->name, val, 1, arr->max, arr->elem,
+ arr->elemsize, arr->set, arr->num ?: &temp_num);
+}
+
+int param_array_get(char *buffer, struct kernel_param *kp)
+{
+ int i, off, ret;
+ const struct kparam_array *arr = kp->arr;
+ struct kernel_param p;
+
+ p = *kp;
+ for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ if (i)
+ buffer[off++] = ',';
+ p.arg = arr->elem + arr->elemsize * i;
+ ret = arr->get(buffer + off, &p);
+ if (ret < 0)
+ return ret;
+ off += ret;
+ }
+ buffer[off] = '\0';
+ return off;
+}
+
+int param_set_copystring(const char *val, struct kernel_param *kp)
+{
+ const struct kparam_string *kps = kp->str;
+
+ if (!val) {
+ printk(KERN_ERR "%s: missing param set value\n", kp->name);
+ return -EINVAL;
+ }
+ if (strlen(val)+1 > kps->maxlen) {
+ printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+ kp->name, kps->maxlen-1);
+ return -ENOSPC;
+ }
+ strcpy(kps->string, val);
+ return 0;
+}
+
+int param_get_string(char *buffer, struct kernel_param *kp)
+{
+ const struct kparam_string *kps = kp->str;
+ return strlcpy(buffer, kps->string, kps->maxlen);
+}
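Illustrative module sketch (hypothetical names): module_param() binds a variable to the matching param_set_*/param_get_* helpers above, and a non-zero perm exposes it through the sysfs code below as /sys/module/<modname>/parameters/<param>.

#include <linux/module.h>
#include <linux/moduleparam.h>

static int debug;			/* handled by param_set_int/param_get_int     */
static char *label = "default";		/* handled by param_set_charp/param_get_charp */
static int ratios[4];
static unsigned int nratios;		/* filled with the number of values supplied  */

module_param(debug, int, 0644);			/* read-write in sysfs */
module_param(label, charp, 0444);		/* read-only in sysfs  */
module_param_array(ratios, int, &nratios, 0444);
MODULE_PARM_DESC(debug, "Enable verbose logging");

Loading such a module with e.g. "modprobe mymod debug=1 ratios=3,5,8" goes through parse_args() above; when the code is built in, the same parameters are spelled mymod.debug=1 on the kernel command line.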
+
+/* sysfs output in /sys/modules/XYZ/parameters/ */
+#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
+
+extern struct kernel_param __start___param[], __stop___param[];
+
+struct param_attribute
+{
+ struct module_attribute mattr;
+ struct kernel_param *param;
+};
+
+struct module_param_attrs
+{
+ unsigned int num;
+ struct attribute_group grp;
+ struct param_attribute attrs[0];
+};
+
+#ifdef CONFIG_SYSFS
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
+
+static ssize_t param_attr_show(struct module_attribute *mattr,
+ struct module *mod, char *buf)
+{
+ int count;
+ struct param_attribute *attribute = to_param_attr(mattr);
+
+ if (!attribute->param->get)
+ return -EPERM;
+
+ count = attribute->param->get(buf, attribute->param);
+ if (count > 0) {
+ strcat(buf, "\n");
+ ++count;
+ }
+ return count;
+}
+
+/* sysfs always hands a nul-terminated string in buf. We rely on that. */
+static ssize_t param_attr_store(struct module_attribute *mattr,
+ struct module *owner,
+ const char *buf, size_t len)
+{
+ int err;
+ struct param_attribute *attribute = to_param_attr(mattr);
+
+ if (!attribute->param->set)
+ return -EPERM;
+
+ err = attribute->param->set(buf, attribute->param);
+ if (!err)
+ return len;
+ return err;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+#define __modinit
+#else
+#define __modinit __init
+#endif
+
+#ifdef CONFIG_SYSFS
+/*
+ * add_sysfs_param - add a parameter to sysfs
+ * @mk: struct module_kobject
+ * @kp: the actual parameter definition to add to sysfs
+ * @name: name of parameter
+ *
+ * Enlarge mk->mp (allocating it if NULL) and append a sysfs attribute for the
+ * parameter; the group itself is registered in sysfs by the caller. Returns an
+ * error on out of memory. Always cleans up
+ * if there's an error.
+ */
+static __modinit int add_sysfs_param(struct module_kobject *mk,
+ struct kernel_param *kp,
+ const char *name)
+{
+ struct module_param_attrs *new;
+ struct attribute **attrs;
+ int err, num;
+
+ /* We don't bother calling this with invisible parameters. */
+ BUG_ON(!kp->perm);
+
+ if (!mk->mp) {
+ num = 0;
+ attrs = NULL;
+ } else {
+ num = mk->mp->num;
+ attrs = mk->mp->grp.attrs;
+ }
+
+ /* Enlarge. */
+ new = krealloc(mk->mp,
+ sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
+ GFP_KERNEL);
+ if (!new) {
+ kfree(mk->mp);
+ err = -ENOMEM;
+ goto fail;
+ }
+ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
+ if (!attrs) {
+ err = -ENOMEM;
+ goto fail_free_new;
+ }
+
+ /* Sysfs wants everything zeroed. */
+ memset(new, 0, sizeof(*new));
+ memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
+ memset(&attrs[num], 0, sizeof(attrs[num]));
+ new->grp.name = "parameters";
+ new->grp.attrs = attrs;
+
+ /* Tack new one on the end. */
+ new->attrs[num].param = kp;
+ new->attrs[num].mattr.show = param_attr_show;
+ new->attrs[num].mattr.store = param_attr_store;
+ new->attrs[num].mattr.attr.name = (char *)name;
+ new->attrs[num].mattr.attr.mode = kp->perm;
+ new->num = num+1;
+
+ /* Fix up all the pointers, since krealloc can move us */
+ for (num = 0; num < new->num; num++)
+ new->grp.attrs[num] = &new->attrs[num].mattr.attr;
+ new->grp.attrs[num] = NULL;
+
+ mk->mp = new;
+ return 0;
+
+fail_free_new:
+ kfree(new);
+fail:
+ mk->mp = NULL;
+ return err;
+}
+
+#ifdef CONFIG_MODULES
+static void free_module_param_attrs(struct module_kobject *mk)
+{
+ kfree(mk->mp->grp.attrs);
+ kfree(mk->mp);
+ mk->mp = NULL;
+}
+
+/*
+ * module_param_sysfs_setup - setup sysfs support for one module
+ * @mod: module
+ * @kparam: module parameters (array)
+ * @num_params: number of module parameters
+ *
+ * Adds sysfs entries for module parameters under
+ * /sys/module/[mod->name]/parameters/
+ */
+int module_param_sysfs_setup(struct module *mod,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int i, err;
+ bool params = false;
+
+ for (i = 0; i < num_params; i++) {
+ if (kparam[i].perm == 0)
+ continue;
+ err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
+ if (err)
+ return err;
+ params = true;
+ }
+
+ if (!params)
+ return 0;
+
+ /* Create the param group. */
+ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+ if (err)
+ free_module_param_attrs(&mod->mkobj);
+ return err;
+}
+
+/*
+ * module_param_sysfs_remove - remove sysfs support for one module
+ * @mod: module
+ *
+ * Remove sysfs entries for module parameters and the corresponding
+ * kobject.
+ */
+void module_param_sysfs_remove(struct module *mod)
+{
+ if (mod->mkobj.mp) {
+ sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+ /* We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately. */
+ free_module_param_attrs(&mod->mkobj);
+ }
+}
+#endif
+
+static void __init kernel_add_sysfs_param(const char *name,
+ struct kernel_param *kparam,
+ unsigned int name_skip)
+{
+ struct module_kobject *mk;
+ struct kobject *kobj;
+ int err;
+
+ kobj = kset_find_obj(module_kset, name);
+ if (kobj) {
+ /* We already have one. Remove params so we can add more. */
+ mk = to_module_kobject(kobj);
+ /* We need to remove it before adding parameters. */
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+ } else {
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ BUG_ON(!mk);
+
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
+ "%s", name);
+ if (err) {
+ kobject_put(&mk->kobj);
+ printk(KERN_ERR "Module '%s' failed add to sysfs, "
+ "error number %d\n", name, err);
+ printk(KERN_ERR "The system will be unstable now.\n");
+ return;
+ }
+ /* Take a reference so both paths can share the kobject_put() below. */
+ kobject_get(&mk->kobj);
+ }
+
+ /* These should not fail at boot. */
+ err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
+ BUG_ON(err);
+ err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
+ BUG_ON(err);
+ kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
+}
+
+/*
+ * param_sysfs_builtin - add entries in /sys/module/<name>/parameters for built-in modules
+ *
+ * Add module_parameters to sysfs for "modules" built into the kernel.
+ *
+ * The "module" name (KBUILD_MODNAME) is stored before a dot, the
+ * "parameter" name is stored behind a dot in kernel_param->name. So,
+ * extract the "module" name for all built-in kernel_param-eters,
+ * and for all who have the same, call kernel_add_sysfs_param.
+ */
+static void __init param_sysfs_builtin(void)
+{
+ struct kernel_param *kp;
+ unsigned int name_len;
+ char modname[MODULE_NAME_LEN];
+
+ for (kp = __start___param; kp < __stop___param; kp++) {
+ char *dot;
+
+ if (kp->perm == 0)
+ continue;
+
+ dot = strchr(kp->name, '.');
+ if (!dot) {
+ /* This happens for core_param() */
+ strcpy(modname, "kernel");
+ name_len = 0;
+ } else {
+ name_len = dot - kp->name + 1;
+ strlcpy(modname, kp->name, name_len);
+ }
+ kernel_add_sysfs_param(modname, kp, name_len);
+ }
+}
+
+
+/* module-related sysfs stuff */
+
+static ssize_t module_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct module_attribute *attribute;
+ struct module_kobject *mk;
+ int ret;
+
+ attribute = to_module_attr(attr);
+ mk = to_module_kobject(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ ret = attribute->show(attribute, mk->mod, buf);
+
+ return ret;
+}
+
+static ssize_t module_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct module_attribute *attribute;
+ struct module_kobject *mk;
+ int ret;
+
+ attribute = to_module_attr(attr);
+ mk = to_module_kobject(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ ret = attribute->store(attribute, mk->mod, buf, len);
+
+ return ret;
+}
+
+static struct sysfs_ops module_sysfs_ops = {
+ .show = module_attr_show,
+ .store = module_attr_store,
+};
+
+static int uevent_filter(struct kset *kset, struct kobject *kobj)
+{
+ struct kobj_type *ktype = get_ktype(kobj);
+
+ if (ktype == &module_ktype)
+ return 1;
+ return 0;
+}
+
+static struct kset_uevent_ops module_uevent_ops = {
+ .filter = uevent_filter,
+};
+
+struct kset *module_kset;
+int module_sysfs_initialized;
+
+struct kobj_type module_ktype = {
+ .sysfs_ops = &module_sysfs_ops,
+};
+
+/*
+ * param_sysfs_init - wrapper for built-in params support
+ */
+static int __init param_sysfs_init(void)
+{
+ module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
+ if (!module_kset) {
+ printk(KERN_WARNING "%s (%d): error creating kset\n",
+ __FILE__, __LINE__);
+ return -ENOMEM;
+ }
+ module_sysfs_initialized = 1;
+
+ param_sysfs_builtin();
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+#endif /* CONFIG_SYSFS */
+
+EXPORT_SYMBOL(param_set_byte);
+EXPORT_SYMBOL(param_get_byte);
+EXPORT_SYMBOL(param_set_short);
+EXPORT_SYMBOL(param_get_short);
+EXPORT_SYMBOL(param_set_ushort);
+EXPORT_SYMBOL(param_get_ushort);
+EXPORT_SYMBOL(param_set_int);
+EXPORT_SYMBOL(param_get_int);
+EXPORT_SYMBOL(param_set_uint);
+EXPORT_SYMBOL(param_get_uint);
+EXPORT_SYMBOL(param_set_long);
+EXPORT_SYMBOL(param_get_long);
+EXPORT_SYMBOL(param_set_ulong);
+EXPORT_SYMBOL(param_get_ulong);
+EXPORT_SYMBOL(param_set_charp);
+EXPORT_SYMBOL(param_get_charp);
+EXPORT_SYMBOL(param_set_bool);
+EXPORT_SYMBOL(param_get_bool);
+EXPORT_SYMBOL(param_set_invbool);
+EXPORT_SYMBOL(param_get_invbool);
+EXPORT_SYMBOL(param_array_set);
+EXPORT_SYMBOL(param_array_get);
+EXPORT_SYMBOL(param_set_copystring);
+EXPORT_SYMBOL(param_get_string);
diff --git a/kernel/pid.c b/kernel/pid.c
new file mode 100644
index 0000000..064e76a
--- /dev/null
+++ b/kernel/pid.c
@@ -0,0 +1,530 @@
+/*
+ * Generic pidhash and scalable, time-bounded PID allocator
+ *
+ * (C) 2002-2003 William Irwin, IBM
+ * (C) 2004 William Irwin, Oracle
+ * (C) 2002-2004 Ingo Molnar, Red Hat
+ *
+ * pid-structures are backing objects for tasks sharing a given ID to chain
+ * against. There is very little to them aside from hashing them and
+ * parking tasks using given ID's on a list.
+ *
+ * The hash is always changed with the tasklist_lock write-acquired,
+ * and the hash is only accessed with the tasklist_lock at least
+ * read-acquired, so there's no additional SMP locking needed here.
+ *
+ * We have a list of bitmap pages, whose bitmaps represent the PID space.
+ * Allocating and freeing PIDs is completely lockless. The worst-case
+ * allocation scenario, when all but one of the possible 1 million PIDs are
+ * already allocated, is a scan of 32 list entries and at most PAGE_SIZE
+ * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
+ *
+ * Pid namespaces:
+ * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
+ * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
+ * Many thanks to Oleg Nesterov for comments and help
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/pid_namespace.h>
+#include <linux/init_task.h>
+#include <linux/syscalls.h>
+
+#define pid_hashfn(nr, ns) \
+ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
+static struct hlist_head *pid_hash;
+static int pidhash_shift;
+struct pid init_struct_pid = INIT_STRUCT_PID;
+
+int pid_max = PID_MAX_DEFAULT;
+
+#define RESERVED_PIDS 300
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
+
+#define BITS_PER_PAGE (PAGE_SIZE*8)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+
+static inline int mk_pid(struct pid_namespace *pid_ns,
+ struct pidmap *map, int off)
+{
+ return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
+}
+
+#define find_next_offset(map, off) \
+ find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
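For scale, an illustrative arithmetic note (assuming 4 KiB pages and a 64-bit configuration): BITS_PER_PAGE is then 32768, so the default pid_max of 32768 (PID_MAX_DEFAULT) fits in a single bitmap page, while the 4-million-PID ceiling mentioned below (PID_MAX_LIMIT) needs 4194304 / 32768 = 128 pages, which matches the PIDMAP_ENTRIES slots in each pid_namespace.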
+
+/*
+ * PID-map pages start out as NULL; they get allocated upon
+ * first use and are never deallocated. This way a low pid_max
+ * value does not cause lots of bitmaps to be allocated, but
+ * the scheme still scales up to 4 million PIDs at runtime.
+ */
+struct pid_namespace init_pid_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .pidmap = {
+ [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
+ },
+ .last_pid = 0,
+ .level = 0,
+ .child_reaper = &init_task,
+};
+EXPORT_SYMBOL_GPL(init_pid_ns);
+
+int is_container_init(struct task_struct *tsk)
+{
+ int ret = 0;
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = task_pid(tsk);
+ if (pid != NULL && pid->numbers[pid->level].nr == 1)
+ ret = 1;
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(is_container_init);
+
+/*
+ * Note: disable interrupts while the pidmap_lock is held as an
+ * interrupt might come in and do read_lock(&tasklist_lock).
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does
+ * spin_lock(&pidmap_lock) followed by an interrupt routine that does
+ * read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no
+ * irq handlers that take it we can leave the interrupts enabled.
+ * For now it is easier to be safe than to prove it can't happen.
+ */
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+
+static void free_pidmap(struct upid *upid)
+{
+ int nr = upid->nr;
+ struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
+ int offset = nr & BITS_PER_PAGE_MASK;
+
+ clear_bit(offset, map->page);
+ atomic_inc(&map->nr_free);
+}
+
+static int alloc_pidmap(struct pid_namespace *pid_ns)
+{
+ int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ struct pidmap *map;
+
+ pid = last + 1;
+ if (pid >= pid_max)
+ pid = RESERVED_PIDS;
+ offset = pid & BITS_PER_PAGE_MASK;
+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+ max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+ for (i = 0; i <= max_scan; ++i) {
+ if (unlikely(!map->page)) {
+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ /*
+ * Free the page if someone raced with us
+ * installing it:
+ */
+ spin_lock_irq(&pidmap_lock);
+ if (map->page)
+ kfree(page);
+ else
+ map->page = page;
+ spin_unlock_irq(&pidmap_lock);
+ if (unlikely(!map->page))
+ break;
+ }
+ if (likely(atomic_read(&map->nr_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->nr_free);
+ pid_ns->last_pid = pid;
+ return pid;
+ }
+ offset = find_next_offset(map, offset);
+ pid = mk_pid(pid_ns, map, offset);
+ /*
+ * find_next_offset() found a bit, the pid from it
+ * is in-bounds, and if we fell back to the last
+ * bitmap block and the final block was the same
+ * as the starting point, pid is before last_pid.
+ */
+ } while (offset < BITS_PER_PAGE && pid < pid_max &&
+ (i != max_scan || pid < last ||
+ !((last+1) & BITS_PER_PAGE_MASK)));
+ }
+ if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &pid_ns->pidmap[0];
+ offset = RESERVED_PIDS;
+ if (unlikely(last == offset))
+ break;
+ }
+ pid = mk_pid(pid_ns, map, offset);
+ }
+ return -1;
+}
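
For illustration, a minimal user-space model of the bitmap scan performed by alloc_pidmap() and free_pidmap() above; alloc_id(), free_id() and MAP_BITS are hypothetical stand-ins, and the namespace levels, the lock-free test_and_set_bit() race handling and the multi-page wraparound of the real code are omitted.

/* Minimal user-space sketch of a pidmap-style bitmap allocator. */
#include <limits.h>
#include <stdio.h>

#define MAP_BITS 4096                       /* stand-in for BITS_PER_PAGE */
static unsigned char map[MAP_BITS / CHAR_BIT];
static int last_id;

static int alloc_id(void)
{
        int id = (last_id + 1) % MAP_BITS;
        int scanned;

        for (scanned = 0; scanned < MAP_BITS; scanned++) {
                int byte = id / CHAR_BIT, bit = id % CHAR_BIT;

                if (!(map[byte] & (1u << bit))) {   /* "test_and_set_bit" */
                        map[byte] |= 1u << bit;
                        last_id = id;
                        return id;
                }
                id = (id + 1) % MAP_BITS;           /* "find_next_offset" */
        }
        return -1;                                  /* map exhausted */
}

static void free_id(int id)                         /* O(1), like free_pidmap() */
{
        map[id / CHAR_BIT] &= ~(1u << (id % CHAR_BIT));
}

int main(void)
{
        int a = alloc_id(), b = alloc_id();

        printf("allocated %d and %d\n", a, b);          /* 1 and 2 */
        free_id(a);
        printf("after free, next is %d\n", alloc_id()); /* 3: last_id moved on */
        return 0;
}
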
+
+int next_pidmap(struct pid_namespace *pid_ns, int last)
+{
+ int offset;
+ struct pidmap *map, *end;
+
+ offset = (last + 1) & BITS_PER_PAGE_MASK;
+ map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
+ end = &pid_ns->pidmap[PIDMAP_ENTRIES];
+ for (; map < end; map++, offset = 0) {
+ if (unlikely(!map->page))
+ continue;
+ offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+ if (offset < BITS_PER_PAGE)
+ return mk_pid(pid_ns, map, offset);
+ }
+ return -1;
+}
+
+void put_pid(struct pid *pid)
+{
+ struct pid_namespace *ns;
+
+ if (!pid)
+ return;
+
+ ns = pid->numbers[pid->level].ns;
+ if ((atomic_read(&pid->count) == 1) ||
+ atomic_dec_and_test(&pid->count)) {
+ kmem_cache_free(ns->pid_cachep, pid);
+ put_pid_ns(ns);
+ }
+}
+EXPORT_SYMBOL_GPL(put_pid);
+
+static void delayed_put_pid(struct rcu_head *rhp)
+{
+ struct pid *pid = container_of(rhp, struct pid, rcu);
+ put_pid(pid);
+}
+
+void free_pid(struct pid *pid)
+{
+ /* We can be called with write_lock_irq(&tasklist_lock) held */
+ int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pidmap_lock, flags);
+ for (i = 0; i <= pid->level; i++)
+ hlist_del_rcu(&pid->numbers[i].pid_chain);
+ spin_unlock_irqrestore(&pidmap_lock, flags);
+
+ for (i = 0; i <= pid->level; i++)
+ free_pidmap(pid->numbers + i);
+
+ call_rcu(&pid->rcu, delayed_put_pid);
+}
+
+struct pid *alloc_pid(struct pid_namespace *ns)
+{
+ struct pid *pid;
+ enum pid_type type;
+ int i, nr;
+ struct pid_namespace *tmp;
+ struct upid *upid;
+
+ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
+ if (!pid)
+ goto out;
+
+ tmp = ns;
+ for (i = ns->level; i >= 0; i--) {
+ nr = alloc_pidmap(tmp);
+ if (nr < 0)
+ goto out_free;
+
+ pid->numbers[i].nr = nr;
+ pid->numbers[i].ns = tmp;
+ tmp = tmp->parent;
+ }
+
+ get_pid_ns(ns);
+ pid->level = ns->level;
+ atomic_set(&pid->count, 1);
+ for (type = 0; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_HEAD(&pid->tasks[type]);
+
+ spin_lock_irq(&pidmap_lock);
+ for (i = ns->level; i >= 0; i--) {
+ upid = &pid->numbers[i];
+ hlist_add_head_rcu(&upid->pid_chain,
+ &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ }
+ spin_unlock_irq(&pidmap_lock);
+
+out:
+ return pid;
+
+out_free:
+ while (++i <= ns->level)
+ free_pidmap(pid->numbers + i);
+
+ kmem_cache_free(ns->pid_cachep, pid);
+ pid = NULL;
+ goto out;
+}
+
+struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
+{
+ struct hlist_node *elem;
+ struct upid *pnr;
+
+ hlist_for_each_entry_rcu(pnr, elem,
+ &pid_hash[pid_hashfn(nr, ns)], pid_chain)
+ if (pnr->nr == nr && pnr->ns == ns)
+ return container_of(pnr, struct pid,
+ numbers[ns->level]);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(find_pid_ns);
+
+struct pid *find_vpid(int nr)
+{
+ return find_pid_ns(nr, current->nsproxy->pid_ns);
+}
+EXPORT_SYMBOL_GPL(find_vpid);
+
+/*
+ * attach_pid() must be called with the tasklist_lock write-held.
+ */
+void attach_pid(struct task_struct *task, enum pid_type type,
+ struct pid *pid)
+{
+ struct pid_link *link;
+
+ link = &task->pids[type];
+ link->pid = pid;
+ hlist_add_head_rcu(&link->node, &pid->tasks[type]);
+}
+
+static void __change_pid(struct task_struct *task, enum pid_type type,
+ struct pid *new)
+{
+ struct pid_link *link;
+ struct pid *pid;
+ int tmp;
+
+ link = &task->pids[type];
+ pid = link->pid;
+
+ hlist_del_rcu(&link->node);
+ link->pid = new;
+
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (!hlist_empty(&pid->tasks[tmp]))
+ return;
+
+ free_pid(pid);
+}
+
+void detach_pid(struct task_struct *task, enum pid_type type)
+{
+ __change_pid(task, type, NULL);
+}
+
+void change_pid(struct task_struct *task, enum pid_type type,
+ struct pid *pid)
+{
+ __change_pid(task, type, pid);
+ attach_pid(task, type, pid);
+}
+
+/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
+void transfer_pid(struct task_struct *old, struct task_struct *new,
+ enum pid_type type)
+{
+ new->pids[type].pid = old->pids[type].pid;
+ hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+}
+
+struct task_struct *pid_task(struct pid *pid, enum pid_type type)
+{
+ struct task_struct *result = NULL;
+ if (pid) {
+ struct hlist_node *first;
+ first = rcu_dereference(pid->tasks[type].first);
+ if (first)
+ result = hlist_entry(first, struct task_struct, pids[(type)].node);
+ }
+ return result;
+}
+EXPORT_SYMBOL(pid_task);
+
+/*
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ */
+struct task_struct *find_task_by_pid_type_ns(int type, int nr,
+ struct pid_namespace *ns)
+{
+ return pid_task(find_pid_ns(nr, ns), type);
+}
+
+EXPORT_SYMBOL(find_task_by_pid_type_ns);
+
+struct task_struct *find_task_by_vpid(pid_t vnr)
+{
+ return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
+ current->nsproxy->pid_ns);
+}
+EXPORT_SYMBOL(find_task_by_vpid);
+
+struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
+{
+ return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
+}
+EXPORT_SYMBOL(find_task_by_pid_ns);
+
+struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
+{
+ struct pid *pid;
+ rcu_read_lock();
+ pid = get_pid(task->pids[type].pid);
+ rcu_read_unlock();
+ return pid;
+}
+
+struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
+{
+ struct task_struct *result;
+ rcu_read_lock();
+ result = pid_task(pid, type);
+ if (result)
+ get_task_struct(result);
+ rcu_read_unlock();
+ return result;
+}
+
+struct pid *find_get_pid(pid_t nr)
+{
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = get_pid(find_vpid(nr));
+ rcu_read_unlock();
+
+ return pid;
+}
+EXPORT_SYMBOL_GPL(find_get_pid);
+
+pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
+{
+ struct upid *upid;
+ pid_t nr = 0;
+
+ if (pid && ns->level <= pid->level) {
+ upid = &pid->numbers[ns->level];
+ if (upid->ns == ns)
+ nr = upid->nr;
+ }
+ return nr;
+}
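
A small user-space model of the level lookup done by pid_nr_ns() above, using invented model_* types: the same pid is visible as 1234 in the initial namespace and as 1 inside its child namespace, and is not visible at all from an unrelated namespace.

/* User-space model of the upid/level lookup in pid_nr_ns(). */
#include <stdio.h>

struct model_ns { int level; };
struct model_upid { int nr; struct model_ns *ns; };
struct model_pid {
        int level;                      /* deepest namespace this pid lives in */
        struct model_upid numbers[3];   /* one entry per level, 0 = init ns */
};

static int model_pid_nr_ns(struct model_pid *pid, struct model_ns *ns)
{
        if (pid && ns->level <= pid->level &&
            pid->numbers[ns->level].ns == ns)
                return pid->numbers[ns->level].nr;
        return 0;                       /* not visible in that namespace */
}

int main(void)
{
        struct model_ns init_ns = { .level = 0 };
        struct model_ns child_ns = { .level = 1 };
        struct model_ns other_ns = { .level = 1 };
        struct model_pid p = {
                .level = 1,
                .numbers = {
                        { .nr = 1234, .ns = &init_ns },  /* global view */
                        { .nr = 1,    .ns = &child_ns }, /* in-container view */
                },
        };

        printf("%d %d %d\n",
               model_pid_nr_ns(&p, &init_ns),   /* 1234 */
               model_pid_nr_ns(&p, &child_ns),  /* 1 */
               model_pid_nr_ns(&p, &other_ns)); /* 0 */
        return 0;
}
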
+
+pid_t pid_vnr(struct pid *pid)
+{
+ return pid_nr_ns(pid, current->nsproxy->pid_ns);
+}
+EXPORT_SYMBOL_GPL(pid_vnr);
+
+pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+ return pid_nr_ns(task_pid(tsk), ns);
+}
+EXPORT_SYMBOL(task_pid_nr_ns);
+
+pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+ return pid_nr_ns(task_tgid(tsk), ns);
+}
+EXPORT_SYMBOL(task_tgid_nr_ns);
+
+pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+ return pid_nr_ns(task_pgrp(tsk), ns);
+}
+EXPORT_SYMBOL(task_pgrp_nr_ns);
+
+pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+ return pid_nr_ns(task_session(tsk), ns);
+}
+EXPORT_SYMBOL(task_session_nr_ns);
+
+/*
+ * Used by proc to find the first pid that is greater than or equal to nr.
+ *
+ * If there is a pid at nr, this function is exactly the same as find_pid_ns.
+ */
+struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
+{
+ struct pid *pid;
+
+ do {
+ pid = find_pid_ns(nr, ns);
+ if (pid)
+ break;
+ nr = next_pidmap(ns, nr);
+ } while (nr > 0);
+
+ return pid;
+}
+
+/*
+ * The pid hash table is scaled according to the amount of memory in the
+ * machine, from a minimum of 16 slots up to 4096 slots at one gigabyte or
+ * more.
+ */
+void __init pidhash_init(void)
+{
+ int i, pidhash_size;
+ unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
+
+ pidhash_shift = max(4, fls(megabytes * 4));
+ pidhash_shift = min(12, pidhash_shift);
+ pidhash_size = 1 << pidhash_shift;
+
+ printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
+ pidhash_size, pidhash_shift,
+ pidhash_size * sizeof(struct hlist_head));
+
+ pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
+ if (!pid_hash)
+ panic("Could not alloc pidhash!\n");
+ for (i = 0; i < pidhash_size; i++)
+ INIT_HLIST_HEAD(&pid_hash[i]);
+}
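
As a quick check of the sizing rule in pidhash_init(): fls_ul() below is a portable stand-in for the kernel's fls(); with 256 MB of low memory the shift works out to 11 (2048 slots), and anything at or above roughly 1 GB saturates at the 12-bit / 4096-slot cap.

/* User-space check of the pid hash sizing rule. */
#include <stdio.h>

static int fls_ul(unsigned long x)      /* highest set bit, 1-based, like fls() */
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

int main(void)
{
        unsigned long megabytes[] = { 16, 256, 1024, 4096 };
        unsigned int i;

        for (i = 0; i < sizeof(megabytes) / sizeof(megabytes[0]); i++) {
                int shift = fls_ul(megabytes[i] * 4);

                if (shift < 4)
                        shift = 4;
                if (shift > 12)
                        shift = 12;
                /* 16 MB -> 128 slots, 256 MB -> 2048, >= 1 GB -> 4096 */
                printf("%4lu MB -> %d slots\n", megabytes[i], 1 << shift);
        }
        return 0;
}
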
+
+void __init pidmap_init(void)
+{
+ init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ /* Reserve PID 0. We never call free_pidmap(0) */
+ set_bit(0, init_pid_ns.pidmap[0].page);
+ atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+
+ init_pid_ns.pid_cachep = KMEM_CACHE(pid,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
new file mode 100644
index 0000000..fab8ea8
--- /dev/null
+++ b/kernel/pid_namespace.c
@@ -0,0 +1,192 @@
+/*
+ * Pid namespaces
+ *
+ * Authors:
+ * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
+ * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
+ * Many thanks to Oleg Nesterov for comments and help
+ *
+ */
+
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/syscalls.h>
+#include <linux/err.h>
+#include <linux/acct.h>
+
+#define BITS_PER_PAGE (PAGE_SIZE*8)
+
+struct pid_cache {
+ int nr_ids;
+ char name[16];
+ struct kmem_cache *cachep;
+ struct list_head list;
+};
+
+static LIST_HEAD(pid_caches_lh);
+static DEFINE_MUTEX(pid_caches_mutex);
+static struct kmem_cache *pid_ns_cachep;
+
+/*
+ * creates the kmem cache to allocate pids from.
+ * @nr_ids: the number of numerical ids this pid will have to carry
+ */
+
+static struct kmem_cache *create_pid_cachep(int nr_ids)
+{
+ struct pid_cache *pcache;
+ struct kmem_cache *cachep;
+
+ mutex_lock(&pid_caches_mutex);
+ list_for_each_entry(pcache, &pid_caches_lh, list)
+ if (pcache->nr_ids == nr_ids)
+ goto out;
+
+ pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
+ if (pcache == NULL)
+ goto err_alloc;
+
+ snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
+ cachep = kmem_cache_create(pcache->name,
+ sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (cachep == NULL)
+ goto err_cachep;
+
+ pcache->nr_ids = nr_ids;
+ pcache->cachep = cachep;
+ list_add(&pcache->list, &pid_caches_lh);
+out:
+ mutex_unlock(&pid_caches_mutex);
+ return pcache->cachep;
+
+err_cachep:
+ kfree(pcache);
+err_alloc:
+ mutex_unlock(&pid_caches_mutex);
+ return NULL;
+}
+
+static struct pid_namespace *create_pid_namespace(unsigned int level)
+{
+ struct pid_namespace *ns;
+ int i;
+
+ ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
+ if (ns == NULL)
+ goto out;
+
+ ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ns->pidmap[0].page)
+ goto out_free;
+
+ ns->pid_cachep = create_pid_cachep(level + 1);
+ if (ns->pid_cachep == NULL)
+ goto out_free_map;
+
+ kref_init(&ns->kref);
+ ns->level = level;
+
+ set_bit(0, ns->pidmap[0].page);
+ atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+
+ for (i = 1; i < PIDMAP_ENTRIES; i++)
+ atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+
+ return ns;
+
+out_free_map:
+ kfree(ns->pidmap[0].page);
+out_free:
+ kmem_cache_free(pid_ns_cachep, ns);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+ int i;
+
+ for (i = 0; i < PIDMAP_ENTRIES; i++)
+ kfree(ns->pidmap[i].page);
+ kmem_cache_free(pid_ns_cachep, ns);
+}
+
+struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+{
+ struct pid_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ new_ns = get_pid_ns(old_ns);
+ if (!(flags & CLONE_NEWPID))
+ goto out;
+
+ new_ns = ERR_PTR(-EINVAL);
+ if (flags & CLONE_THREAD)
+ goto out_put;
+
+ new_ns = create_pid_namespace(old_ns->level + 1);
+ if (!IS_ERR(new_ns))
+ new_ns->parent = get_pid_ns(old_ns);
+
+out_put:
+ put_pid_ns(old_ns);
+out:
+ return new_ns;
+}
+
+void free_pid_ns(struct kref *kref)
+{
+ struct pid_namespace *ns, *parent;
+
+ ns = container_of(kref, struct pid_namespace, kref);
+
+ parent = ns->parent;
+ destroy_pid_namespace(ns);
+
+ if (parent != NULL)
+ put_pid_ns(parent);
+}
+
+void zap_pid_ns_processes(struct pid_namespace *pid_ns)
+{
+ int nr;
+ int rc;
+
+ /*
+ * The last thread in the cgroup-init thread group is terminating.
+ * Find the remaining pids in the namespace, then signal and wait for
+ * them to exit.
+ *
+ * Note: This signals each thread in the namespace - even those that
+ * belong to the same thread group. To avoid this, we would have
+ * to walk the entire tasklist looking for processes in this
+ * namespace, but that could be unnecessarily expensive if the
+ * pid namespace has just a few processes. Or we would need to
+ * maintain a tasklist for each pid namespace.
+ *
+ */
+ read_lock(&tasklist_lock);
+ nr = next_pidmap(pid_ns, 1);
+ while (nr > 0) {
+ kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
+ nr = next_pidmap(pid_ns, nr);
+ }
+ read_unlock(&tasklist_lock);
+
+ do {
+ clear_thread_flag(TIF_SIGPENDING);
+ rc = sys_wait4(-1, NULL, __WALL, NULL);
+ } while (rc != -ECHILD);
+
+ acct_exit_ns(pid_ns);
+ return;
+}
+
+static __init int pid_namespaces_init(void)
+{
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ return 0;
+}
+
+__initcall(pid_namespaces_init);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
new file mode 100644
index 0000000..dfdec52
--- /dev/null
+++ b/kernel/pm_qos_params.c
@@ -0,0 +1,423 @@
+/*
+ * This module exposes the interface to kernel space for specifying
+ * QoS dependencies. It provides infrastructure for registration of:
+ *
+ * Dependents on a QoS value : register requirements
+ * Watchers of QoS value : get notified when target QoS value changes
+ *
+ * This QoS design is best effort based. Dependents register their QoS needs.
+ * Watchers register to keep track of the current QoS needs of the system.
+ *
+ * There are 3 basic classes of QoS parameter: latency, timeout, throughput,
+ * each with defined units:
+ * latency: usec
+ * timeout: usec <-- currently not used.
+ * throughput: kbs (kilo byte / sec)
+ *
+ * There are lists of pm_qos_objects, each one wrapping requirements and
+ * notifiers.
+ *
+ * User mode requirements on a QoS parameter register themselves with the
+ * subsystem by opening the device node /dev/... and writing their request to
+ * the node. As long as the process holds a file handle open to the node, the
+ * client continues to be accounted for. Upon file release the usermode
+ * requirement is removed and a new qos target is computed. This way the
+ * requirement that the application holds is cleaned up when it closes the
+ * file descriptor or exits, and the pm_qos_object gets an opportunity to
+ * clean up.
+ *
+ * Mark Gross <mgross@linux.intel.com>
+ */
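
A hypothetical user-space client of the interface described above, assuming udev has created /dev/cpu_dma_latency for the misc device registered further down in this file; pm_qos_power_write() requires the write to be exactly sizeof(s32) bytes, and the requirement is dropped again on close().

/* Hypothetical user-space client of the pm_qos misc-device interface. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int32_t latency_us = 50;        /* our latency requirement, in usec */
        int fd = open("/dev/cpu_dma_latency", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* pm_qos_power_write() expects exactly sizeof(s32) bytes. */
        if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
                perror("write");
                close(fd);
                return 1;
        }
        /*
         * The requirement stays in force for as long as the descriptor is
         * open; pm_qos_power_release() removes it when we close or exit.
         */
        sleep(10);
        close(fd);
        return 0;
}
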
+
+#include <linux/pm_qos_params.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/init.h>
+
+#include <linux/uaccess.h>
+
+/*
+ * Locking rule: all changes to the requirements lists, the notifiers lists,
+ * or the pm_qos_object list and pm_qos_objects must happen with pm_qos_lock
+ * held, taken with _irqsave. One lock to rule them all.
+ */
+struct requirement_list {
+ struct list_head list;
+ union {
+ s32 value;
+ s32 usec;
+ s32 kbps;
+ };
+ char *name;
+};
+
+static s32 max_compare(s32 v1, s32 v2);
+static s32 min_compare(s32 v1, s32 v2);
+
+struct pm_qos_object {
+ struct requirement_list requirements;
+ struct blocking_notifier_head *notifiers;
+ struct miscdevice pm_qos_power_miscdev;
+ char *name;
+ s32 default_value;
+ atomic_t target_value;
+ s32 (*comparitor)(s32, s32);
+};
+
+static struct pm_qos_object null_pm_qos;
+static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
+static struct pm_qos_object cpu_dma_pm_qos = {
+ .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)},
+ .notifiers = &cpu_dma_lat_notifier,
+ .name = "cpu_dma_latency",
+ .default_value = 2000 * USEC_PER_SEC,
+ .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
+ .comparitor = min_compare
+};
+
+static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
+static struct pm_qos_object network_lat_pm_qos = {
+ .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)},
+ .notifiers = &network_lat_notifier,
+ .name = "network_latency",
+ .default_value = 2000 * USEC_PER_SEC,
+ .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
+ .comparitor = min_compare
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
+static struct pm_qos_object network_throughput_pm_qos = {
+ .requirements =
+ {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
+ .notifiers = &network_throughput_notifier,
+ .name = "network_throughput",
+ .default_value = 0,
+ .target_value = ATOMIC_INIT(0),
+ .comparitor = max_compare
+};
+
+
+static struct pm_qos_object *pm_qos_array[] = {
+ &null_pm_qos,
+ &cpu_dma_pm_qos,
+ &network_lat_pm_qos,
+ &network_throughput_pm_qos
+};
+
+static DEFINE_SPINLOCK(pm_qos_lock);
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos);
+static int pm_qos_power_open(struct inode *inode, struct file *filp);
+static int pm_qos_power_release(struct inode *inode, struct file *filp);
+
+static const struct file_operations pm_qos_power_fops = {
+ .write = pm_qos_power_write,
+ .open = pm_qos_power_open,
+ .release = pm_qos_power_release,
+};
+
+/* static helper functions */
+static s32 max_compare(s32 v1, s32 v2)
+{
+ return max(v1, v2);
+}
+
+static s32 min_compare(s32 v1, s32 v2)
+{
+ return min(v1, v2);
+}
+
+
+static void update_target(int target)
+{
+ s32 extreme_value;
+ struct requirement_list *node;
+ unsigned long flags;
+ int call_notifier = 0;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ extreme_value = pm_qos_array[target]->default_value;
+ list_for_each_entry(node,
+ &pm_qos_array[target]->requirements.list, list) {
+ extreme_value = pm_qos_array[target]->comparitor(
+ extreme_value, node->value);
+ }
+ if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
+ call_notifier = 1;
+ atomic_set(&pm_qos_array[target]->target_value, extreme_value);
+ pr_debug("new target for qos %d is %d\n", target,
+ atomic_read(&pm_qos_array[target]->target_value));
+ }
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ if (call_notifier)
+ blocking_notifier_call_chain(pm_qos_array[target]->notifiers,
+ (unsigned long) extreme_value, NULL);
+}
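
A tiny user-space illustration of the aggregation performed in update_target(), with invented requirement values: under min_compare() the strictest (smallest) latency request wins over the very large default.

/* User-space illustration of min-style QoS target aggregation. */
#include <stdio.h>

static int min_compare(int v1, int v2) { return v1 < v2 ? v1 : v2; }

int main(void)
{
        int default_value = 2000 * 1000 * 1000; /* effectively "don't care" */
        int requirements[] = { 500, 50, 100 };  /* hypothetical latencies, usec */
        int extreme = default_value;
        unsigned int i;

        for (i = 0; i < sizeof(requirements) / sizeof(requirements[0]); i++)
                extreme = min_compare(extreme, requirements[i]);

        /* The strictest (smallest) latency request wins: 50 usec. */
        printf("new target: %d usec\n", extreme);
        return 0;
}
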
+
+static int register_pm_qos_misc(struct pm_qos_object *qos)
+{
+ qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
+ qos->pm_qos_power_miscdev.name = qos->name;
+ qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+
+ return misc_register(&qos->pm_qos_power_miscdev);
+}
+
+static int find_pm_qos_object_by_minor(int minor)
+{
+ int pm_qos_class;
+
+ for (pm_qos_class = 0;
+ pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
+ if (minor ==
+ pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
+ return pm_qos_class;
+ }
+ return -1;
+}
+
+/**
+ * pm_qos_requirement - returns current system wide qos expectation
+ * @pm_qos_class: identification of which qos value is requested
+ *
+ * This function returns the current target value in an atomic manner.
+ */
+int pm_qos_requirement(int pm_qos_class)
+{
+ return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
+}
+EXPORT_SYMBOL_GPL(pm_qos_requirement);
+
+/**
+ * pm_qos_add_requirement - inserts new qos request into the list
+ * @pm_qos_class: identifies which list of qos requests to use
+ * @name: identifies the request
+ * @value: defines the qos request
+ *
+ * This function inserts a new entry in the pm_qos_class list of requested qos
+ * performance characteristics. It recomputes the aggregate QoS expectations
+ * for the pm_qos_class of parameters.
+ */
+int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
+{
+ struct requirement_list *dep;
+ unsigned long flags;
+
+ dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL);
+ if (dep) {
+ if (value == PM_QOS_DEFAULT_VALUE)
+ dep->value = pm_qos_array[pm_qos_class]->default_value;
+ else
+ dep->value = value;
+ dep->name = kstrdup(name, GFP_KERNEL);
+ if (!dep->name)
+ goto cleanup;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ list_add(&dep->list,
+ &pm_qos_array[pm_qos_class]->requirements.list);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ update_target(pm_qos_class);
+
+ return 0;
+ }
+
+cleanup:
+ kfree(dep);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
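
A hedged sketch of an in-kernel caller (not part of this file): a hypothetical module that pins CPU DMA latency for its lifetime. PM_QOS_CPU_DMA_LATENCY is assumed to be the class constant exported by <linux/pm_qos_params.h>, and "my_driver" is an invented requirement name.

/* Hypothetical module using the pm_qos requirement API above. */
#include <linux/module.h>
#include <linux/pm_qos_params.h>

#define MY_DRV_QOS_NAME "my_driver"     /* invented requirement name */

static int __init my_drv_init(void)
{
        /* Ask for no more than 100 usec of CPU DMA latency. */
        return pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY,
                                      MY_DRV_QOS_NAME, 100);
}

static void __exit my_drv_exit(void)
{
        pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, MY_DRV_QOS_NAME);
}

module_init(my_drv_init);
module_exit(my_drv_exit);
MODULE_LICENSE("GPL");
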
+
+/**
+ * pm_qos_update_requirement - modifies an existing qos request
+ * @pm_qos_class: identifies which list of qos requests to use
+ * @name: identifies the request
+ * @new_value: defines the qos request
+ *
+ * Updates an existing qos requirement for the pm_qos_class of parameters along
+ * with updating the target pm_qos_class value.
+ *
+ * If the named request isn't in the list then no change is made.
+ */
+int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
+{
+ unsigned long flags;
+ struct requirement_list *node;
+ int pending_update = 0;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ list_for_each_entry(node,
+ &pm_qos_array[pm_qos_class]->requirements.list, list) {
+ if (strcmp(node->name, name) == 0) {
+ if (new_value == PM_QOS_DEFAULT_VALUE)
+ node->value =
+ pm_qos_array[pm_qos_class]->default_value;
+ else
+ node->value = new_value;
+ pending_update = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ if (pending_update)
+ update_target(pm_qos_class);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
+
+/**
+ * pm_qos_remove_requirement - removes an existing qos request
+ * @pm_qos_class: identifies which list of qos requests to use
+ * @name: identifies the request
+ *
+ * Will remove the named qos request from the pm_qos_class list of parameters
+ * and recompute the current target value for the pm_qos_class.
+ */
+void pm_qos_remove_requirement(int pm_qos_class, char *name)
+{
+ unsigned long flags;
+ struct requirement_list *node;
+ int pending_update = 0;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ list_for_each_entry(node,
+ &pm_qos_array[pm_qos_class]->requirements.list, list) {
+ if (strcmp(node->name, name) == 0) {
+ kfree(node->name);
+ list_del(&node->list);
+ kfree(node);
+ pending_update = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ if (pending_update)
+ update_target(pm_qos_class);
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
+
+/**
+ * pm_qos_add_notifier - sets notification entry for changes to target value
+ * @pm_qos_class: identifies which qos target changes should be notified.
+ * @notifier: notifier block managed by caller.
+ *
+ * will register the notifier into a notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+ int retval;
+
+ retval = blocking_notifier_chain_register(
+ pm_qos_array[pm_qos_class]->notifiers, notifier);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
+
+/**
+ * pm_qos_remove_notifier - deletes notification entry from chain.
+ * @pm_qos_class: identifies which qos target changes are notified.
+ * @notifier: notifier block to be removed.
+ *
+ * will remove the notifier from the notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+ int retval;
+
+ retval = blocking_notifier_chain_unregister(
+ pm_qos_array[pm_qos_class]->notifiers, notifier);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
+
+#define PID_NAME_LEN sizeof("process_1234567890")
+static char name[PID_NAME_LEN];
+
+static int pm_qos_power_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+ long pm_qos_class;
+
+ lock_kernel();
+ pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
+ if (pm_qos_class >= 0) {
+ filp->private_data = (void *)pm_qos_class;
+ sprintf(name, "process_%d", current->pid);
+ ret = pm_qos_add_requirement(pm_qos_class, name,
+ PM_QOS_DEFAULT_VALUE);
+ if (ret >= 0) {
+ unlock_kernel();
+ return 0;
+ }
+ }
+ unlock_kernel();
+
+ return -EPERM;
+}
+
+static int pm_qos_power_release(struct inode *inode, struct file *filp)
+{
+ int pm_qos_class;
+
+ pm_qos_class = (long)filp->private_data;
+ sprintf(name, "process_%d", current->pid);
+ pm_qos_remove_requirement(pm_qos_class, name);
+
+ return 0;
+}
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value;
+ int pm_qos_class;
+
+ pm_qos_class = (long)filp->private_data;
+ if (count != sizeof(s32))
+ return -EINVAL;
+ if (copy_from_user(&value, buf, sizeof(s32)))
+ return -EFAULT;
+ sprintf(name, "process_%d", current->pid);
+ pm_qos_update_requirement(pm_qos_class, name, value);
+
+ return sizeof(s32);
+}
+
+
+static int __init pm_qos_power_init(void)
+{
+ int ret = 0;
+
+ ret = register_pm_qos_misc(&cpu_dma_pm_qos);
+ if (ret < 0) {
+ printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
+ return ret;
+ }
+ ret = register_pm_qos_misc(&network_lat_pm_qos);
+ if (ret < 0) {
+ printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
+ return ret;
+ }
+ ret = register_pm_qos_misc(&network_throughput_pm_qos);
+ if (ret < 0)
+ printk(KERN_ERR
+ "pm_qos_param: network_throughput setup failed\n");
+
+ return ret;
+}
+
+late_initcall(pm_qos_power_init);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
new file mode 100644
index 0000000..4e5288a
--- /dev/null
+++ b/kernel/posix-cpu-timers.c
@@ -0,0 +1,1672 @@
+/*
+ * Implement CPU time clocks for the POSIX clock interface.
+ */
+
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <linux/errno.h>
+#include <linux/math64.h>
+#include <asm/uaccess.h>
+#include <linux/kernel_stat.h>
+
+/*
+ * Allocate the thread_group_cputime structure appropriately and fill in the
+ * current values of the fields. Called from copy_signal() via
+ * thread_group_cputime_clone_thread() when adding a second or subsequent
+ * thread to a thread group. Assumes interrupts are enabled when called.
+ */
+int thread_group_cputime_alloc(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct task_cputime *cputime;
+
+ /*
+ * If we have multiple threads and we don't already have a
+ * per-CPU task_cputime struct (checked in the caller), allocate
+ * one and fill it in with the times accumulated so far. We may
+ * race with another thread so recheck after we pick up the sighand
+ * lock.
+ */
+ cputime = alloc_percpu(struct task_cputime);
+ if (cputime == NULL)
+ return -ENOMEM;
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (sig->cputime.totals) {
+ spin_unlock_irq(&tsk->sighand->siglock);
+ free_percpu(cputime);
+ return 0;
+ }
+ sig->cputime.totals = cputime;
+ cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
+ cputime->utime = tsk->utime;
+ cputime->stime = tsk->stime;
+ cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ return 0;
+}
+
+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ *
+ * @tsk: The task we use to identify the thread group.
+ * @times: task_cputime structure in which we return the summed fields.
+ *
+ * Walk the list of CPUs to sum the per-CPU time fields in the thread group
+ * time structure.
+ */
+void thread_group_cputime(
+ struct task_struct *tsk,
+ struct task_cputime *times)
+{
+ struct signal_struct *sig;
+ int i;
+ struct task_cputime *tot;
+
+ sig = tsk->signal;
+ if (unlikely(!sig) || !sig->cputime.totals) {
+ times->utime = tsk->utime;
+ times->stime = tsk->stime;
+ times->sum_exec_runtime = tsk->se.sum_exec_runtime;
+ return;
+ }
+ times->stime = times->utime = cputime_zero;
+ times->sum_exec_runtime = 0;
+ for_each_possible_cpu(i) {
+ tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
+ times->utime = cputime_add(times->utime, tot->utime);
+ times->stime = cputime_add(times->stime, tot->stime);
+ times->sum_exec_runtime += tot->sum_exec_runtime;
+ }
+}
+
+/*
+ * Called after updating RLIMIT_CPU to set timer expiration if necessary.
+ */
+void update_rlimit_cpu(unsigned long rlim_new)
+{
+ cputime_t cputime;
+
+ cputime = secs_to_cputime(rlim_new);
+ if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+ cputime_lt(current->signal->it_prof_expires, cputime)) {
+ spin_lock_irq(&current->sighand->siglock);
+ set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+}
+
+static int check_clock(const clockid_t which_clock)
+{
+ int error = 0;
+ struct task_struct *p;
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+
+ if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ if (pid == 0)
+ return 0;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_vpid(pid);
+ if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
+ same_thread_group(p, current) : thread_group_leader(p))) {
+ error = -EINVAL;
+ }
+ read_unlock(&tasklist_lock);
+
+ return error;
+}
+
+static inline union cpu_time_count
+timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
+{
+ union cpu_time_count ret;
+ ret.sched = 0; /* high half always zero when .cpu used */
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+ } else {
+ ret.cpu = timespec_to_cputime(tp);
+ }
+ return ret;
+}
+
+static void sample_to_timespec(const clockid_t which_clock,
+ union cpu_time_count cpu,
+ struct timespec *tp)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
+ *tp = ns_to_timespec(cpu.sched);
+ else
+ cputime_to_timespec(cpu.cpu, tp);
+}
+
+static inline int cpu_time_before(const clockid_t which_clock,
+ union cpu_time_count now,
+ union cpu_time_count then)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ return now.sched < then.sched;
+ } else {
+ return cputime_lt(now.cpu, then.cpu);
+ }
+}
+static inline void cpu_time_add(const clockid_t which_clock,
+ union cpu_time_count *acc,
+ union cpu_time_count val)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ acc->sched += val.sched;
+ } else {
+ acc->cpu = cputime_add(acc->cpu, val.cpu);
+ }
+}
+static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
+ union cpu_time_count a,
+ union cpu_time_count b)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ a.sched -= b.sched;
+ } else {
+ a.cpu = cputime_sub(a.cpu, b.cpu);
+ }
+ return a;
+}
+
+/*
+ * Divide and limit the result to res >= 1
+ *
+ * This is necessary to prevent signal delivery starvation, when the result of
+ * the division would be rounded down to 0.
+ */
+static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
+{
+ cputime_t res = cputime_div(time, div);
+
+ return max_t(cputime_t, res, 1);
+}
+
+/*
+ * Update expiry time from increment, and increase overrun count,
+ * given the current clock sample.
+ */
+static void bump_cpu_timer(struct k_itimer *timer,
+ union cpu_time_count now)
+{
+ int i;
+
+ if (timer->it.cpu.incr.sched == 0)
+ return;
+
+ if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
+ unsigned long long delta, incr;
+
+ if (now.sched < timer->it.cpu.expires.sched)
+ return;
+ incr = timer->it.cpu.incr.sched;
+ delta = now.sched + incr - timer->it.cpu.expires.sched;
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; incr < delta - incr; i++)
+ incr = incr << 1;
+ for (; i >= 0; incr >>= 1, i--) {
+ if (delta < incr)
+ continue;
+ timer->it.cpu.expires.sched += incr;
+ timer->it_overrun += 1 << i;
+ delta -= incr;
+ }
+ } else {
+ cputime_t delta, incr;
+
+ if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
+ return;
+ incr = timer->it.cpu.incr.cpu;
+ delta = cputime_sub(cputime_add(now.cpu, incr),
+ timer->it.cpu.expires.cpu);
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
+ incr = cputime_add(incr, incr);
+ for (; i >= 0; incr = cputime_halve(incr), i--) {
+ if (cputime_lt(delta, incr))
+ continue;
+ timer->it.cpu.expires.cpu =
+ cputime_add(timer->it.cpu.expires.cpu, incr);
+ timer->it_overrun += 1 << i;
+ delta = cputime_sub(delta, incr);
+ }
+ }
+}
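
A user-space walk-through of the doubling trick in the CPUCLOCK_SCHED branch of bump_cpu_timer() above, with made-up numbers: an expiry of 1000, an increment of 300 and a current time of 2500 should advance the expiry to 2800 and record 6 overruns.

/* User-space walk-through of the sched-clock branch of bump_cpu_timer(). */
#include <stdio.h>

int main(void)
{
        unsigned long long expires = 1000, incr = 300, now = 2500;
        unsigned long long delta = now + incr - expires;        /* 1800 */
        int overrun = 0, i;

        /* Scale incr up without letting incr*2 overflow, as above. */
        for (i = 0; incr < delta - incr; i++)
                incr <<= 1;                     /* 300 -> 600 -> 1200 */
        for (; i >= 0; incr >>= 1, i--) {
                if (delta < incr)
                        continue;
                expires += incr;
                overrun += 1 << i;
                delta -= incr;
        }
        /* expires ends at 2800 (> now); overrun counts 6 missed periods. */
        printf("expires=%llu overrun=%d\n", expires, overrun);
        return 0;
}
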
+
+static inline cputime_t prof_ticks(struct task_struct *p)
+{
+ return cputime_add(p->utime, p->stime);
+}
+static inline cputime_t virt_ticks(struct task_struct *p)
+{
+ return p->utime;
+}
+
+int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+ int error = check_clock(which_clock);
+ if (!error) {
+ tp->tv_sec = 0;
+ tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ /*
+ * If sched_clock is using a cycle counter, we
+ * don't have any idea of its true resolution
+ * exported, but it is much finer than 1s/HZ.
+ */
+ tp->tv_nsec = 1;
+ }
+ }
+ return error;
+}
+
+int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+{
+ /*
+ * You can never reset a CPU clock, but we check for other errors
+ * in the call before failing with EPERM.
+ */
+ int error = check_clock(which_clock);
+ if (error == 0) {
+ error = -EPERM;
+ }
+ return error;
+}
+
+
+/*
+ * Sample a per-thread clock for the given task.
+ */
+static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = prof_ticks(p);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = virt_ticks(p);
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_clock_sample_group(const clockid_t which_clock,
+ struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = cputime.utime;
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ break;
+ }
+ return 0;
+}
+
+
+int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+{
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+ int error = -EINVAL;
+ union cpu_time_count rtn;
+
+ if (pid == 0) {
+ /*
+ * Special case constant value for our own clocks.
+ * We don't have to do any lookup to find ourselves.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ /*
+ * Sampling just ourselves we can do with no locking.
+ */
+ error = cpu_clock_sample(which_clock,
+ current, &rtn);
+ } else {
+ read_lock(&tasklist_lock);
+ error = cpu_clock_sample_group(which_clock,
+ current, &rtn);
+ read_unlock(&tasklist_lock);
+ }
+ } else {
+ /*
+ * Find the given PID, and validate that the caller
+ * should be able to see it.
+ */
+ struct task_struct *p;
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (p) {
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ if (same_thread_group(p, current)) {
+ error = cpu_clock_sample(which_clock,
+ p, &rtn);
+ }
+ } else {
+ read_lock(&tasklist_lock);
+ if (thread_group_leader(p) && p->signal) {
+ error =
+ cpu_clock_sample_group(which_clock,
+ p, &rtn);
+ }
+ read_unlock(&tasklist_lock);
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (error)
+ return error;
+ sample_to_timespec(which_clock, rtn, tp);
+ return 0;
+}
+
+
+/*
+ * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
+ * This is called from sys_timer_create with the new timer already locked.
+ */
+int posix_cpu_timer_create(struct k_itimer *new_timer)
+{
+ int ret = 0;
+ const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
+ struct task_struct *p;
+
+ if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&new_timer->it.cpu.entry);
+ new_timer->it.cpu.incr.sched = 0;
+ new_timer->it.cpu.expires.sched = 0;
+
+ read_lock(&tasklist_lock);
+ if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
+ if (pid == 0) {
+ p = current;
+ } else {
+ p = find_task_by_vpid(pid);
+ if (p && !same_thread_group(p, current))
+ p = NULL;
+ }
+ } else {
+ if (pid == 0) {
+ p = current->group_leader;
+ } else {
+ p = find_task_by_vpid(pid);
+ if (p && !thread_group_leader(p))
+ p = NULL;
+ }
+ }
+ new_timer->it.cpu.task = p;
+ if (p) {
+ get_task_struct(p);
+ } else {
+ ret = -EINVAL;
+ }
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * Clean up a CPU-clock timer that is about to be destroyed.
+ * This is called from timer deletion with the timer already locked.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again. (This happens when the timer is in the middle of firing.)
+ */
+int posix_cpu_timer_del(struct k_itimer *timer)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ int ret = 0;
+
+ if (likely(p != NULL)) {
+ read_lock(&tasklist_lock);
+ if (unlikely(p->signal == NULL)) {
+ /*
+ * We raced with the reaping of the task.
+ * The deletion should have cleared us off the list.
+ */
+ BUG_ON(!list_empty(&timer->it.cpu.entry));
+ } else {
+ spin_lock(&p->sighand->siglock);
+ if (timer->it.cpu.firing)
+ ret = TIMER_RETRY;
+ else
+ list_del(&timer->it.cpu.entry);
+ spin_unlock(&p->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+
+ if (!ret)
+ put_task_struct(p);
+ }
+
+ return ret;
+}
+
+/*
+ * Clean out CPU timers still ticking when a thread exited. The task
+ * pointer is cleared, and the expiry time is replaced with the residual
+ * time for later timer_gettime calls to return.
+ * This must be called with the siglock held.
+ */
+static void cleanup_timers(struct list_head *head,
+ cputime_t utime, cputime_t stime,
+ unsigned long long sum_exec_runtime)
+{
+ struct cpu_timer_list *timer, *next;
+ cputime_t ptime = cputime_add(utime, stime);
+
+ list_for_each_entry_safe(timer, next, head, entry) {
+ list_del_init(&timer->entry);
+ if (cputime_lt(timer->expires.cpu, ptime)) {
+ timer->expires.cpu = cputime_zero;
+ } else {
+ timer->expires.cpu = cputime_sub(timer->expires.cpu,
+ ptime);
+ }
+ }
+
+ ++head;
+ list_for_each_entry_safe(timer, next, head, entry) {
+ list_del_init(&timer->entry);
+ if (cputime_lt(timer->expires.cpu, utime)) {
+ timer->expires.cpu = cputime_zero;
+ } else {
+ timer->expires.cpu = cputime_sub(timer->expires.cpu,
+ utime);
+ }
+ }
+
+ ++head;
+ list_for_each_entry_safe(timer, next, head, entry) {
+ list_del_init(&timer->entry);
+ if (timer->expires.sched < sum_exec_runtime) {
+ timer->expires.sched = 0;
+ } else {
+ timer->expires.sched -= sum_exec_runtime;
+ }
+ }
+}
+
+/*
+ * These are both called with the siglock held, when the current thread
+ * is being reaped. When the final (leader) thread in the group is reaped,
+ * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
+ */
+void posix_cpu_timers_exit(struct task_struct *tsk)
+{
+ cleanup_timers(tsk->cpu_timers,
+ tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+
+}
+void posix_cpu_timers_exit_group(struct task_struct *tsk)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(tsk, &cputime);
+ cleanup_timers(tsk->signal->cpu_timers,
+ cputime.utime, cputime.stime, cputime.sum_exec_runtime);
+}
+
+static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
+{
+ /*
+ * That's all for this thread or process.
+ * We leave our residual in expires to be reported.
+ */
+ put_task_struct(timer->it.cpu.task);
+ timer->it.cpu.task = NULL;
+ timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
+ timer->it.cpu.expires,
+ now);
+}
+
+/*
+ * Insert the timer on the appropriate list before any timers that
+ * expire later. This must be called with the tasklist_lock held
+ * for reading, and interrupts disabled.
+ */
+static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ struct list_head *head, *listpos;
+ struct cpu_timer_list *const nt = &timer->it.cpu;
+ struct cpu_timer_list *next;
+ unsigned long i;
+
+ head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
+ p->cpu_timers : p->signal->cpu_timers);
+ head += CPUCLOCK_WHICH(timer->it_clock);
+
+ BUG_ON(!irqs_disabled());
+ spin_lock(&p->sighand->siglock);
+
+ listpos = head;
+ if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
+ list_for_each_entry(next, head, entry) {
+ if (next->expires.sched > nt->expires.sched)
+ break;
+ listpos = &next->entry;
+ }
+ } else {
+ list_for_each_entry(next, head, entry) {
+ if (cputime_gt(next->expires.cpu, nt->expires.cpu))
+ break;
+ listpos = &next->entry;
+ }
+ }
+ list_add(&nt->entry, listpos);
+
+ if (listpos == head) {
+ /*
+ * We are the new earliest-expiring timer.
+ * If we are a thread timer, there can always
+ * be a process timer telling us to stop earlier.
+ */
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ switch (CPUCLOCK_WHICH(timer->it_clock)) {
+ default:
+ BUG();
+ case CPUCLOCK_PROF:
+ if (cputime_eq(p->cputime_expires.prof_exp,
+ cputime_zero) ||
+ cputime_gt(p->cputime_expires.prof_exp,
+ nt->expires.cpu))
+ p->cputime_expires.prof_exp =
+ nt->expires.cpu;
+ break;
+ case CPUCLOCK_VIRT:
+ if (cputime_eq(p->cputime_expires.virt_exp,
+ cputime_zero) ||
+ cputime_gt(p->cputime_expires.virt_exp,
+ nt->expires.cpu))
+ p->cputime_expires.virt_exp =
+ nt->expires.cpu;
+ break;
+ case CPUCLOCK_SCHED:
+ if (p->cputime_expires.sched_exp == 0 ||
+ p->cputime_expires.sched_exp >
+ nt->expires.sched)
+ p->cputime_expires.sched_exp =
+ nt->expires.sched;
+ break;
+ }
+ } else {
+ /*
+ * For a process timer, set the cached expiration time.
+ */
+ switch (CPUCLOCK_WHICH(timer->it_clock)) {
+ default:
+ BUG();
+ case CPUCLOCK_VIRT:
+ if (!cputime_eq(p->signal->it_virt_expires,
+ cputime_zero) &&
+ cputime_lt(p->signal->it_virt_expires,
+ timer->it.cpu.expires.cpu))
+ break;
+ p->signal->cputime_expires.virt_exp =
+ timer->it.cpu.expires.cpu;
+ break;
+ case CPUCLOCK_PROF:
+ if (!cputime_eq(p->signal->it_prof_expires,
+ cputime_zero) &&
+ cputime_lt(p->signal->it_prof_expires,
+ timer->it.cpu.expires.cpu))
+ break;
+ i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
+ if (i != RLIM_INFINITY &&
+ i <= cputime_to_secs(timer->it.cpu.expires.cpu))
+ break;
+ p->signal->cputime_expires.prof_exp =
+ timer->it.cpu.expires.cpu;
+ break;
+ case CPUCLOCK_SCHED:
+ p->signal->cputime_expires.sched_exp =
+ timer->it.cpu.expires.sched;
+ break;
+ }
+ }
+ }
+
+ spin_unlock(&p->sighand->siglock);
+}
+
+/*
+ * The timer is locked, fire it and arrange for its reload.
+ */
+static void cpu_timer_fire(struct k_itimer *timer)
+{
+ if (unlikely(timer->sigq == NULL)) {
+ /*
+ * This is a special case for clock_nanosleep,
+ * not a normal timer from sys_timer_create.
+ */
+ wake_up_process(timer->it_process);
+ timer->it.cpu.expires.sched = 0;
+ } else if (timer->it.cpu.incr.sched == 0) {
+ /*
+ * One-shot timer. Clear it as soon as it's fired.
+ */
+ posix_timer_event(timer, 0);
+ timer->it.cpu.expires.sched = 0;
+ } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
+ /*
+ * The signal did not get queued because the signal
+ * was ignored, so we won't get any callback to
+ * reload the timer. But we need to keep it
+ * ticking in case the signal is deliverable next time.
+ */
+ posix_cpu_timer_schedule(timer);
+ }
+}
+
+/*
+ * Guts of sys_timer_settime for CPU timers.
+ * This is called with the timer locked and interrupts disabled.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again. (This happens when the timer is in the middle of firing.)
+ */
+int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+ struct itimerspec *new, struct itimerspec *old)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ union cpu_time_count old_expires, new_expires, val;
+ int ret;
+
+ if (unlikely(p == NULL)) {
+ /*
+ * Timer refers to a dead task's clock.
+ */
+ return -ESRCH;
+ }
+
+ new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+
+ read_lock(&tasklist_lock);
+ /*
+ * We need the tasklist_lock to protect against reaping that
+ * clears p->signal. If p has just been reaped, we can no
+ * longer get any information about it at all.
+ */
+ if (unlikely(p->signal == NULL)) {
+ read_unlock(&tasklist_lock);
+ put_task_struct(p);
+ timer->it.cpu.task = NULL;
+ return -ESRCH;
+ }
+
+ /*
+ * Disarm any old timer after extracting its expiry time.
+ */
+ BUG_ON(!irqs_disabled());
+
+ ret = 0;
+ spin_lock(&p->sighand->siglock);
+ old_expires = timer->it.cpu.expires;
+ if (unlikely(timer->it.cpu.firing)) {
+ timer->it.cpu.firing = -1;
+ ret = TIMER_RETRY;
+ } else
+ list_del_init(&timer->it.cpu.entry);
+ spin_unlock(&p->sighand->siglock);
+
+ /*
+ * We need to sample the current value to convert the new
+ * value from relative to absolute, and to convert the
+ * old value from absolute to relative. To set a process
+ * timer, we need a sample to balance the thread expiry
+ * times (in arm_timer). With an absolute time, we must
+ * check if it's already passed. In short, we need a sample.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &val);
+ } else {
+ cpu_clock_sample_group(timer->it_clock, p, &val);
+ }
+
+ if (old) {
+ if (old_expires.sched == 0) {
+ old->it_value.tv_sec = 0;
+ old->it_value.tv_nsec = 0;
+ } else {
+ /*
+ * Update the timer in case it has
+ * overrun already. If it has,
+ * we'll report it as having overrun
+ * and with the next reloaded timer
+ * already ticking, though we are
+ * swallowing that pending
+ * notification here to install the
+ * new setting.
+ */
+ bump_cpu_timer(timer, val);
+ if (cpu_time_before(timer->it_clock, val,
+ timer->it.cpu.expires)) {
+ old_expires = cpu_time_sub(
+ timer->it_clock,
+ timer->it.cpu.expires, val);
+ sample_to_timespec(timer->it_clock,
+ old_expires,
+ &old->it_value);
+ } else {
+ old->it_value.tv_nsec = 1;
+ old->it_value.tv_sec = 0;
+ }
+ }
+ }
+
+ if (unlikely(ret)) {
+ /*
+ * We are colliding with the timer actually firing.
+ * Punt after filling in the timer's old value, and
+ * disable this firing since we are already reporting
+ * it as an overrun (thanks to bump_cpu_timer above).
+ */
+ read_unlock(&tasklist_lock);
+ goto out;
+ }
+
+ if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
+ cpu_time_add(timer->it_clock, &new_expires, val);
+ }
+
+ /*
+ * Install the new expiry time (or zero).
+ * For a timer with no notification action, we don't actually
+ * arm the timer (we'll just fake it for timer_gettime).
+ */
+ timer->it.cpu.expires = new_expires;
+ if (new_expires.sched != 0 &&
+ (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
+ cpu_time_before(timer->it_clock, val, new_expires)) {
+ arm_timer(timer, val);
+ }
+
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Install the new reload setting, and
+ * set up the signal and overrun bookkeeping.
+ */
+ timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
+ &new->it_interval);
+
+ /*
+ * This acts as a modification timestamp for the timer,
+ * so any automatic reload attempt will punt on seeing
+ * that we have reset the timer manually.
+ */
+ timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
+ ~REQUEUE_PENDING;
+ timer->it_overrun_last = 0;
+ timer->it_overrun = -1;
+
+ if (new_expires.sched != 0 &&
+ (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
+ !cpu_time_before(timer->it_clock, val, new_expires)) {
+ /*
+ * The designated time already passed, so we notify
+ * immediately, even if the thread never runs to
+ * accumulate more time on this clock.
+ */
+ cpu_timer_fire(timer);
+ }
+
+ ret = 0;
+ out:
+ if (old) {
+ sample_to_timespec(timer->it_clock,
+ timer->it.cpu.incr, &old->it_interval);
+ }
+ return ret;
+}
+
+void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+{
+ union cpu_time_count now;
+ struct task_struct *p = timer->it.cpu.task;
+ int clear_dead;
+
+ /*
+ * Easy part: convert the reload time.
+ */
+ sample_to_timespec(timer->it_clock,
+ timer->it.cpu.incr, &itp->it_interval);
+
+ if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */
+ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+ return;
+ }
+
+ if (unlikely(p == NULL)) {
+ /*
+ * This task already died and the timer will never fire.
+ * In this case, expires is actually the dead value.
+ */
+ dead:
+ sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
+ &itp->it_value);
+ return;
+ }
+
+ /*
+ * Sample the clock to take the difference with the expiry time.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &now);
+ clear_dead = p->exit_state;
+ } else {
+ read_lock(&tasklist_lock);
+ if (unlikely(p->signal == NULL)) {
+ /*
+ * The process has been reaped.
+ * We can't even collect a sample any more.
+ * Call the timer disarmed, nothing else to do.
+ */
+ put_task_struct(p);
+ timer->it.cpu.task = NULL;
+ timer->it.cpu.expires.sched = 0;
+ read_unlock(&tasklist_lock);
+ goto dead;
+ } else {
+ cpu_clock_sample_group(timer->it_clock, p, &now);
+ clear_dead = (unlikely(p->exit_state) &&
+ thread_group_empty(p));
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+ if (timer->it.cpu.incr.sched == 0 &&
+ cpu_time_before(timer->it_clock,
+ timer->it.cpu.expires, now)) {
+ /*
+ * Do-nothing timer expired and has no reload,
+ * so it's as if it was never set.
+ */
+ timer->it.cpu.expires.sched = 0;
+ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+ return;
+ }
+ /*
+ * Account for any expirations and reloads that should
+ * have happened.
+ */
+ bump_cpu_timer(timer, now);
+ }
+
+ if (unlikely(clear_dead)) {
+ /*
+ * We've noticed that the thread is dead, but
+ * not yet reaped. Take this opportunity to
+ * drop our task ref.
+ */
+ clear_dead_task(timer, now);
+ goto dead;
+ }
+
+ if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+ sample_to_timespec(timer->it_clock,
+ cpu_time_sub(timer->it_clock,
+ timer->it.cpu.expires, now),
+ &itp->it_value);
+ } else {
+ /*
+ * The timer should have expired already, but the firing
+ * hasn't taken place yet. Say it's just about to expire.
+ */
+ itp->it_value.tv_nsec = 1;
+ itp->it_value.tv_sec = 0;
+ }
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them off
+ * the tsk->cpu_timers[N] list onto the firing list. Here we update the
+ * tsk->it_*_expires values to reflect the remaining thread CPU timers.
+ */
+static void check_thread_timers(struct task_struct *tsk,
+ struct list_head *firing)
+{
+ int maxfire;
+ struct list_head *timers = tsk->cpu_timers;
+ struct signal_struct *const sig = tsk->signal;
+
+ maxfire = 20;
+ tsk->cputime_expires.prof_exp = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_first_entry(timers,
+ struct cpu_timer_list,
+ entry);
+ if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+ tsk->cputime_expires.prof_exp = t->expires.cpu;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ ++timers;
+ maxfire = 20;
+ tsk->cputime_expires.virt_exp = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_first_entry(timers,
+ struct cpu_timer_list,
+ entry);
+ if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+ tsk->cputime_expires.virt_exp = t->expires.cpu;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ ++timers;
+ maxfire = 20;
+ tsk->cputime_expires.sched_exp = 0;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_first_entry(timers,
+ struct cpu_timer_list,
+ entry);
+ if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
+ tsk->cputime_expires.sched_exp = t->expires.sched;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ /*
+ * Check for the special case thread timers.
+ */
+ if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
+ unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
+ unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
+
+ if (hard != RLIM_INFINITY &&
+ tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ /*
+ * At the hard limit, we just die.
+ * No need to calculate anything else now.
+ */
+ __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ return;
+ }
+ if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
+ /*
+ * At the soft limit, send a SIGXCPU every second.
+ */
+ if (sig->rlim[RLIMIT_RTTIME].rlim_cur
+ < sig->rlim[RLIMIT_RTTIME].rlim_max) {
+ sig->rlim[RLIMIT_RTTIME].rlim_cur +=
+ USEC_PER_SEC;
+ }
+ printk(KERN_INFO
+ "RT Watchdog Timeout: %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ }
+ }
+}
+
+/*
+ * Check for any per-process CPU timers that have fired and move them
+ * off the tsk->signal->cpu_timers list onto the firing list. Per-thread
+ * timers have already been taken off.
+ */
+static void check_process_timers(struct task_struct *tsk,
+ struct list_head *firing)
+{
+ int maxfire;
+ struct signal_struct *const sig = tsk->signal;
+ cputime_t utime, ptime, virt_expires, prof_expires;
+ unsigned long long sum_sched_runtime, sched_expires;
+ struct list_head *timers = sig->cpu_timers;
+ struct task_cputime cputime;
+
+ /*
+ * Don't sample the current process CPU clocks if there are no timers.
+ */
+ if (list_empty(&timers[CPUCLOCK_PROF]) &&
+ cputime_eq(sig->it_prof_expires, cputime_zero) &&
+ sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
+ list_empty(&timers[CPUCLOCK_VIRT]) &&
+ cputime_eq(sig->it_virt_expires, cputime_zero) &&
+ list_empty(&timers[CPUCLOCK_SCHED]))
+ return;
+
+ /*
+ * Collect the current process totals.
+ */
+ thread_group_cputime(tsk, &cputime);
+ utime = cputime.utime;
+ ptime = cputime_add(utime, cputime.stime);
+ sum_sched_runtime = cputime.sum_exec_runtime;
+ maxfire = 20;
+ prof_expires = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *tl = list_first_entry(timers,
+ struct cpu_timer_list,
+ entry);
+ if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
+ prof_expires = tl->expires.cpu;
+ break;
+ }
+ tl->firing = 1;
+ list_move_tail(&tl->entry, firing);
+ }
+
+ ++timers;
+ maxfire = 20;
+ virt_expires = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *tl = list_first_entry(timers,
+ struct cpu_timer_list,
+ entry);
+ if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
+ virt_expires = tl->expires.cpu;
+ break;
+ }
+ tl->firing = 1;
+ list_move_tail(&tl->entry, firing);
+ }
+
+ ++timers;
+ maxfire = 20;
+ sched_expires = 0;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *tl = list_first_entry(timers,
+ struct cpu_timer_list,
+ entry);
+ if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
+ sched_expires = tl->expires.sched;
+ break;
+ }
+ tl->firing = 1;
+ list_move_tail(&tl->entry, firing);
+ }
+
+ /*
+ * Check for the special case process timers.
+ */
+ if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
+ if (cputime_ge(ptime, sig->it_prof_expires)) {
+ /* ITIMER_PROF fires and reloads. */
+ sig->it_prof_expires = sig->it_prof_incr;
+ if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
+ sig->it_prof_expires = cputime_add(
+ sig->it_prof_expires, ptime);
+ }
+ __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
+ }
+ if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
+ (cputime_eq(prof_expires, cputime_zero) ||
+ cputime_lt(sig->it_prof_expires, prof_expires))) {
+ prof_expires = sig->it_prof_expires;
+ }
+ }
+ if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
+ if (cputime_ge(utime, sig->it_virt_expires)) {
+ /* ITIMER_VIRTUAL fires and reloads. */
+ sig->it_virt_expires = sig->it_virt_incr;
+ if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
+ sig->it_virt_expires = cputime_add(
+ sig->it_virt_expires, utime);
+ }
+ __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
+ }
+ if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
+ (cputime_eq(virt_expires, cputime_zero) ||
+ cputime_lt(sig->it_virt_expires, virt_expires))) {
+ virt_expires = sig->it_virt_expires;
+ }
+ }
+ if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+ unsigned long psecs = cputime_to_secs(ptime);
+ cputime_t x;
+ if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) {
+ /*
+ * At the hard limit, we just die.
+ * No need to calculate anything else now.
+ */
+ __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ return;
+ }
+ if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) {
+ /*
+ * At the soft limit, send a SIGXCPU every second.
+ */
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ if (sig->rlim[RLIMIT_CPU].rlim_cur
+ < sig->rlim[RLIMIT_CPU].rlim_max) {
+ sig->rlim[RLIMIT_CPU].rlim_cur++;
+ }
+ }
+ x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+ if (cputime_eq(prof_expires, cputime_zero) ||
+ cputime_lt(x, prof_expires)) {
+ prof_expires = x;
+ }
+ }
+
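+ /*
+ * Cache the earliest expiry of each kind in sig->cputime_expires so
+ * that fastpath_timer_check() can decide cheaply whether this slow
+ * path needs to run at all.
+ */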
+ if (!cputime_eq(prof_expires, cputime_zero) &&
+ (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
+ cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
+ sig->cputime_expires.prof_exp = prof_expires;
+ if (!cputime_eq(virt_expires, cputime_zero) &&
+ (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
+ cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
+ sig->cputime_expires.virt_exp = virt_expires;
+ if (sched_expires != 0 &&
+ (sig->cputime_expires.sched_exp == 0 ||
+ sig->cputime_expires.sched_exp > sched_expires))
+ sig->cputime_expires.sched_exp = sched_expires;
+}
+
+/*
+ * This is called from the signal code (via do_schedule_next_timer)
+ * when the last timer signal was delivered and we have to reload the timer.
+ */
+void posix_cpu_timer_schedule(struct k_itimer *timer)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ union cpu_time_count now;
+
+ if (unlikely(p == NULL))
+ /*
+ * The task was cleaned up already, no future firings.
+ */
+ goto out;
+
+ /*
+ * Fetch the current sample and update the timer's expiry time.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &now);
+ bump_cpu_timer(timer, now);
+ if (unlikely(p->exit_state)) {
+ clear_dead_task(timer, now);
+ goto out;
+ }
+ read_lock(&tasklist_lock); /* arm_timer needs it. */
+ } else {
+ read_lock(&tasklist_lock);
+ if (unlikely(p->signal == NULL)) {
+ /*
+ * The process has been reaped.
+ * We can't even collect a sample any more.
+ */
+ put_task_struct(p);
+ timer->it.cpu.task = p = NULL;
+ timer->it.cpu.expires.sched = 0;
+ goto out_unlock;
+ } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
+ /*
+ * We've noticed that the thread is dead, but
+ * not yet reaped. Take this opportunity to
+ * drop our task ref.
+ */
+ clear_dead_task(timer, now);
+ goto out_unlock;
+ }
+ cpu_clock_sample_group(timer->it_clock, p, &now);
+ bump_cpu_timer(timer, now);
+ /* Leave the tasklist_lock locked for the call below. */
+ }
+
+ /*
+ * Now re-arm for the new expiry time.
+ */
+ arm_timer(timer, now);
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+
+out:
+ timer->it_overrun_last = timer->it_overrun;
+ timer->it_overrun = -1;
+ ++timer->it_requeue_pending;
+}
+
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+ if (cputime_eq(cputime->utime, cputime_zero) &&
+ cputime_eq(cputime->stime, cputime_zero) &&
+ cputime->sum_exec_runtime == 0)
+ return 1;
+ return 0;
+}
+
+/**
+ * task_cputime_expired - Compare two task_cputime entities.
+ *
+ * @sample: The task_cputime structure to be checked for expiration.
+ * @expires: Expiration times, against which @sample will be checked.
+ *
+ * Checks @sample against @expires to see if any field of @sample has expired.
+ * Returns true if any field of the former is greater than the corresponding
+ * field of the latter if the latter field is set. Otherwise returns false.
+ */
+static inline int task_cputime_expired(const struct task_cputime *sample,
+ const struct task_cputime *expires)
+{
+ if (!cputime_eq(expires->utime, cputime_zero) &&
+ cputime_ge(sample->utime, expires->utime))
+ return 1;
+ if (!cputime_eq(expires->stime, cputime_zero) &&
+ cputime_ge(cputime_add(sample->utime, sample->stime),
+ expires->stime))
+ return 1;
+ if (expires->sum_exec_runtime != 0 &&
+ sample->sum_exec_runtime >= expires->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
+/**
+ * fastpath_timer_check - POSIX CPU timers fast path.
+ *
+ * @tsk: The task (thread) being checked.
+ *
+ * Check the task and thread group timers. If both are zero (there are no
+ * timers set) return false. Otherwise snapshot the task and thread group
+ * timers and compare them with the corresponding expiration times. Return
+ * true if a timer has expired, else return false.
+ */
+static inline int fastpath_timer_check(struct task_struct *tsk)
+{
+ struct signal_struct *sig;
+
+ /* tsk == current, ensure it is safe to use ->signal/sighand */
+ if (unlikely(tsk->exit_state))
+ return 0;
+
+ if (!task_cputime_zero(&tsk->cputime_expires)) {
+ struct task_cputime task_sample = {
+ .utime = tsk->utime,
+ .stime = tsk->stime,
+ .sum_exec_runtime = tsk->se.sum_exec_runtime
+ };
+
+ if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+ return 1;
+ }
+
+ sig = tsk->signal;
+ if (!task_cputime_zero(&sig->cputime_expires)) {
+ struct task_cputime group_sample;
+
+ thread_group_cputime(tsk, &group_sample);
+ if (task_cputime_expired(&group_sample, &sig->cputime_expires))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ LIST_HEAD(firing);
+ struct k_itimer *timer, *next;
+
+ BUG_ON(!irqs_disabled());
+
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
+ return;
+
+ spin_lock(&tsk->sighand->siglock);
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+ check_process_timers(tsk, &firing);
+
+ /*
+ * We must release these locks before taking any timer's lock.
+ * There is a potential race with timer deletion here, as the
+ * siglock now protects our private firing list. We have set
+ * the firing flag in each timer, so that a deletion attempt
+ * that gets the timer lock before we do will give it up and
+ * spin until we've taken care of that timer below.
+ */
+ spin_unlock(&tsk->sighand->siglock);
+
+ /*
+ * Now that all the timers on our list have the firing flag,
+ * no one will touch their list entries but us. We'll take
+ * each timer's lock before clearing its firing flag, so no
+ * timer call will interfere.
+ */
+ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+ int firing;
+ spin_lock(&timer->it_lock);
+ list_del_init(&timer->it.cpu.entry);
+ firing = timer->it.cpu.firing;
+ timer->it.cpu.firing = 0;
+ /*
+ * The firing flag is -1 if we collided with a reset
+ * of the timer, which already reported this
+ * almost-firing as an overrun. So don't generate an event.
+ */
+ if (likely(firing >= 0)) {
+ cpu_timer_fire(timer);
+ }
+ spin_unlock(&timer->it_lock);
+ }
+}
+
+/*
+ * Set one of the process-wide special case CPU timers.
+ * The tsk->sighand->siglock must be held by the caller.
+ * The *newval argument is relative and we update it to be absolute, *oldval
+ * is absolute and we update it to be relative.
+ */
+void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
+ cputime_t *newval, cputime_t *oldval)
+{
+ union cpu_time_count now;
+ struct list_head *head;
+
+ BUG_ON(clock_idx == CPUCLOCK_SCHED);
+ cpu_clock_sample_group(clock_idx, tsk, &now);
+
+ if (oldval) {
+ if (!cputime_eq(*oldval, cputime_zero)) {
+ if (cputime_le(*oldval, now.cpu)) {
+ /* Just about to fire. */
+ *oldval = jiffies_to_cputime(1);
+ } else {
+ *oldval = cputime_sub(*oldval, now.cpu);
+ }
+ }
+
+ if (cputime_eq(*newval, cputime_zero))
+ return;
+ *newval = cputime_add(*newval, now.cpu);
+
+ /*
+ * If the RLIMIT_CPU timer will expire before the
+ * ITIMER_PROF timer, we have nothing else to do.
+ */
+ if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
+ < cputime_to_secs(*newval))
+ return;
+ }
+
+ /*
+ * Check whether there are any process timers already set to fire
+ * before this one. If so, we don't have anything more to do.
+ */
+ head = &tsk->signal->cpu_timers[clock_idx];
+ if (list_empty(head) ||
+ cputime_ge(list_first_entry(head,
+ struct cpu_timer_list, entry)->expires.cpu,
+ *newval)) {
+ switch (clock_idx) {
+ case CPUCLOCK_PROF:
+ tsk->signal->cputime_expires.prof_exp = *newval;
+ break;
+ case CPUCLOCK_VIRT:
+ tsk->signal->cputime_expires.virt_exp = *newval;
+ break;
+ }
+ }
+}
+
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct itimerspec *it)
+{
+ struct k_itimer timer;
+ int error;
+
+ /*
+ * Set up a temporary timer and then wait for it to go off.
+ */
+ memset(&timer, 0, sizeof timer);
+ spin_lock_init(&timer.it_lock);
+ timer.it_clock = which_clock;
+ timer.it_overrun = -1;
+ error = posix_cpu_timer_create(&timer);
+ timer.it_process = current;
+ if (!error) {
+ static struct itimerspec zero_it;
+
+ memset(it, 0, sizeof *it);
+ it->it_value = *rqtp;
+
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_set(&timer, flags, it, NULL);
+ if (error) {
+ spin_unlock_irq(&timer.it_lock);
+ return error;
+ }
+
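+ /*
+ * expires.sched is cleared by cpu_timer_fire() when the timer goes
+ * off, so this loop exits either on expiry or on a pending signal.
+ */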
+ while (!signal_pending(current)) {
+ if (timer.it.cpu.expires.sched == 0) {
+ /*
+ * Our timer fired and was reset.
+ */
+ spin_unlock_irq(&timer.it_lock);
+ return 0;
+ }
+
+ /*
+ * Block until cpu_timer_fire (or a signal) wakes us.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&timer.it_lock);
+ schedule();
+ spin_lock_irq(&timer.it_lock);
+ }
+
+ /*
+ * We were interrupted by a signal.
+ */
+ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+ posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ spin_unlock_irq(&timer.it_lock);
+
+ if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
+ /*
+ * It actually did fire already.
+ */
+ return 0;
+ }
+
+ error = -ERESTART_RESTARTBLOCK;
+ }
+
+ return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
+{
+ struct restart_block *restart_block =
+ &current_thread_info()->restart_block;
+ struct itimerspec it;
+ int error;
+
+ /*
+ * Diagnose required errors first.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock) &&
+ (CPUCLOCK_PID(which_clock) == 0 ||
+ CPUCLOCK_PID(which_clock) == current->pid))
+ return -EINVAL;
+
+ error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+
+ if (flags & TIMER_ABSTIME)
+ return -ERESTARTNOHAND;
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
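+ /*
+ * do_cpu_nanosleep() rewrote *rqtp to the absolute expiry time, so
+ * the restart handler can resume as an absolute sleep on the same
+ * clock.
+ */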
+ restart_block->fn = posix_cpu_nsleep_restart;
+ restart_block->arg0 = which_clock;
+ restart_block->arg1 = (unsigned long) rmtp;
+ restart_block->arg2 = rqtp->tv_sec;
+ restart_block->arg3 = rqtp->tv_nsec;
+ }
+ return error;
+}
+
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->arg0;
+ struct timespec __user *rmtp;
+ struct timespec t;
+ struct itimerspec it;
+ int error;
+
+ rmtp = (struct timespec __user *) restart_block->arg1;
+ t.tv_sec = restart_block->arg2;
+ t.tv_nsec = restart_block->arg3;
+
+ restart_block->fn = do_no_restart_syscall;
+ error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->fn = posix_cpu_nsleep_restart;
+ restart_block->arg0 = which_clock;
+ restart_block->arg1 = (unsigned long) rmtp;
+ restart_block->arg2 = t.tv_sec;
+ restart_block->arg3 = t.tv_nsec;
+ }
+ return error;
+
+}
+
+
+#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+
+static int process_cpu_clock_getres(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
+}
+static int process_cpu_clock_get(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_get(PROCESS_CLOCK, tp);
+}
+static int process_cpu_timer_create(struct k_itimer *timer)
+{
+ timer->it_clock = PROCESS_CLOCK;
+ return posix_cpu_timer_create(timer);
+}
+static int process_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp,
+ struct timespec __user *rmtp)
+{
+ return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
+}
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
+static int thread_cpu_clock_getres(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_getres(THREAD_CLOCK, tp);
+}
+static int thread_cpu_clock_get(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_get(THREAD_CLOCK, tp);
+}
+static int thread_cpu_timer_create(struct k_itimer *timer)
+{
+ timer->it_clock = THREAD_CLOCK;
+ return posix_cpu_timer_create(timer);
+}
+static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
+{
+ return -EINVAL;
+}
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
+
+static __init int init_posix_cpu_timers(void)
+{
+ struct k_clock process = {
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get = process_cpu_clock_get,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
+ };
+ struct k_clock thread = {
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get = thread_cpu_clock_get,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = thread_cpu_timer_create,
+ .nsleep = thread_cpu_nsleep,
+ .nsleep_restart = thread_cpu_nsleep_restart,
+ };
+
+ register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+ register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+
+ return 0;
+}
+__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
new file mode 100644
index 0000000..6c48fc8
--- /dev/null
+++ b/kernel/posix-timers.c
@@ -0,0 +1,1000 @@
+/*
+ * linux/kernel/posix-timers.c
+ *
+ *
+ * 2002-10-15 Posix Clocks & timers
+ * by George Anzinger george@mvista.com
+ *
+ * Copyright (C) 2002 2003 by MontaVista Software.
+ *
+ * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
+ * Copyright (C) 2004 Boris Hu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
+ */
+
+/* These are all the functions necessary to implement
+ * POSIX clocks & timers
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/mutex.h>
+
+#include <asm/uaccess.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/idr.h>
+#include <linux/posix-timers.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+
+/*
+ * Management arrays for POSIX timers. Timers are kept in slab memory.
+ * Timer ids are allocated by an external routine that keeps track of the
+ * id and the timer. The external interface is:
+ *
+ * void *idr_find(struct idr *idp, int id); to find timer_id <id>
+ * int idr_get_new(struct idr *idp, void *ptr); to get a new id and
+ * relate it to <ptr>
+ * void idr_remove(struct idr *idp, int id); to release <id>
+ * void idr_init(struct idr *idp); to initialize <idp>
+ * which we supply.
+ * idr_get_new() *may* call slab for more memory so it must not be
+ * called under a spin lock. Likewise idr_remove() may release memory
+ * (though it may be OK to do this under a lock...).
+ * idr_find() is just a memory lookup and is quite fast. A NULL return
+ * indicates that the requested id does not exist.
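+ *
+ * The idr_pre_get()/idr_get_new() retry pattern can be seen in the
+ * timer_create() syscall below; lock_timer() shows the idr_find() side.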
+ */
+
+/*
+ * Let's keep our timers in a slab cache :-)
+ */
+static struct kmem_cache *posix_timers_cache;
+static struct idr posix_timers_id;
+static DEFINE_SPINLOCK(idr_lock);
+
+/*
+ * We assume that the new SIGEV_THREAD_ID shares no bits with the other
+ * SIGEV values. Here we raise a compile-time error if this assumption fails.
+ */
+#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
+#endif
+
+
+/*
+ * The timer ID is turned into a timer address by idr_find().
+ * Verifying a valid ID consists of:
+ *
+ * a) checking that idr_find() returns other than NULL.
+ * b) checking that the timer id matches the one in the timer itself.
+ * c) that the timer owner is in the caller's thread group.
+ */
+
+/*
+ * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
+ * to implement others. This structure defines the various
+ * clocks and allows the possibility of adding others. We
+ * provide an interface to add clocks to the table and expect
+ * the "arch" code to add at least one clock that is high
+ * resolution. Here we define the standard CLOCK_REALTIME as a
+ * 1/HZ resolution clock.
+ *
+ * RESOLUTION: Clock resolution is used to round up timer and interval
+ * times, NOT to report clock times, which are reported with as
+ * much resolution as the system can muster. In some cases this
+ * resolution may depend on the underlying clock hardware and
+ * may not be quantifiable until run time, and then only if the
+ * necessary code is written. The standard says we should say
+ * something about this issue in the documentation...
+ *
+ * FUNCTIONS: The CLOCKs structure defines possible functions to handle
+ * various clock functions. For clocks that use the standard
+ * system timer code these entries should be NULL. This will
+ * allow dispatch without the overhead of indirect function
+ * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
+ * must supply functions here, even if the function just returns
+ * ENOSYS. The standard POSIX timer management code assumes the
+ * following: 1.) The k_itimer struct (sched.h) is used for the
+ * timer. 2.) The list, it_lock, it_clock, it_id and it_process
+ * fields are not modified by timer code.
+ *
+ * At this time all functions EXCEPT clock_nanosleep can be
+ * redirected by the CLOCKS structure. Clock_nanosleep is in
+ * there, but the code ignores it.
+ *
+ * Permissions: It is assumed that the clock_settime() function defined
+ * for each clock will take care of permission checks. Some
+ * clocks may be settable by any user (i.e. local process
+ * clocks), others not. Currently the only settable clock we
+ * have is CLOCK_REALTIME and its high-res counterpart, both of
+ * which we beg off on and pass to do_sys_settimeofday().
+ */
+
+static struct k_clock posix_clocks[MAX_CLOCKS];
+
+/*
+ * These ones are defined below.
+ */
+static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+ struct timespec __user *rmtp);
+static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static int common_timer_set(struct k_itimer *, int,
+ struct itimerspec *, struct itimerspec *);
+static int common_timer_del(struct k_itimer *timer);
+
+static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
+
+static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+
+static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+{
+ spin_unlock_irqrestore(&timr->it_lock, flags);
+}
+
+/*
+ * Call the k_clock hook function if non-null, or the default function.
+ */
+#define CLOCK_DISPATCH(clock, call, arglist) \
+ ((clock) < 0 ? posix_cpu_##call arglist : \
+ (posix_clocks[clock].call != NULL \
+ ? (*posix_clocks[clock].call) arglist : common_##call arglist))
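+/*
+ * For example, CLOCK_DISPATCH(CLOCK_MONOTONIC, clock_get, (clk, &ts))
+ * calls posix_clocks[CLOCK_MONOTONIC].clock_get(clk, &ts) if that hook
+ * is set, otherwise common_clock_get(clk, &ts); a negative clock id
+ * (a CPU clock) routes to posix_cpu_clock_get(clk, &ts) instead.
+ */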
+
+/*
+ * Default clock hook functions when the struct k_clock passed
+ * to register_posix_clock leaves a function pointer null.
+ *
+ * The function common_CALL is the default implementation for
+ * the function pointer CALL in struct k_clock.
+ */
+
+static inline int common_clock_getres(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ tp->tv_sec = 0;
+ tp->tv_nsec = posix_clocks[which_clock].res;
+ return 0;
+}
+
+/*
+ * Get real time for posix timers
+ */
+static int common_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ ktime_get_real_ts(tp);
+ return 0;
+}
+
+static inline int common_clock_set(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return do_sys_settimeofday(tp, NULL);
+}
+
+static int common_timer_create(struct k_itimer *new_timer)
+{
+ hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ return 0;
+}
+
+static int no_timer_create(struct k_itimer *new_timer)
+{
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Return nonzero if we know a priori this clockid_t value is bogus.
+ */
+static inline int invalid_clockid(const clockid_t which_clock)
+{
+ if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
+ return 0;
+ if ((unsigned) which_clock >= MAX_CLOCKS)
+ return 1;
+ if (posix_clocks[which_clock].clock_getres != NULL)
+ return 0;
+ if (posix_clocks[which_clock].res != 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * Get monotonic time for posix timers
+ */
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+{
+ ktime_get_ts(tp);
+ return 0;
+}
+
+/*
+ * Get raw monotonic time for posix timers
+ */
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+{
+ getrawmonotonic(tp);
+ return 0;
+}
+
+/*
+ * Initialize everything, well, just everything in Posix clocks/timers ;)
+ */
+static __init int init_posix_timers(void)
+{
+ struct k_clock clock_realtime = {
+ .clock_getres = hrtimer_get_res,
+ };
+ struct k_clock clock_monotonic = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_ktime_get_ts,
+ .clock_set = do_posix_clock_nosettime,
+ };
+ struct k_clock clock_monotonic_raw = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_monotonic_raw,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };
+
+ register_posix_clock(CLOCK_REALTIME, &clock_realtime);
+ register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof (struct k_itimer), 0, SLAB_PANIC,
+ NULL);
+ idr_init(&posix_timers_id);
+ return 0;
+}
+
+__initcall(init_posix_timers);
+
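+/*
+ * Re-arm a hrtimer-based interval timer after its signal has been
+ * delivered: push the expiry forward by whole intervals (accumulating
+ * the missed expirations in it_overrun) and restart the hrtimer.
+ * One-shot timers (interval == 0) are left alone.
+ */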
+static void schedule_next_timer(struct k_itimer *timr)
+{
+ struct hrtimer *timer = &timr->it.real.timer;
+
+ if (timr->it.real.interval.tv64 == 0)
+ return;
+
+ timr->it_overrun += (unsigned int) hrtimer_forward(timer,
+ timer->base->get_time(),
+ timr->it.real.interval);
+
+ timr->it_overrun_last = timr->it_overrun;
+ timr->it_overrun = -1;
+ ++timr->it_requeue_pending;
+ hrtimer_restart(timer);
+}
+
+/*
+ * This function is exported for use by the signal delivery code. It is
+ * called just prior to the info block being released and passes that
+ * block to us. Its function is to update the overrun entry AND to
+ * restart the timer. It should only be called if the timer is to be
+ * restarted (i.e. we have flagged this in the sys_private entry of the
+ * info block).
+ *
+ * To protect against the timer going away while the interrupt is queued,
+ * we require that the it_requeue_pending flag be set.
+ */
+void do_schedule_next_timer(struct siginfo *info)
+{
+ struct k_itimer *timr;
+ unsigned long flags;
+
+ timr = lock_timer(info->si_tid, &flags);
+
+ if (timr && timr->it_requeue_pending == info->si_sys_private) {
+ if (timr->it_clock < 0)
+ posix_cpu_timer_schedule(timr);
+ else
+ schedule_next_timer(timr);
+
+ info->si_overrun += timr->it_overrun_last;
+ }
+
+ if (timr)
+ unlock_timer(timr, flags);
+}
+
+int posix_timer_event(struct k_itimer *timr, int si_private)
+{
+ int shared, ret;
+ /*
+ * FIXME: if ->sigq is queued we can race with
+ * dequeue_signal()->do_schedule_next_timer().
+ *
+ * If dequeue_signal() sees the "right" value of
+ * si_sys_private it calls do_schedule_next_timer().
+ * We re-queue ->sigq and drop ->it_lock().
+ * do_schedule_next_timer() locks the timer
+ * and re-schedules it while ->sigq is pending.
+ * Not really bad, but not what we want.
+ */
+ timr->sigq->info.si_sys_private = si_private;
+
+ shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
+ ret = send_sigqueue(timr->sigq, timr->it_process, shared);
+ /* If we failed to send the signal the timer stops. */
+ return ret > 0;
+}
+EXPORT_SYMBOL_GPL(posix_timer_event);
+
+/*
+ * This function gets called when a POSIX.1b interval timer expires. It
+ * is used as a callback from the kernel internal timer. The
+ * run_timer_list code ALWAYS calls with interrupts on.
+ *
+ * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ */
+static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
+{
+ struct k_itimer *timr;
+ unsigned long flags;
+ int si_private = 0;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+
+ timr = container_of(timer, struct k_itimer, it.real.timer);
+ spin_lock_irqsave(&timr->it_lock, flags);
+
+ if (timr->it.real.interval.tv64 != 0)
+ si_private = ++timr->it_requeue_pending;
+
+ if (posix_timer_event(timr, si_private)) {
+ /*
+ * The signal was not sent because it is ignored (SIG_IGN), so we
+ * will not get a callback to restart the timer, yet it still
+ * needs to be restarted; do that here.
+ */
+ if (timr->it.real.interval.tv64 != 0) {
+ ktime_t now = hrtimer_cb_get_time(timer);
+
+ /*
+ * FIXME: What we really want, is to stop this
+ * timer completely and restart it in case the
+ * SIG_IGN is removed. This is a non-trivial
+ * change which involves sighand locking
+ * (sigh !), which we don't want to do late in
+ * the release cycle.
+ *
+ * For now we just let timers with an interval
+ * less than a jiffy expire every jiffy to
+ * avoid softirq starvation in case of SIG_IGN
+ * and a very small interval, which would put
+ * the timer right back on the softirq pending
+ * list. By moving now ahead of time we trick
+ * hrtimer_forward() to expire the timer
+ * later, while we still maintain the overrun
+ * accuracy, but have some inconsistency in
+ * the timer_gettime() case. This is at least
+ * better than a starved softirq. A more
+ * complex fix which solves also another related
+ * inconsistency is already in the pipeline.
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+ {
+ ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
+
+ if (timr->it.real.interval.tv64 < kj.tv64)
+ now = ktime_add(now, kj);
+ }
+#endif
+ timr->it_overrun += (unsigned int)
+ hrtimer_forward(timer, now,
+ timr->it.real.interval);
+ ret = HRTIMER_RESTART;
+ ++timr->it_requeue_pending;
+ }
+ }
+
+ unlock_timer(timr, flags);
+ return ret;
+}
+
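+/*
+ * Validate a sigevent from user space and resolve the notification
+ * target: by default the group leader; with SIGEV_THREAD_ID the named
+ * thread, which must belong to the caller's thread group and may only
+ * be combined with SIGEV_SIGNAL. Unless the notify mode is SIGEV_NONE,
+ * the signal number must be in 1..SIGRTMAX. Returns NULL on rejection.
+ */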
+static struct task_struct * good_sigevent(sigevent_t * event)
+{
+ struct task_struct *rtn = current->group_leader;
+
+ if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+ (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
+ !same_thread_group(rtn, current) ||
+ (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ return NULL;
+
+ if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
+ ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
+ return NULL;
+
+ return rtn;
+}
+
+void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
+{
+ if ((unsigned) clock_id >= MAX_CLOCKS) {
+ printk("POSIX clock register failed for clock_id %d\n",
+ clock_id);
+ return;
+ }
+
+ posix_clocks[clock_id] = *new_clock;
+}
+EXPORT_SYMBOL_GPL(register_posix_clock);
+
+static struct k_itimer * alloc_posix_timer(void)
+{
+ struct k_itimer *tmr;
+ tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ if (!tmr)
+ return tmr;
+ if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
+ kmem_cache_free(posix_timers_cache, tmr);
+ return NULL;
+ }
+ memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
+ return tmr;
+}
+
+#define IT_ID_SET 1
+#define IT_ID_NOT_SET 0
+static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+{
+ if (it_id_set) {
+ unsigned long flags;
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&posix_timers_id, tmr->it_id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+ sigqueue_free(tmr->sigq);
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
+/* Create a POSIX.1b interval timer. */
+
+SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
+ struct sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
+{
+ struct k_itimer *new_timer;
+ int error, new_timer_id;
+ struct task_struct *process;
+ sigevent_t event;
+ int it_id_set = IT_ID_NOT_SET;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+
+ new_timer = alloc_posix_timer();
+ if (unlikely(!new_timer))
+ return -EAGAIN;
+
+ spin_lock_init(&new_timer->it_lock);
+ retry:
+ if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
+ error = -EAGAIN;
+ goto out;
+ }
+ spin_lock_irq(&idr_lock);
+ error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
+ spin_unlock_irq(&idr_lock);
+ if (error) {
+ if (error == -EAGAIN)
+ goto retry;
+ /*
+ * Weird looking, but we return EAGAIN if the IDR is
+ * full (proper POSIX return value for this)
+ */
+ error = -EAGAIN;
+ goto out;
+ }
+
+ it_id_set = IT_ID_SET;
+ new_timer->it_id = (timer_t) new_timer_id;
+ new_timer->it_clock = which_clock;
+ new_timer->it_overrun = -1;
+ error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+ if (error)
+ goto out;
+
+ /*
+ * return the timer_id now. The next step is hard to
+ * back out if there is an error.
+ */
+ if (copy_to_user(created_timer_id,
+ &new_timer_id, sizeof (new_timer_id))) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (timer_event_spec) {
+ if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
+ error = -EFAULT;
+ goto out;
+ }
+ rcu_read_lock();
+ process = good_sigevent(&event);
+ if (process)
+ get_task_struct(process);
+ rcu_read_unlock();
+ if (!process) {
+ error = -EINVAL;
+ goto out;
+ }
+ } else {
+ event.sigev_notify = SIGEV_SIGNAL;
+ event.sigev_signo = SIGALRM;
+ event.sigev_value.sival_int = new_timer->it_id;
+ process = current->group_leader;
+ get_task_struct(process);
+ }
+
+ new_timer->it_sigev_notify = event.sigev_notify;
+ new_timer->sigq->info.si_signo = event.sigev_signo;
+ new_timer->sigq->info.si_value = event.sigev_value;
+ new_timer->sigq->info.si_tid = new_timer->it_id;
+ new_timer->sigq->info.si_code = SI_TIMER;
+
+ spin_lock_irq(&current->sighand->siglock);
+ new_timer->it_process = process;
+ list_add(&new_timer->list, &current->signal->posix_timers);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return 0;
+ /*
+ * In the case of the timer belonging to another task, after
+ * the task is unlocked, the timer is owned by the other task
+ * and may cease to exist at any time. Don't use or modify
+ * new_timer after the unlock call.
+ */
+out:
+ release_posix_timer(new_timer, it_id_set);
+ return error;
+}
+
+/*
+ * Locking issues: We need to protect the result of the id lookup until
+ * we get the timer locked down so it is not deleted under us. The
+ * removal is done under the idr spinlock so we use that here to bridge
+ * the find to the timer lock. To avoid a deadlock, the timer id MUST
+ * be released without holding the timer lock.
+ */
+static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
+{
+ struct k_itimer *timr;
+ /*
+ * Watch out here. We do an irqsave on the idr_lock and pass the
+ * flags part over to the timer lock. Must not let interrupts in
+ * while we are moving the lock.
+ */
+ spin_lock_irqsave(&idr_lock, *flags);
+ timr = idr_find(&posix_timers_id, (int)timer_id);
+ if (timr) {
+ spin_lock(&timr->it_lock);
+ if (timr->it_process &&
+ same_thread_group(timr->it_process, current)) {
+ spin_unlock(&idr_lock);
+ return timr;
+ }
+ spin_unlock(&timr->it_lock);
+ }
+ spin_unlock_irqrestore(&idr_lock, *flags);
+
+ return NULL;
+}
+
+/*
+ * Get the time remaining on a POSIX.1b interval timer. This function
+ * is ALWAYS called with spin_lock_irq on the timer, thus it must not
+ * mess with irq.
+ *
+ * We have a couple of messes to clean up here. First there is the case
+ * of a timer that has a requeue pending. These timers should appear to
+ * be in the timer list with an expiry as if we were to requeue them
+ * now.
+ *
+ * The second issue is the SIGEV_NONE timer which may be active but is
+ * not really ever put in the timer list (to save system resources).
+ * This timer may have expired, and if so, we handle that here. Otherwise
+ * it is the same as a requeue pending timer with respect to what we
+ * should report.
+ */
+static void
+common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+{
+ ktime_t now, remaining, iv;
+ struct hrtimer *timer = &timr->it.real.timer;
+
+ memset(cur_setting, 0, sizeof(struct itimerspec));
+
+ iv = timr->it.real.interval;
+
+ /* interval timer ? */
+ if (iv.tv64)
+ cur_setting->it_interval = ktime_to_timespec(iv);
+ else if (!hrtimer_active(timer) &&
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ return;
+
+ now = timer->base->get_time();
+
+ /*
+ * When a requeue is pending or this is a SIGEV_NONE
+ * timer move the expiry time forward by intervals, so
+ * expiry is > now.
+ */
+ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+ timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
+
+ remaining = ktime_sub(hrtimer_get_expires(timer), now);
+ /* Return 0 only when the timer is expired and not pending */
+ if (remaining.tv64 <= 0) {
+ /*
+ * A single shot SIGEV_NONE timer must return 0, when
+ * it is expired !
+ */
+ if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ cur_setting->it_value.tv_nsec = 1;
+ } else
+ cur_setting->it_value = ktime_to_timespec(remaining);
+}
+
+/* Get the time remaining on a POSIX.1b interval timer. */
+SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct itimerspec __user *, setting)
+{
+ struct k_itimer *timr;
+ struct itimerspec cur_setting;
+ unsigned long flags;
+
+ timr = lock_timer(timer_id, &flags);
+ if (!timr)
+ return -EINVAL;
+
+ CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
+
+ unlock_timer(timr, flags);
+
+ if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Get the number of overruns of a POSIX.1b interval timer. This is the
+ * overrun count of the timer signal last delivered. At the same time we
+ * are accumulating overruns for the next expiry. The overrun is frozen
+ * when the signal is delivered, either at the notify time (if the info
+ * block is not queued) or at the actual delivery time (as we are informed
+ * by the callback to do_schedule_next_timer()). So all we need to do is
+ * pick up the frozen overrun.
+ */
+SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
+{
+ struct k_itimer *timr;
+ int overrun;
+ unsigned long flags;
+
+ timr = lock_timer(timer_id, &flags);
+ if (!timr)
+ return -EINVAL;
+
+ overrun = timr->it_overrun_last;
+ unlock_timer(timr, flags);
+
+ return overrun;
+}
+
+/* Set a POSIX.1b interval timer. */
+/* timr->it_lock is taken. */
+static int
+common_timer_set(struct k_itimer *timr, int flags,
+ struct itimerspec *new_setting, struct itimerspec *old_setting)
+{
+ struct hrtimer *timer = &timr->it.real.timer;
+ enum hrtimer_mode mode;
+
+ if (old_setting)
+ common_timer_get(timr, old_setting);
+
+ /* disable the timer */
+ timr->it.real.interval.tv64 = 0;
+ /*
+ * Careful here. On SMP we could be in the "fire" routine, which will
+ * be spinning as we hold the lock. But this is ONLY an SMP issue.
+ */
+ if (hrtimer_try_to_cancel(timer) < 0)
+ return TIMER_RETRY;
+
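+ /*
+ * Bump the requeue sequence number and clear the pending bit so that
+ * a signal queued for the previous setting is treated as stale by
+ * do_schedule_next_timer().
+ */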
+ timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
+ ~REQUEUE_PENDING;
+ timr->it_overrun_last = 0;
+
+ /* switch off the timer when it_value is zero */
+ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
+ return 0;
+
+ mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+ hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
+ timr->it.real.timer.function = posix_timer_fn;
+
+ hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
+
+ /* Convert interval */
+ timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+
+ /* SIGEV_NONE timers are not queued ! See common_timer_get */
+ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
+ /* Setup correct expiry time for relative timers */
+ if (mode == HRTIMER_MODE_REL) {
+ hrtimer_add_expires(timer, timer->base->get_time());
+ }
+ return 0;
+ }
+
+ hrtimer_start_expires(timer, mode);
+ return 0;
+}
+
+/* Set a POSIX.1b interval timer */
+SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ const struct itimerspec __user *, new_setting,
+ struct itimerspec __user *, old_setting)
+{
+ struct k_itimer *timr;
+ struct itimerspec new_spec, old_spec;
+ int error = 0;
+ unsigned long flag;
+ struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+
+ if (!new_setting)
+ return -EINVAL;
+
+ if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ return -EFAULT;
+
+ if (!timespec_valid(&new_spec.it_interval) ||
+ !timespec_valid(&new_spec.it_value))
+ return -EINVAL;
+retry:
+ timr = lock_timer(timer_id, &flag);
+ if (!timr)
+ return -EINVAL;
+
+ error = CLOCK_DISPATCH(timr->it_clock, timer_set,
+ (timr, flags, &new_spec, rtn));
+
+ unlock_timer(timr, flag);
+ if (error == TIMER_RETRY) {
+ rtn = NULL; /* We already got the old time... */
+ goto retry;
+ }
+
+ if (old_setting && !error &&
+ copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+ error = -EFAULT;
+
+ return error;
+}
+
+static inline int common_timer_del(struct k_itimer *timer)
+{
+ timer->it.real.interval.tv64 = 0;
+
+ if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
+ return TIMER_RETRY;
+ return 0;
+}
+
+static inline int timer_delete_hook(struct k_itimer *timer)
+{
+ return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
+}
+
+/* Delete a POSIX.1b interval timer. */
+SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
+{
+ struct k_itimer *timer;
+ unsigned long flags;
+
+retry_delete:
+ timer = lock_timer(timer_id, &flags);
+ if (!timer)
+ return -EINVAL;
+
+ if (timer_delete_hook(timer) == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+
+ spin_lock(&current->sighand->siglock);
+ list_del(&timer->list);
+ spin_unlock(&current->sighand->siglock);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+
+ unlock_timer(timer, flags);
+ release_posix_timer(timer, IT_ID_SET);
+ return 0;
+}
+
+/*
+ * Delete a timer owned by the process; used by exit_itimers().
+ */
+static void itimer_delete(struct k_itimer *timer)
+{
+ unsigned long flags;
+
+retry_delete:
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+ if (timer_delete_hook(timer) == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+ list_del(&timer->list);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+
+ unlock_timer(timer, flags);
+ release_posix_timer(timer, IT_ID_SET);
+}
+
+/*
+ * This is called by do_exit or de_thread, only when there are no more
+ * references to the shared signal_struct.
+ */
+void exit_itimers(struct signal_struct *sig)
+{
+ struct k_itimer *tmr;
+
+ while (!list_empty(&sig->posix_timers)) {
+ tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+ itimer_delete(tmr);
+ }
+}
+
+/* Not available / possible... functions */
+int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
+
+int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
+ struct timespec *t, struct timespec __user *r)
+{
+#ifndef ENOTSUP
+ return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
+#else /* parisc does define it separately. */
+ return -ENOTSUP;
+#endif
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
+
+SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
+ const struct timespec __user *, tp)
+{
+ struct timespec new_tp;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+ if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ return -EFAULT;
+
+ return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
+}
+
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+ struct timespec __user *,tp)
+{
+ struct timespec kernel_tp;
+ int error;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+ error = CLOCK_DISPATCH(which_clock, clock_get,
+ (which_clock, &kernel_tp));
+ if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ error = -EFAULT;
+
+ return error;
+
+}
+
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+ struct timespec __user *, tp)
+{
+ struct timespec rtn_tp;
+ int error;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+
+ error = CLOCK_DISPATCH(which_clock, clock_getres,
+ (which_clock, &rtn_tp));
+
+ if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+ error = -EFAULT;
+ }
+
+ return error;
+}
+
+/*
+ * nanosleep for monotonic and realtime clocks
+ */
+static int common_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsave, struct timespec __user *rmtp)
+{
+ return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+}
+
+SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
+ const struct timespec __user *, rqtp,
+ struct timespec __user *, rmtp)
+{
+ struct timespec t;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+
+ if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ return -EFAULT;
+
+ if (!timespec_valid(&t))
+ return -EINVAL;
+
+ return CLOCK_DISPATCH(which_clock, nsleep,
+ (which_clock, flags, &t, rmtp));
+}
+
+/*
+ * nanosleep_restart for monotonic and realtime clocks
+ */
+static int common_nsleep_restart(struct restart_block *restart_block)
+{
+ return hrtimer_nanosleep_restart(restart_block);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->arg0;
+
+ return CLOCK_DISPATCH(which_clock, nsleep_restart,
+ (restart_block));
+}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
new file mode 100644
index 0000000..23bd4da
--- /dev/null
+++ b/kernel/power/Kconfig
@@ -0,0 +1,206 @@
+config PM
+ bool "Power Management support"
+ depends on !IA64_HP_SIM
+ ---help---
+ "Power Management" means that parts of your computer are shut
+ off or put into a power conserving "sleep" mode if they are not
+ being used. There are two competing standards for doing this: APM
+ and ACPI. If you want to use either one, say Y here and then also
+ to the requisite support below.
+
+ Power Management is most important for battery powered laptop
+ computers; if you have a laptop, check out the Linux Laptop home
+ page on the WWW at <http://www.linux-on-laptops.com/> or
+ Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+ and the Battery Powered Linux mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ Note that, even if you say N here, Linux on the x86 architecture
+ will issue the hlt instruction if nothing is to be done, thereby
+ sending the processor to sleep and saving power.
+
+config PM_DEBUG
+ bool "Power Management Debug Support"
+ depends on PM
+ ---help---
+ This option enables various debugging support in the Power Management
+ code. This is helpful when debugging and reporting PM bugs, like
+ suspend support.
+
+config PM_VERBOSE
+ bool "Verbose Power Management debugging"
+ depends on PM_DEBUG
+ default n
+ ---help---
+ This option enables verbose messages from the Power Management code.
+
+config CAN_PM_TRACE
+ def_bool y
+ depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
+
+config PM_TRACE
+ bool
+ help
+ This enables code to save the last PM event point across
+ reboot. The architecture needs to support this, x86 for
+ example does by saving things in the RTC, see below.
+
+ The architecture specific code must provide the extern
+ functions from <linux/resume-trace.h> as well as the
+ <asm/resume-trace.h> header with a TRACE_RESUME() macro.
+
+ The way the information is presented is architecture-
+ dependent, x86 will print the information during a
+ late_initcall.
+
+config PM_TRACE_RTC
+ bool "Suspend/resume event tracing"
+ depends on CAN_PM_TRACE
+ depends on X86
+ select PM_TRACE
+ default n
+ ---help---
+ This enables some cheesy code to save the last PM event point in the
+ RTC across reboots, so that you can debug a machine that just hangs
+ during suspend (or more commonly, during resume).
+
+ To use this debugging feature you should attempt to suspend the
+ machine, reboot it and then run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: this option will cause your machine's real-time clock to be
+ set to an invalid time after a resume.
+
+config PM_SLEEP_SMP
+ bool
+ depends on SMP
+ depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
+ depends on PM_SLEEP
+ select HOTPLUG_CPU
+ default y
+
+config PM_SLEEP
+ bool
+ depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
+ default y
+
+config SUSPEND
+ bool "Suspend to RAM and standby"
+ depends on PM && ARCH_SUSPEND_POSSIBLE
+ default y
+ ---help---
+ Allow the system to enter sleep states in which main memory is
+ powered and thus its contents are preserved, such as the
+ suspend-to-RAM state (e.g. the ACPI S3 state).
+
+config PM_TEST_SUSPEND
+ bool "Test suspend/resume and wakealarm during bootup"
+ depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
+ ---help---
+ This option will let you suspend your machine during bootup, and
+ make it wake up a few seconds later using an RTC wakeup alarm.
+ Enable this with a kernel parameter like "test_suspend=mem".
+
+ You probably want to have your system's RTC driver statically
+ linked, ensuring that it's available when this test runs.
+
+config SUSPEND_FREEZER
+ bool "Enable freezer for suspend to RAM/standby" \
+ if ARCH_WANTS_FREEZER_CONTROL || BROKEN
+ depends on SUSPEND
+ default y
+ help
+ This allows you to turn off the freezer for suspend. If this is
+ done, no tasks are frozen for suspend to RAM/standby.
+
+ Turning OFF this setting is NOT recommended! If in doubt, say Y.
+
+config HIBERNATION
+ bool "Hibernation (aka 'suspend to disk')"
+ depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+ ---help---
+ Enable the suspend to disk (STD) functionality, which is usually
+ called "hibernation" in user interfaces. STD checkpoints the
+ system and powers it off, then restores that checkpoint on reboot.
+
+ You can suspend your machine with 'echo disk > /sys/power/state'
+ after placing resume=/dev/swappartition on the kernel command line
+ in your bootloader's configuration file.
+
+ Alternatively, you can use the additional userland tools available
+ from <http://suspend.sf.net>.
+
+ In principle it does not require ACPI or APM, although for example
+ ACPI will be used for the final steps when it is available. One
+ of the reasons to use software suspend is that the firmware hooks
+ for suspend states like suspend-to-RAM (STR) often don't work very
+ well with Linux.
+
+ It creates an image which is saved in your active swap. Upon the next
+ boot, pass the 'resume=/dev/swappartition' argument to the kernel to
+ have it detect the saved image, restore memory state from it, and
+ continue to run as before. If you do not want the previous state to
+ be reloaded, then use the 'noresume' kernel command line argument.
+ Note, however, that fsck will be run on your filesystems and you will
+ need to run mkswap against the swap partition used for the suspend.
+
+ It also works with swap files to a limited extent (for details see
+ <file:Documentation/power/swsusp-and-swap-files.txt>).
+
+ Right now you may boot without resuming and resume later but in the
+ meantime you cannot use the swap partition(s)/file(s) involved in
+ suspending. Also in this case you must not use the filesystems
+ that were mounted before the suspend. In particular, you MUST NOT
+ MOUNT any journaled filesystems mounted before the suspend or they
+ will get corrupted in a nasty way.
+
+ For more information take a look at <file:Documentation/power/swsusp.txt>.
+
+config PM_STD_PARTITION
+ string "Default resume partition"
+ depends on HIBERNATION
+ default ""
+ ---help---
+ The default resume partition is the partition in which the suspend-
+ to-disk implementation will look for a suspended disk image.
+
+ The partition specified here will be different for almost every user.
+ It should be a valid swap partition (at least for now) that is turned
+ on before suspending.
+
+ The partition specified can be overridden by specifying:
+
+ resume=/dev/<other device>
+
+ which will set the resume partition to the device specified.
+
+ Note there is currently no way to specify which device to save the
+ suspended image to. It will simply pick the first available swap
+ device.
+
+config APM_EMULATION
+ tristate "Advanced Power Management Emulation"
+ depends on PM && SYS_SUPPORTS_APM_EMULATION
+ help
+ APM is a BIOS specification for saving power using several different
+ techniques. This is mostly useful for battery powered laptops with
+ APM compliant BIOSes. If you say Y here, the system time will be
+ reset after a RESUME operation, the /proc/apm device will provide
+ battery status information, and user-space programs will receive
+ notification of APM "events" (e.g. battery status change).
+
+ In order to use APM, you will need supporting software. For location
+ and more information, read <file:Documentation/power/pm.txt> and the
+ Battery Powered Linux mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ This driver does not spin down disk drives (see the hdparm(8)
+ manpage ("man 8 hdparm") for that), and it doesn't turn off
+ VESA-compliant "green" monitors.
+
+ Generally, if you don't have a battery in your machine, there isn't
+ much point in using this driver and you should say N. If you get
+ random kernel OOPSes or reboots that don't seem to be related to
+ anything, try disabling/enabling this option (or disabling/enabling
+ APM in your BIOS).
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
new file mode 100644
index 0000000..597823b
--- /dev/null
+++ b/kernel/power/Makefile
@@ -0,0 +1,10 @@
+
+ifeq ($(CONFIG_PM_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
+
+obj-y := main.o
+obj-$(CONFIG_PM_SLEEP) += process.o console.o
+obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
+
+obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 0000000..b8628be
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,83 @@
+/*
+ * kernel/power/console.c - Functions for saving/restoring the console.
+ *
+ * Originally from swsusp.
+ */
+
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/console.h>
+#include <linux/module.h>
+#include "power.h"
+
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
+
+static int orig_fgconsole, orig_kmsg;
+static int disable_vt_switch;
+
+/*
+ * Normally during a suspend, we allocate a new console and switch to it.
+ * When we resume, we switch back to the original console. This switch
+ * can be slow, so on systems where the framebuffer can handle restoration
+ * of video registers anyway, there's little point in doing the console
+ * switch. This function allows you to disable it by passing it '0'.
+ */
+void pm_set_vt_switch(int do_switch)
+{
+ acquire_console_sem();
+ disable_vt_switch = !do_switch;
+ release_console_sem();
+}
+EXPORT_SYMBOL(pm_set_vt_switch);
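+
+/*
+ * For example, a framebuffer driver that restores the video state itself
+ * can call pm_set_vt_switch(0) to skip the VT switch on suspend/resume;
+ * pm_set_vt_switch(1) restores the default behavior.
+ */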
+
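+/*
+ * Switch the display to a dedicated suspend console and redirect kernel
+ * messages to it. Returns 0 on success (or when VT switching has been
+ * disabled), 1 if the suspend console could not be set up.
+ */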
+int pm_prepare_console(void)
+{
+ acquire_console_sem();
+
+ if (disable_vt_switch) {
+ release_console_sem();
+ return 0;
+ }
+
+ orig_fgconsole = fg_console;
+
+ if (vc_allocate(SUSPEND_CONSOLE)) {
+ /* We can't get a free VC right now. Too bad,
+ * we don't want to mess up the screen for now. */
+ release_console_sem();
+ return 1;
+ }
+
+ if (set_console(SUSPEND_CONSOLE)) {
+ /*
+ * We're unable to switch to the SUSPEND_CONSOLE.
+ * Let the calling function know so it can decide
+ * what to do.
+ */
+ release_console_sem();
+ return 1;
+ }
+ release_console_sem();
+
+ if (vt_waitactive(SUSPEND_CONSOLE)) {
+ pr_debug("Suspend: Can't switch VCs.");
+ return 1;
+ }
+ orig_kmsg = kmsg_redirect;
+ kmsg_redirect = SUSPEND_CONSOLE;
+ return 0;
+}
+
+void pm_restore_console(void)
+{
+ acquire_console_sem();
+ if (disable_vt_switch) {
+ release_console_sem();
+ return;
+ }
+ set_console(orig_fgconsole);
+ release_console_sem();
+ kmsg_redirect = orig_kmsg;
+}
+#endif
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
new file mode 100644
index 0000000..c9d7408
--- /dev/null
+++ b/kernel/power/disk.c
@@ -0,0 +1,901 @@
+/*
+ * kernel/power/disk.c - Suspend-to-disk support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/kmod.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pm.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
+
+#include "power.h"
+
+
+static int noresume = 0;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
+
+enum {
+ HIBERNATION_INVALID,
+ HIBERNATION_PLATFORM,
+ HIBERNATION_TEST,
+ HIBERNATION_TESTPROC,
+ HIBERNATION_SHUTDOWN,
+ HIBERNATION_REBOOT,
+ /* keep last */
+ __HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+static struct platform_hibernation_ops *hibernation_ops;
+
+/**
+ * hibernation_set_ops - set the global hibernate operations
+ * @ops: the hibernation operations to use in subsequent hibernation transitions
+ */
+
+void hibernation_set_ops(struct platform_hibernation_ops *ops)
+{
+ if (ops && !(ops->begin && ops->end && ops->pre_snapshot
+ && ops->prepare && ops->finish && ops->enter && ops->pre_restore
+ && ops->restore_cleanup)) {
+ WARN_ON(1);
+ return;
+ }
+ mutex_lock(&pm_mutex);
+ hibernation_ops = ops;
+ if (ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else if (hibernation_mode == HIBERNATION_PLATFORM)
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+
+ mutex_unlock(&pm_mutex);
+}
+
+#ifdef CONFIG_PM_DEBUG
+static void hibernation_debug_sleep(void)
+{
+ printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+}
+
+static int hibernation_testmode(int mode)
+{
+ if (hibernation_mode == mode) {
+ hibernation_debug_sleep();
+ return 1;
+ }
+ return 0;
+}
+
+static int hibernation_test(int level)
+{
+ if (pm_test_level == level) {
+ hibernation_debug_sleep();
+ return 1;
+ }
+ return 0;
+}
+#else /* !CONFIG_PM_DEBUG */
+static int hibernation_testmode(int mode) { return 0; }
+static int hibernation_test(int level) { return 0; }
+#endif /* !CONFIG_PM_DEBUG */
+
+/**
+ * platform_begin - tell the platform driver that we're starting
+ * hibernation
+ */
+
+static int platform_begin(int platform_mode)
+{
+ return (platform_mode && hibernation_ops) ?
+ hibernation_ops->begin() : 0;
+}
+
+/**
+ * platform_end - tell the platform driver that we've entered the
+ * working state
+ */
+
+static void platform_end(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->end();
+}
+
+/**
+ * platform_pre_snapshot - prepare the machine for hibernation using the
+ * platform driver if so configured and return an error code if it fails
+ */
+
+static int platform_pre_snapshot(int platform_mode)
+{
+ return (platform_mode && hibernation_ops) ?
+ hibernation_ops->pre_snapshot() : 0;
+}
+
+/**
+ * platform_leave - prepare the machine for switching to the normal mode
+ * of operation using the platform driver (called with interrupts disabled)
+ */
+
+static void platform_leave(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->leave();
+}
+
+/**
+ * platform_finish - switch the machine to the normal mode of operation
+ * using the platform driver (must be called after platform_prepare())
+ */
+
+static void platform_finish(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->finish();
+}
+
+/**
+ * platform_pre_restore - prepare the platform for the restoration from a
+ * hibernation image. If the restore fails after this function has been
+ * called, platform_restore_cleanup() must be called.
+ */
+
+static int platform_pre_restore(int platform_mode)
+{
+ return (platform_mode && hibernation_ops) ?
+ hibernation_ops->pre_restore() : 0;
+}
+
+/**
+ * platform_restore_cleanup - switch the platform to the normal mode of
+ * operation after a failing restore. If platform_pre_restore() has been
+ * called before the failing restore, this function must be called too,
+ * regardless of the result of platform_pre_restore().
+ */
+
+static void platform_restore_cleanup(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->restore_cleanup();
+}
+
+/**
+ * platform_recover - recover the platform from a failure to suspend
+ * devices.
+ */
+
+static void platform_recover(int platform_mode)
+{
+ if (platform_mode && hibernation_ops && hibernation_ops->recover)
+ hibernation_ops->recover();
+}
+
+/**
+ * create_image - freeze devices that need to be frozen with interrupts
+ * off, create the hibernation image and thaw those devices. Control
+ * reappears in this routine after a restore.
+ */
+
+static int create_image(int platform_mode)
+{
+ int error;
+
+ error = arch_prepare_suspend();
+ if (error)
+ return error;
+
+ device_pm_lock();
+ local_irq_disable();
+ /* At this point, device_suspend() has been called, but *not*
+ * device_power_down(). We *must* call device_power_down() now.
+ * Otherwise, drivers for some devices (e.g. interrupt controllers)
+ * become desynchronized with the actual state of the hardware
+ * at resume time, and evil weirdness ensues.
+ */
+ error = device_power_down(PMSG_FREEZE);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down, "
+ "aborting hibernation\n");
+ goto Enable_irqs;
+ }
+
+ if (hibernation_test(TEST_CORE))
+ goto Power_up;
+
+ in_suspend = 1;
+ save_processor_state();
+ error = swsusp_arch_suspend();
+ if (error)
+ printk(KERN_ERR "PM: Error %d creating hibernation image\n",
+ error);
+ /* Restore control flow magically appears here */
+ restore_processor_state();
+ if (!in_suspend)
+ platform_leave(platform_mode);
+ Power_up:
+ /* NOTE: device_power_up() is just a resume() for devices
+ * that suspended with irqs off ... no overall powerup.
+ */
+ device_power_up(in_suspend ?
+ (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ Enable_irqs:
+ local_irq_enable();
+ device_pm_unlock();
+ return error;
+}
+
+/**
+ * hibernation_snapshot - quiesce devices and create the hibernation
+ * snapshot image.
+ * @platform_mode - if set, use the platform driver, if available, to
+ * prepare the platform firmware for the power transition.
+ *
+ * Must be called with pm_mutex held
+ */
+
+int hibernation_snapshot(int platform_mode)
+{
+ int error, ftrace_save;
+
+ /* Free memory before shutting down devices. */
+ error = swsusp_shrink_memory();
+ if (error)
+ return error;
+
+ error = platform_begin(platform_mode);
+ if (error)
+ goto Close;
+
+ suspend_console();
+ ftrace_save = __ftrace_enabled_save();
+ error = device_suspend(PMSG_FREEZE);
+ if (error)
+ goto Recover_platform;
+
+ if (hibernation_test(TEST_DEVICES))
+ goto Recover_platform;
+
+ error = platform_pre_snapshot(platform_mode);
+ if (error || hibernation_test(TEST_PLATFORM))
+ goto Finish;
+
+ error = disable_nonboot_cpus();
+ if (!error) {
+ if (hibernation_test(TEST_CPUS))
+ goto Enable_cpus;
+
+ if (hibernation_testmode(HIBERNATION_TEST))
+ goto Enable_cpus;
+
+ error = create_image(platform_mode);
+ /* Control returns here after successful restore */
+ }
+ Enable_cpus:
+ enable_nonboot_cpus();
+ Finish:
+ platform_finish(platform_mode);
+ Resume_devices:
+ device_resume(in_suspend ?
+ (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ __ftrace_enabled_restore(ftrace_save);
+ resume_console();
+ Close:
+ platform_end(platform_mode);
+ return error;
+
+ Recover_platform:
+ platform_recover(platform_mode);
+ goto Resume_devices;
+}
+
+/**
+ * resume_target_kernel - prepare devices that need to be suspended with
+ * interrupts off, restore the contents of highmem that have not been
+ * restored yet from the image and run the low level code that will restore
+ * the remaining contents of memory and switch to the just restored target
+ * kernel.
+ */
+
+static int resume_target_kernel(void)
+{
+ int error;
+
+ device_pm_lock();
+ local_irq_disable();
+ error = device_power_down(PMSG_QUIESCE);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down, "
+ "aborting resume\n");
+ goto Enable_irqs;
+ }
+ /* We'll ignore saved state, but this gets preempt count (etc) right */
+ save_processor_state();
+ error = restore_highmem();
+ if (!error) {
+ error = swsusp_arch_resume();
+ /*
+ * The code below is only ever reached in case of a failure.
+ * Otherwise execution continues at place where
+ * swsusp_arch_suspend() was called
+ */
+ BUG_ON(!error);
+ /* This call to restore_highmem() undoes the previous one */
+ restore_highmem();
+ }
+ /*
+ * The only reason why swsusp_arch_resume() can fail is memory being
+ * very tight, so we have to free it as soon as we can to avoid
+ * subsequent failures
+ */
+ swsusp_free();
+ restore_processor_state();
+ touch_softlockup_watchdog();
+ device_power_up(PMSG_RECOVER);
+ Enable_irqs:
+ local_irq_enable();
+ device_pm_unlock();
+ return error;
+}
+
+/**
+ * hibernation_restore - quiesce devices and restore the hibernation
+ * snapshot image. If successful, control returns in hibernation_snapshot().
+ * @platform_mode - if set, use the platform driver, if available, to
+ * prepare the platform firmware for the transition.
+ *
+ * Must be called with pm_mutex held
+ */
+
+int hibernation_restore(int platform_mode)
+{
+ int error, ftrace_save;
+
+ pm_prepare_console();
+ suspend_console();
+ ftrace_save = __ftrace_enabled_save();
+ error = device_suspend(PMSG_QUIESCE);
+ if (error)
+ goto Finish;
+
+ error = platform_pre_restore(platform_mode);
+ if (!error) {
+ error = disable_nonboot_cpus();
+ if (!error)
+ error = resume_target_kernel();
+ enable_nonboot_cpus();
+ }
+ platform_restore_cleanup(platform_mode);
+ device_resume(PMSG_RECOVER);
+ Finish:
+ __ftrace_enabled_restore(ftrace_save);
+ resume_console();
+ pm_restore_console();
+ return error;
+}
+
+/**
+ * hibernation_platform_enter - enter the hibernation state using the
+ * platform driver (if available)
+ */
+
+int hibernation_platform_enter(void)
+{
+ int error, ftrace_save;
+
+ if (!hibernation_ops)
+ return -ENOSYS;
+
+ /*
+ * We have cancelled the power transition by running
+ * hibernation_ops->finish() before saving the image, so we should let
+ * the firmware know that we're going to enter the sleep state after all
+ */
+ error = hibernation_ops->begin();
+ if (error)
+ goto Close;
+
+ suspend_console();
+ ftrace_save = __ftrace_enabled_save();
+ error = device_suspend(PMSG_HIBERNATE);
+ if (error) {
+ if (hibernation_ops->recover)
+ hibernation_ops->recover();
+ goto Resume_devices;
+ }
+
+ error = hibernation_ops->prepare();
+ if (error)
+ goto Resume_devices;
+
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Finish;
+
+ device_pm_lock();
+ local_irq_disable();
+ error = device_power_down(PMSG_HIBERNATE);
+ if (!error) {
+ hibernation_ops->enter();
+ /* We should never get here */
+ while (1);
+ }
+ local_irq_enable();
+ device_pm_unlock();
+
+ /*
+ * We don't need to reenable the nonboot CPUs or resume consoles, since
+ * the system is going to be halted anyway.
+ */
+ Finish:
+ hibernation_ops->finish();
+ Resume_devices:
+ device_resume(PMSG_RESTORE);
+ __ftrace_enabled_restore(ftrace_save);
+ resume_console();
+ Close:
+ hibernation_ops->end();
+ return error;
+}
+
+/**
+ * power_down - Shut the machine down for hibernation.
+ *
+ * Use the platform driver, if so configured; otherwise try
+ * to power off or reboot.
+ */
+
+static void power_down(void)
+{
+ switch (hibernation_mode) {
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
+ break;
+ case HIBERNATION_REBOOT:
+ kernel_restart(NULL);
+ break;
+ case HIBERNATION_PLATFORM:
+ hibernation_platform_enter();
+ case HIBERNATION_SHUTDOWN:
+ kernel_power_off();
+ break;
+ }
+ kernel_halt();
+ /*
+ * A valid image is on the disk; if we continue, we risk serious data
+ * corruption after resume.
+ */
+ printk(KERN_CRIT "PM: Please power down manually\n");
+ while(1);
+}
+
+static int prepare_processes(void)
+{
+ int error = 0;
+
+ if (freeze_processes()) {
+ error = -EBUSY;
+ thaw_processes();
+ }
+ return error;
+}
+
+/**
+ * hibernate - The grandpappy of the built-in hibernation management
+ */
+
+int hibernate(void)
+{
+ int error;
+
+ mutex_lock(&pm_mutex);
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+
+ pm_prepare_console();
+ error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ if (error)
+ goto Exit;
+
+ error = usermodehelper_disable();
+ if (error)
+ goto Exit;
+
+ /* Allocate memory management structures */
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Exit;
+
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ error = prepare_processes();
+ if (error)
+ goto Finish;
+
+ if (hibernation_test(TEST_FREEZER))
+ goto Thaw;
+
+ if (hibernation_testmode(HIBERNATION_TESTPROC))
+ goto Thaw;
+
+ error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
+ if (in_suspend && !error) {
+ unsigned int flags = 0;
+
+ if (hibernation_mode == HIBERNATION_PLATFORM)
+ flags |= SF_PLATFORM_MODE;
+ pr_debug("PM: writing image.\n");
+ error = swsusp_write(flags);
+ swsusp_free();
+ if (!error)
+ power_down();
+ } else {
+ pr_debug("PM: Image restored successfully.\n");
+ swsusp_free();
+ }
+ Thaw:
+ thaw_processes();
+ Finish:
+ free_basic_memory_bitmaps();
+ usermodehelper_enable();
+ Exit:
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ pm_restore_console();
+ atomic_inc(&snapshot_device_available);
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ return error;
+}
+
+
+/**
+ * software_resume - Resume from a saved image.
+ *
+ * Called as a late_initcall (so all devices are discovered and
+ * initialized), we call swsusp to see if we have a saved image or not.
+ * If so, we quiesce devices, then restore the saved image. We will
+ * return above (in hibernate()) if everything goes well.
+ * Otherwise, we fail gracefully and return to the normally
+ * scheduled program.
+ *
+ */
+
+static int software_resume(void)
+{
+ int error;
+ unsigned int flags;
+
+ /*
+ * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
+ * is configured into the kernel. Since the regular hibernate
+ * trigger path is via sysfs which takes a buffer mutex before
+ * calling hibernate functions (which take pm_mutex) this can
+ * cause lockdep to complain about a possible ABBA deadlock
+ * which cannot happen since we're in the boot code here and
+ * sysfs can't be invoked yet. Therefore, we use a subclass
+ * here to avoid lockdep complaining.
+ */
+ mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+ if (!swsusp_resume_device) {
+ if (!strlen(resume_file)) {
+ mutex_unlock(&pm_mutex);
+ return -ENOENT;
+ }
+ swsusp_resume_device = name_to_dev_t(resume_file);
+ pr_debug("PM: Resume from partition %s\n", resume_file);
+ } else {
+ pr_debug("PM: Resume from partition %d:%d\n",
+ MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
+ }
+
+ if (noresume) {
+ /**
+ * FIXME: If noresume is specified, we need to find the
+ * partition and reset it back to normal swap space.
+ */
+ mutex_unlock(&pm_mutex);
+ return 0;
+ }
+
+ pr_debug("PM: Checking hibernation image.\n");
+ error = swsusp_check();
+ if (error)
+ goto Unlock;
+
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+
+ pm_prepare_console();
+ error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ if (error)
+ goto Finish;
+
+ error = usermodehelper_disable();
+ if (error)
+ goto Finish;
+
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Finish;
+
+ pr_debug("PM: Preparing processes for restore.\n");
+ error = prepare_processes();
+ if (error) {
+ swsusp_close(FMODE_READ);
+ goto Done;
+ }
+
+ pr_debug("PM: Reading hibernation image.\n");
+
+ error = swsusp_read(&flags);
+ if (!error)
+ hibernation_restore(flags & SF_PLATFORM_MODE);
+
+ printk(KERN_ERR "PM: Restore failed, recovering.\n");
+ swsusp_free();
+ thaw_processes();
+ Done:
+ free_basic_memory_bitmaps();
+ usermodehelper_enable();
+ Finish:
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ pm_restore_console();
+ atomic_inc(&snapshot_device_available);
+ /* For success case, the suspend path will release the lock */
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ pr_debug("PM: Resume from disk failed.\n");
+ return error;
+}
+
+late_initcall(software_resume);
+
+
+static const char * const hibernation_modes[] = {
+ [HIBERNATION_PLATFORM] = "platform",
+ [HIBERNATION_SHUTDOWN] = "shutdown",
+ [HIBERNATION_REBOOT] = "reboot",
+ [HIBERNATION_TEST] = "test",
+ [HIBERNATION_TESTPROC] = "testproc",
+};
+
+/**
+ * disk - Control hibernation mode
+ *
+ * Suspend-to-disk can be handled in several ways. We have a few options
+ * for putting the system to sleep - using the platform driver (e.g. ACPI
+ * or other hibernation_ops), powering off the system or rebooting the
+ * system (for testing) as well as the two test modes.
+ *
+ * The system can support 'platform', and that is known a priori (and
+ * encoded by the presence of hibernation_ops). However, the user may
+ * choose 'shutdown' or 'reboot' as alternatives, as well as one of the
+ * test modes, 'test' or 'testproc'.
+ *
+ * show() will display what the mode is currently set to.
+ * store() will accept one of
+ *
+ * 'platform'
+ * 'shutdown'
+ * 'reboot'
+ * 'test'
+ * 'testproc'
+ *
+ * It will only change to 'platform' if the system
+ * supports it (as determined by having hibernation_ops).
+ */
+
+static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int i;
+ char *start = buf;
+
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!hibernation_modes[i])
+ continue;
+ switch (i) {
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
+ break;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ break;
+ /* not a valid mode, continue with loop */
+ continue;
+ }
+ if (i == hibernation_mode)
+ buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
+ else
+ buf += sprintf(buf, "%s ", hibernation_modes[i]);
+ }
+ buf += sprintf(buf, "\n");
+ return buf-start;
+}
+
+
+static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = 0;
+ int i;
+ int len;
+ char *p;
+ int mode = HIBERNATION_INVALID;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ mutex_lock(&pm_mutex);
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (len == strlen(hibernation_modes[i])
+ && !strncmp(buf, hibernation_modes[i], len)) {
+ mode = i;
+ break;
+ }
+ }
+ if (mode != HIBERNATION_INVALID) {
+ switch (mode) {
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
+ hibernation_mode = mode;
+ break;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ hibernation_mode = mode;
+ else
+ error = -EINVAL;
+ }
+ } else
+ error = -EINVAL;
+
+ if (!error)
+ pr_debug("PM: Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
+ mutex_unlock(&pm_mutex);
+ return error ? error : n;
+}
+
+power_attr(disk);
+
+static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
+}
+
+static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int maj, min;
+ dev_t res;
+ int ret = -EINVAL;
+
+ if (sscanf(buf, "%u:%u", &maj, &min) != 2)
+ goto out;
+
+ res = MKDEV(maj,min);
+ if (maj != MAJOR(res) || min != MINOR(res))
+ goto out;
+
+ mutex_lock(&pm_mutex);
+ swsusp_resume_device = res;
+ mutex_unlock(&pm_mutex);
+ printk(KERN_INFO "PM: Starting manual resume from disk\n");
+ noresume = 0;
+ software_resume();
+ ret = n;
+ out:
+ return ret;
+}
+
+power_attr(resume);
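
A small user-space sketch (not from the tree) of the manual-resume path above: resume_store() expects a "major:minor" pair, so writing one to /sys/power/resume kicks off software_resume(). The 8:3 device number is a made-up example.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *dev = "8:3";	/* hypothetical swap partition */
		int fd = open("/sys/power/resume", O_WRONLY);

		if (fd < 0) {
			perror("open /sys/power/resume");
			return 1;
		}
		/* resume_store() parses this with sscanf("%u:%u") and then
		 * calls software_resume() */
		if (write(fd, dev, strlen(dev)) < 0)
			perror("write");
		close(fd);
		return 0;
	}
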
+
+static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", image_size);
+}
+
+static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long size;
+
+ if (sscanf(buf, "%lu", &size) == 1) {
+ image_size = size;
+ return n;
+ }
+
+ return -EINVAL;
+}
+
+power_attr(image_size);
+
+static struct attribute * g[] = {
+ &disk_attr.attr,
+ &resume_attr.attr,
+ &image_size_attr.attr,
+ NULL,
+};
+
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+
+static int __init pm_disk_init(void)
+{
+ return sysfs_create_group(power_kobj, &attr_group);
+}
+
+core_initcall(pm_disk_init);
+
+
+static int __init resume_setup(char *str)
+{
+ if (noresume)
+ return 1;
+
+ strncpy( resume_file, str, 255 );
+ return 1;
+}
+
+static int __init resume_offset_setup(char *str)
+{
+ unsigned long long offset;
+
+ if (noresume)
+ return 1;
+
+ if (sscanf(str, "%llu", &offset) == 1)
+ swsusp_resume_block = offset;
+
+ return 1;
+}
+
+static int __init noresume_setup(char *str)
+{
+ noresume = 1;
+ return 1;
+}
+
+__setup("noresume", noresume_setup);
+__setup("resume_offset=", resume_offset_setup);
+__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
new file mode 100644
index 0000000..b8f7ce9
--- /dev/null
+++ b/kernel/power/main.c
@@ -0,0 +1,732 @@
+/*
+ * kernel/power/main.c - PM subsystem core functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ *
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/kmod.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/resume-trace.h>
+#include <linux/freezer.h>
+#include <linux/vmstat.h>
+#include <linux/syscalls.h>
+#include <linux/ftrace.h>
+
+#include "power.h"
+
+DEFINE_MUTEX(pm_mutex);
+
+unsigned int pm_flags;
+EXPORT_SYMBOL(pm_flags);
+
+#ifdef CONFIG_PM_SLEEP
+
+/* Routines for PM-transition notifications */
+
+static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
+
+int register_pm_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&pm_chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(register_pm_notifier);
+
+int unregister_pm_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&pm_chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_pm_notifier);
+
+int pm_notifier_call_chain(unsigned long val)
+{
+ return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
+ == NOTIFY_BAD) ? -EINVAL : 0;
+}
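
register_pm_notifier() is the hook drivers and subsystems use to hear about the PM_* events issued through this chain. A minimal module sketch, with invented example_* names, might look like this:

	#include <linux/module.h>
	#include <linux/notifier.h>
	#include <linux/suspend.h>

	static int example_pm_callback(struct notifier_block *nb,
				       unsigned long action, void *data)
	{
		switch (action) {
		case PM_HIBERNATION_PREPARE:
		case PM_SUSPEND_PREPARE:
			/* quiesce private state before the transition starts */
			break;
		case PM_POST_HIBERNATION:
		case PM_POST_SUSPEND:
		case PM_POST_RESTORE:
			/* undo whatever the PREPARE branch did */
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block example_pm_nb = {
		.notifier_call = example_pm_callback,
	};

	static int __init example_init(void)
	{
		return register_pm_notifier(&example_pm_nb);
	}

	static void __exit example_exit(void)
	{
		unregister_pm_notifier(&example_pm_nb);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");
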
+
+#ifdef CONFIG_PM_DEBUG
+int pm_test_level = TEST_NONE;
+
+static int suspend_test(int level)
+{
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ return 1;
+ }
+ return 0;
+}
+
+static const char * const pm_tests[__TEST_AFTER_LAST] = {
+ [TEST_NONE] = "none",
+ [TEST_CORE] = "core",
+ [TEST_CPUS] = "processors",
+ [TEST_PLATFORM] = "platform",
+ [TEST_DEVICES] = "devices",
+ [TEST_FREEZER] = "freezer",
+};
+
+static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ char *s = buf;
+ int level;
+
+ for (level = TEST_FIRST; level <= TEST_MAX; level++)
+ if (pm_tests[level]) {
+ if (level == pm_test_level)
+ s += sprintf(s, "[%s] ", pm_tests[level]);
+ else
+ s += sprintf(s, "%s ", pm_tests[level]);
+ }
+
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+
+ return (s - buf);
+}
+
+static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ const char * const *s;
+ int level;
+ char *p;
+ int len;
+ int error = -EINVAL;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ mutex_lock(&pm_mutex);
+
+ level = TEST_FIRST;
+ for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
+ if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
+ pm_test_level = level;
+ error = 0;
+ break;
+ }
+
+ mutex_unlock(&pm_mutex);
+
+ return error ? error : n;
+}
+
+power_attr(pm_test);
+#else /* !CONFIG_PM_DEBUG */
+static inline int suspend_test(int level) { return 0; }
+#endif /* !CONFIG_PM_DEBUG */
+
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_SUSPEND
+
+#ifdef CONFIG_PM_TEST_SUSPEND
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending. Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS 5
+
+static unsigned long suspend_test_start_time;
+
+static void suspend_test_start(void)
+{
+ /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+ * What we want is a hardware counter that will work correctly even
+ * during the irqs-are-off stages of the suspend/resume cycle...
+ */
+ suspend_test_start_time = jiffies;
+}
+
+static void suspend_test_finish(const char *label)
+{
+ long nj = jiffies - suspend_test_start_time;
+ unsigned msec;
+
+ msec = jiffies_to_msecs(abs(nj));
+ pr_info("PM: %s took %d.%03d seconds\n", label,
+ msec / 1000, msec % 1000);
+
+ /* Warning on suspend means the RTC alarm period needs to be
+ * larger -- the system was sooo slooowwww to suspend that the
+ * alarm (should have) fired before the system went to sleep!
+ *
+ * Warning on either suspend or resume also means the system
+ * has some performance issues. The stack dump of a WARN_ON
+ * is more likely to get the right attention than a printk...
+ */
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+}
+
+#else
+
+static void suspend_test_start(void)
+{
+}
+
+static void suspend_test_finish(const char *label)
+{
+}
+
+#endif
+
+/* This is just an arbitrary number */
+#define FREE_PAGE_NUMBER (100)
+
+static struct platform_suspend_ops *suspend_ops;
+
+/**
+ * suspend_set_ops - Set the global suspend method table.
+ * @ops: Pointer to ops structure.
+ */
+
+void suspend_set_ops(struct platform_suspend_ops *ops)
+{
+ mutex_lock(&pm_mutex);
+ suspend_ops = ops;
+ mutex_unlock(&pm_mutex);
+}
+
+/**
+ * suspend_valid_only_mem - generic memory-only valid callback
+ *
+ * Platform drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
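
A sketch of the consumer side of suspend_set_ops(), assuming built-in platform code; the example_* names are invented. Only ->enter() is mandatory (suspend_prepare() checks for it), and suspend_valid_only_mem serves as the ->valid() callback for mem-only platforms as described above.

	#include <linux/init.h>
	#include <linux/suspend.h>

	static int example_suspend_enter(suspend_state_t state)
	{
		/* program the hardware for its low-power state here */
		return 0;
	}

	static struct platform_suspend_ops example_suspend_ops = {
		.valid = suspend_valid_only_mem,	/* only PM_SUSPEND_MEM */
		.enter = example_suspend_enter,
	};

	static int __init example_suspend_init(void)
	{
		suspend_set_ops(&example_suspend_ops);
		return 0;
	}
	core_initcall(example_suspend_init);
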
+
+/**
+ * suspend_prepare - Do prep work before entering low-power state.
+ *
+ * This is common code that is called for each state that we're entering.
+ * Run suspend notifiers, allocate a console and stop all processes.
+ */
+static int suspend_prepare(void)
+{
+ int error;
+ unsigned int free_pages;
+
+ if (!suspend_ops || !suspend_ops->enter)
+ return -EPERM;
+
+ pm_prepare_console();
+
+ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ if (error)
+ goto Finish;
+
+ error = usermodehelper_disable();
+ if (error)
+ goto Finish;
+
+ if (suspend_freeze_processes()) {
+ error = -EAGAIN;
+ goto Thaw;
+ }
+
+ free_pages = global_page_state(NR_FREE_PAGES);
+ if (free_pages < FREE_PAGE_NUMBER) {
+ pr_debug("PM: free some memory\n");
+ shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
+ if (nr_free_pages() < FREE_PAGE_NUMBER) {
+ error = -ENOMEM;
+ printk(KERN_ERR "PM: No enough memory\n");
+ }
+ }
+ if (!error)
+ return 0;
+
+ Thaw:
+ suspend_thaw_processes();
+ usermodehelper_enable();
+ Finish:
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+ return error;
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+ local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+ local_irq_enable();
+}
+
+/**
+ * suspend_enter - enter the desired system sleep state.
+ * @state: state to enter
+ *
+ * This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state)
+{
+ int error = 0;
+
+ device_pm_lock();
+ arch_suspend_disable_irqs();
+ BUG_ON(!irqs_disabled());
+
+ if ((error = device_power_down(PMSG_SUSPEND))) {
+ printk(KERN_ERR "PM: Some devices failed to power down\n");
+ goto Done;
+ }
+
+ if (!suspend_test(TEST_CORE))
+ error = suspend_ops->enter(state);
+
+ device_power_up(PMSG_RESUME);
+ Done:
+ arch_suspend_enable_irqs();
+ BUG_ON(irqs_disabled());
+ device_pm_unlock();
+ return error;
+}
+
+/**
+ * suspend_devices_and_enter - suspend devices and enter the desired system
+ * sleep state.
+ * @state: state to enter
+ */
+int suspend_devices_and_enter(suspend_state_t state)
+{
+ int error, ftrace_save;
+
+ if (!suspend_ops)
+ return -ENOSYS;
+
+ if (suspend_ops->begin) {
+ error = suspend_ops->begin(state);
+ if (error)
+ goto Close;
+ }
+ suspend_console();
+ ftrace_save = __ftrace_enabled_save();
+ suspend_test_start();
+ error = device_suspend(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to suspend\n");
+ goto Recover_platform;
+ }
+ suspend_test_finish("suspend devices");
+ if (suspend_test(TEST_DEVICES))
+ goto Recover_platform;
+
+ if (suspend_ops->prepare) {
+ error = suspend_ops->prepare();
+ if (error)
+ goto Resume_devices;
+ }
+
+ if (suspend_test(TEST_PLATFORM))
+ goto Finish;
+
+ error = disable_nonboot_cpus();
+ if (!error && !suspend_test(TEST_CPUS))
+ suspend_enter(state);
+
+ enable_nonboot_cpus();
+ Finish:
+ if (suspend_ops->finish)
+ suspend_ops->finish();
+ Resume_devices:
+ suspend_test_start();
+ device_resume(PMSG_RESUME);
+ suspend_test_finish("resume devices");
+ __ftrace_enabled_restore(ftrace_save);
+ resume_console();
+ Close:
+ if (suspend_ops->end)
+ suspend_ops->end();
+ return error;
+
+ Recover_platform:
+ if (suspend_ops->recover)
+ suspend_ops->recover();
+ goto Resume_devices;
+}
+
+/**
+ * suspend_finish - Do final work before exiting suspend sequence.
+ *
+ * Call platform code to clean up, restart processes, and free the
+ * console that we've allocated. This is not called for suspend-to-disk.
+ */
+static void suspend_finish(void)
+{
+ suspend_thaw_processes();
+ usermodehelper_enable();
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+}
+
+
+
+
+static const char * const pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+};
+
+static inline int valid_state(suspend_state_t state)
+{
+ /* All states need lowlevel support and need to be valid
+ * to the lowlevel implementation, no valid callback
+ * implies that none are valid. */
+ if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
+ return 0;
+ return 1;
+}
+
+
+/**
+ * enter_state - Do common work of entering low-power state.
+ * @state: pm_state structure for state we're entering.
+ *
+ * Make sure we're the only ones trying to enter a sleep state. Fail
+ * if someone has beat us to it, since we don't want anything weird to
+ * happen when we wake up.
+ * Then, do the setup for suspend, enter the state, and clean up (after
+ * we've woken up).
+ */
+static int enter_state(suspend_state_t state)
+{
+ int error;
+
+ if (!valid_state(state))
+ return -ENODEV;
+
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ error = suspend_prepare();
+ if (error)
+ goto Unlock;
+
+ if (suspend_test(TEST_FREEZER))
+ goto Finish;
+
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ error = suspend_devices_and_enter(state);
+
+ Finish:
+ pr_debug("PM: Finishing wakeup.\n");
+ suspend_finish();
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ return error;
+}
+
+
+/**
+ * pm_suspend - Externally visible function for suspending system.
+ * @state: Enumerated value of state to enter.
+ *
+ * Determine whether or not value is within range, get state
+ * structure, and enter (above).
+ */
+
+int pm_suspend(suspend_state_t state)
+{
+ if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
+ return enter_state(state);
+ return -EINVAL;
+}
+
+EXPORT_SYMBOL(pm_suspend);
+
+#endif /* CONFIG_SUSPEND */
+
+struct kobject *power_kobj;
+
+/**
+ * state - control system power state.
+ *
+ * show() returns what states are supported, which is hard-coded to
+ * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
+ * 'disk' (Suspend-to-Disk).
+ *
+ * store() accepts one of those strings, translates it into the
+ * proper enumerated value, and initiates a suspend transition.
+ */
+
+static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ char *s = buf;
+#ifdef CONFIG_SUSPEND
+ int i;
+
+ for (i = 0; i < PM_SUSPEND_MAX; i++) {
+ if (pm_states[i] && valid_state(i))
+ s += sprintf(s,"%s ", pm_states[i]);
+ }
+#endif
+#ifdef CONFIG_HIBERNATION
+ s += sprintf(s, "%s\n", "disk");
+#else
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+#endif
+ return (s - buf);
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+#ifdef CONFIG_SUSPEND
+ suspend_state_t state = PM_SUSPEND_STANDBY;
+ const char * const *s;
+#endif
+ char *p;
+ int len;
+ int error = -EINVAL;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ /* First, check if we are requested to hibernate */
+ if (len == 4 && !strncmp(buf, "disk", len)) {
+ error = hibernate();
+ goto Exit;
+ }
+
+#ifdef CONFIG_SUSPEND
+ for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
+ if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
+ break;
+ }
+ if (state < PM_SUSPEND_MAX && *s)
+ error = enter_state(state);
+#endif
+
+ Exit:
+ return error ? error : n;
+}
+
+power_attr(state);
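
From user space, the attribute above is the usual way to trigger a transition. A sketch (the sysfs path is real, the rest is illustrative): read /sys/power/state to see what state_show() reports, then write one of the tokens back so state_store() can act on it.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char supported[64];
		int fd;

		fd = open("/sys/power/state", O_RDONLY);
		if (fd >= 0) {
			ssize_t n = read(fd, supported, sizeof(supported) - 1);
			if (n > 0) {
				supported[n] = '\0';
				printf("supported states: %s", supported);
			}
			close(fd);
		}

		fd = open("/sys/power/state", O_WRONLY);
		if (fd < 0) {
			perror("open /sys/power/state");
			return 1;
		}
		/* "mem" ends up in enter_state(PM_SUSPEND_MEM); "disk" calls hibernate() */
		if (write(fd, "mem", 3) < 0)
			perror("write");
		close(fd);
		return 0;
	}
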
+
+#ifdef CONFIG_PM_TRACE
+int pm_trace_enabled;
+
+static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", pm_trace_enabled);
+}
+
+static ssize_t
+pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int val;
+
+ if (sscanf(buf, "%d", &val) == 1) {
+ pm_trace_enabled = !!val;
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(pm_trace);
+#endif /* CONFIG_PM_TRACE */
+
+static struct attribute * g[] = {
+ &state_attr.attr,
+#ifdef CONFIG_PM_TRACE
+ &pm_trace_attr.attr,
+#endif
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
+ &pm_test_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+
+static int __init pm_init(void)
+{
+ power_kobj = kobject_create_and_add("power", NULL);
+ if (!power_kobj)
+ return -ENOMEM;
+ return sysfs_create_group(power_kobj, &attr_group);
+}
+
+core_initcall(pm_init);
+
+
+#ifdef CONFIG_PM_TEST_SUSPEND
+
+#include <linux/rtc.h>
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system. RTCs wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+ static char err_readtime[] __initdata =
+ KERN_ERR "PM: can't read %s time, err %d\n";
+ static char err_wakealarm [] __initdata =
+ KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+ static char err_suspend[] __initdata =
+ KERN_ERR "PM: suspend test failed, error %d\n";
+ static char info_test[] __initdata =
+ KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+ unsigned long now;
+ struct rtc_wkalrm alm;
+ int status;
+
+ /* this may fail if the RTC hasn't been initialized */
+ status = rtc_read_time(rtc, &alm.time);
+ if (status < 0) {
+ printk(err_readtime, rtc->dev.bus_id, status);
+ return;
+ }
+ rtc_tm_to_time(&alm.time, &now);
+
+ memset(&alm, 0, sizeof alm);
+ rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ alm.enabled = true;
+
+ status = rtc_set_alarm(rtc, &alm);
+ if (status < 0) {
+ printk(err_wakealarm, rtc->dev.bus_id, status);
+ return;
+ }
+
+ if (state == PM_SUSPEND_MEM) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ if (status == -ENODEV)
+ state = PM_SUSPEND_STANDBY;
+ }
+ if (state == PM_SUSPEND_STANDBY) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+ if (status < 0)
+ printk(err_suspend, status);
+
+ /* Some platforms can't detect that the alarm triggered the
+ * wakeup, or (accordingly) disable it afterwards.
+ * It's supposed to give oneshot behavior; cope.
+ */
+ alm.enabled = false;
+ rtc_set_alarm(rtc, &alm);
+}
+
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ *(char **)name_ptr = dev->bus_id;
+ return 1;
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time. They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+
+static char warn_bad_state[] __initdata =
+ KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+ unsigned i;
+
+ /* "=mem" ==> "mem" */
+ value++;
+ for (i = 0; i < PM_SUSPEND_MAX; i++) {
+ if (!pm_states[i])
+ continue;
+ if (strcmp(pm_states[i], value) != 0)
+ continue;
+ test_state = (__force suspend_state_t) i;
+ return 0;
+ }
+ printk(warn_bad_state, value);
+ return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+ static char warn_no_rtc[] __initdata =
+ KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+ char *pony = NULL;
+ struct rtc_device *rtc = NULL;
+
+ /* PM is initialized by now; is that state testable? */
+ if (test_state == PM_SUSPEND_ON)
+ goto done;
+ if (!valid_state(test_state)) {
+ printk(warn_bad_state, pm_states[test_state]);
+ goto done;
+ }
+
+ /* RTCs have initialized by now too ... can we use one? */
+ class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+ if (pony)
+ rtc = rtc_class_open(pony);
+ if (!rtc) {
+ printk(warn_no_rtc);
+ goto done;
+ }
+
+ /* go for it */
+ test_wakealarm(rtc, test_state);
+ rtc_class_close(rtc);
+done:
+ return 0;
+}
+late_initcall(test_suspend);
+
+#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
new file mode 100644
index 0000000..46b5ec7
--- /dev/null
+++ b/kernel/power/power.h
@@ -0,0 +1,225 @@
+#include <linux/suspend.h>
+#include <linux/suspend_ioctls.h>
+#include <linux/utsname.h>
+#include <linux/freezer.h>
+
+struct swsusp_info {
+ struct new_utsname uts;
+ u32 version_code;
+ unsigned long num_physpages;
+ int cpus;
+ unsigned long image_pages;
+ unsigned long pages;
+ unsigned long size;
+} __attribute__((aligned(PAGE_SIZE)));
+
+#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ARCH_HIBERNATION_HEADER
+/* Maximum size of architecture specific data in a hibernation header */
+#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
+
+extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
+extern int arch_hibernation_header_restore(void *addr);
+
+static inline int init_header_complete(struct swsusp_info *info)
+{
+ return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
+}
+
+static inline char *check_image_kernel(struct swsusp_info *info)
+{
+ return arch_hibernation_header_restore(info) ?
+ "architecture specific data" : NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+
+/*
+ * Keep some memory free so that I/O operations can succeed without paging
+ * [Might this be more than 4 MB?]
+ */
+#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+
+/*
+ * Keep 1 MB of memory free so that device drivers can allocate some pages in
+ * their .suspend() routines without breaking the suspend to disk.
+ */
+#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
+
+/* kernel/power/disk.c */
+extern int hibernation_snapshot(int platform_mode);
+extern int hibernation_restore(int platform_mode);
+extern int hibernation_platform_enter(void);
+#endif
+
+extern int pfn_is_nosave(unsigned long);
+
+#define power_attr(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = 0644, \
+ }, \
+ .show = _name##_show, \
+ .store = _name##_store, \
+}
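
As a reading aid, power_attr(disk) in disk.c expands to roughly the following, which is why each sysfs attribute in this subsystem only needs a matching pair of <name>_show()/<name>_store() functions:

	static struct kobj_attribute disk_attr = {
		.attr = {
			.name = "disk",		/* __stringify(disk) */
			.mode = 0644,
		},
		.show  = disk_show,
		.store = disk_store,
	};
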
+
+/* Preferred image size in bytes (default 500 MB) */
+extern unsigned long image_size;
+extern int in_suspend;
+extern dev_t swsusp_resume_device;
+extern sector_t swsusp_resume_block;
+
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
+
+extern int create_basic_memory_bitmaps(void);
+extern void free_basic_memory_bitmaps(void);
+extern unsigned int count_data_pages(void);
+
+/**
+ * Auxiliary structure used for reading the snapshot image data and
+ * metadata from and writing them to the list of page backup entries
+ * (PBEs) which is the main data structure of swsusp.
+ *
+ * Using struct snapshot_handle we can transfer the image, including its
+ * metadata, as a continuous sequence of bytes with the help of
+ * snapshot_read_next() and snapshot_write_next().
+ *
+ * The code that writes the image to storage or transfers it to
+ * user space is required to use snapshot_read_next() for this
+ * purpose and it should not make any assumptions regarding the internal
+ * structure of the image. Similarly, the code that reads the image from
+ * storage or transfers it from user space is required to use
+ * snapshot_write_next().
+ *
+ * This may allow us to change the internal structure of the image
+ * in the future with considerably less effort.
+ */
+
+struct snapshot_handle {
+ loff_t offset; /* number of the last byte ready for reading
+ * or writing in the sequence
+ */
+ unsigned int cur; /* number of the block of PAGE_SIZE bytes the
+ * next operation will refer to (ie. current)
+ */
+ unsigned int cur_offset; /* offset with respect to the current
+ * block (for the next operation)
+ */
+ unsigned int prev; /* number of the block of PAGE_SIZE bytes that
+ * was the current one previously
+ */
+ void *buffer; /* address of the block to read from
+ * or write to
+ */
+ unsigned int buf_offset; /* location to read from or write to,
+ * given as a displacement from 'buffer'
+ */
+ int sync_read; /* Set to one to notify the caller of
+ * snapshot_write_next() that it may
+ * need to call wait_on_bio_chain()
+ */
+};
+
+/* This macro returns the address from/to which the caller of
+ * snapshot_read_next()/snapshot_write_next() is allowed to
+ * read/write data after the function returns
+ */
+#define data_of(handle) ((handle).buffer + (handle).buf_offset)
+
+extern unsigned int snapshot_additional_pages(struct zone *zone);
+extern unsigned long snapshot_get_image_size(void);
+extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
+extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+extern void snapshot_write_finalize(struct snapshot_handle *handle);
+extern int snapshot_image_loaded(struct snapshot_handle *handle);
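
A sketch of the read side of this interface, written as if it sat inside the image-writing code (so the kernel headers pulled in there are assumed): example_write_page() is invented, and the real consumer is kernel/power/swap.c, but the snapshot_read_next()/data_of() loop is the pattern the comment above describes.

	/* hypothetical page writer; stands in for the real swap writer */
	static int example_write_page(void *buf, size_t count)
	{
		/* a real writer would submit 'count' bytes from 'buf' to the device */
		return 0;
	}

	static int example_save_image(void)
	{
		struct snapshot_handle handle;
		int ret;

		memset(&handle, 0, sizeof(struct snapshot_handle));
		for (;;) {
			/* > 0: that many bytes are ready at data_of(handle),
			 * 0: whole image transferred, < 0: error */
			ret = snapshot_read_next(&handle, PAGE_SIZE);
			if (ret <= 0)
				break;
			if (example_write_page(data_of(handle), ret))
				return -EIO;
		}
		return ret < 0 ? ret : 0;
	}
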
+
+/* If unset, the snapshot device cannot be open. */
+extern atomic_t snapshot_device_available;
+
+extern sector_t alloc_swapdev_block(int swap);
+extern void free_all_swap_pages(int swap);
+extern int swsusp_swap_in_use(void);
+
+/*
+ * Flags that can be passed from the hibernating kernel to the "boot" kernel in
+ * the image header.
+ */
+#define SF_PLATFORM_MODE 1
+
+/* kernel/power/disk.c */
+extern int swsusp_check(void);
+extern int swsusp_shrink_memory(void);
+extern void swsusp_free(void);
+extern int swsusp_read(unsigned int *flags_p);
+extern int swsusp_write(unsigned int flags);
+extern void swsusp_close(fmode_t);
+
+struct timeval;
+/* kernel/power/swsusp.c */
+extern void swsusp_show_speed(struct timeval *, struct timeval *,
+ unsigned int, char *);
+
+#ifdef CONFIG_SUSPEND
+/* kernel/power/main.c */
+extern int suspend_devices_and_enter(suspend_state_t state);
+#else /* !CONFIG_SUSPEND */
+static inline int suspend_devices_and_enter(suspend_state_t state)
+{
+ return -ENOSYS;
+}
+#endif /* !CONFIG_SUSPEND */
+
+#ifdef CONFIG_PM_SLEEP
+/* kernel/power/main.c */
+extern int pm_notifier_call_chain(unsigned long val);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void);
+int restore_highmem(void);
+#else
+static inline unsigned int count_highmem_pages(void) { return 0; }
+static inline int restore_highmem(void) { return 0; }
+#endif
+
+/*
+ * Suspend test levels
+ */
+enum {
+ /* keep first */
+ TEST_NONE,
+ TEST_CORE,
+ TEST_CPUS,
+ TEST_PLATFORM,
+ TEST_DEVICES,
+ TEST_FREEZER,
+ /* keep last */
+ __TEST_AFTER_LAST
+};
+
+#define TEST_FIRST TEST_NONE
+#define TEST_MAX (__TEST_AFTER_LAST - 1)
+
+extern int pm_test_level;
+
+#ifdef CONFIG_SUSPEND_FREEZER
+static inline int suspend_freeze_processes(void)
+{
+ return freeze_processes();
+}
+
+static inline void suspend_thaw_processes(void)
+{
+ thaw_processes();
+}
+#else
+static inline int suspend_freeze_processes(void)
+{
+ return 0;
+}
+
+static inline void suspend_thaw_processes(void)
+{
+}
+#endif
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
new file mode 100644
index 0000000..72016f0
--- /dev/null
+++ b/kernel/power/poweroff.c
@@ -0,0 +1,46 @@
+/*
+ * poweroff.c - sysrq handler to gracefully power down machine.
+ *
+ * This file is released under the GPL v2
+ */
+
+#include <linux/kernel.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/workqueue.h>
+#include <linux/reboot.h>
+#include <linux/cpumask.h>
+
+/*
+ * When the user hits Sys-Rq o to power down the machine, this is the
+ * callback we use.
+ */
+
+static void do_poweroff(struct work_struct *dummy)
+{
+ kernel_power_off();
+}
+
+static DECLARE_WORK(poweroff_work, do_poweroff);
+
+static void handle_poweroff(int key, struct tty_struct *tty)
+{
+ /* run sysrq poweroff on boot cpu */
+ schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
+}
+
+static struct sysrq_key_op sysrq_poweroff_op = {
+ .handler = handle_poweroff,
+ .help_msg = "powerOff",
+ .action_msg = "Power Off",
+ .enable_mask = SYSRQ_ENABLE_BOOT,
+};
+
+static int pm_sysrq_init(void)
+{
+ register_sysrq_key('o', &sysrq_poweroff_op);
+ return 0;
+}
+
+subsys_initcall(pm_sysrq_init);
diff --git a/kernel/power/process.c b/kernel/power/process.c
new file mode 100644
index 0000000..ca63401
--- /dev/null
+++ b/kernel/power/process.c
@@ -0,0 +1,154 @@
+/*
+ * kernel/power/process.c - Functions for starting/stopping processes on
+ * suspend transitions.
+ *
+ * Originally from swsusp.
+ */
+
+
+#undef DEBUG
+
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/freezer.h>
+
+/*
+ * Timeout for stopping processes
+ */
+#define TIMEOUT (20 * HZ)
+
+static inline int freezeable(struct task_struct * p)
+{
+ if ((p == current) ||
+ (p->flags & PF_NOFREEZE) ||
+ (p->exit_state != 0))
+ return 0;
+ return 1;
+}
+
+static int try_to_freeze_tasks(bool sig_only)
+{
+ struct task_struct *g, *p;
+ unsigned long end_time;
+ unsigned int todo;
+ struct timeval start, end;
+ u64 elapsed_csecs64;
+ unsigned int elapsed_csecs;
+
+ do_gettimeofday(&start);
+
+ end_time = jiffies + TIMEOUT;
+ do {
+ todo = 0;
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (frozen(p) || !freezeable(p))
+ continue;
+
+ if (!freeze_task(p, sig_only))
+ continue;
+
+ /*
+ * Now that we've done set_freeze_flag, don't
+ * perturb a task in TASK_STOPPED or TASK_TRACED.
+ * It is "frozen enough". If the task does wake
+ * up, it will immediately call try_to_freeze.
+ */
+ if (!task_is_stopped_or_traced(p) &&
+ !freezer_should_skip(p))
+ todo++;
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ yield(); /* Yield is okay here */
+ if (time_after(jiffies, end_time))
+ break;
+ } while (todo);
+
+ do_gettimeofday(&end);
+ elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
+ do_div(elapsed_csecs64, NSEC_PER_SEC / 100);
+ elapsed_csecs = elapsed_csecs64;
+
+ if (todo) {
+ /* This does not unfreeze processes that are already frozen
+ * (we have slightly ugly calling convention in that respect,
+ * and caller must call thaw_processes() if something fails),
+ * but it cleans up leftover PF_FREEZE requests.
+ */
+ printk("\n");
+ printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
+ "(%d tasks refusing to freeze):\n",
+ elapsed_csecs / 100, elapsed_csecs % 100, todo);
+ show_state();
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ task_lock(p);
+ if (freezing(p) && !freezer_should_skip(p))
+ printk(KERN_ERR " %s\n", p->comm);
+ cancel_freezing(p);
+ task_unlock(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ } else {
+ printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
+ elapsed_csecs % 100);
+ }
+
+ return todo ? -EBUSY : 0;
+}
+
+/**
+ * freeze_processes - tell processes to enter the refrigerator
+ */
+int freeze_processes(void)
+{
+ int error;
+
+ printk("Freezing user space processes ... ");
+ error = try_to_freeze_tasks(true);
+ if (error)
+ goto Exit;
+ printk("done.\n");
+
+ printk("Freezing remaining freezable tasks ... ");
+ error = try_to_freeze_tasks(false);
+ if (error)
+ goto Exit;
+ printk("done.");
+ Exit:
+ BUG_ON(in_atomic());
+ printk("\n");
+ return error;
+}
+
+static void thaw_tasks(bool nosig_only)
+{
+ struct task_struct *g, *p;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (!freezeable(p))
+ continue;
+
+ if (nosig_only && should_send_signal(p))
+ continue;
+
+ if (cgroup_frozen(p))
+ continue;
+
+ thaw_process(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+}
+
+void thaw_processes(void)
+{
+ printk("Restarting tasks ... ");
+ thaw_tasks(true);
+ thaw_tasks(false);
+ schedule();
+ printk("done.\n");
+}
+
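
For comparison with freezeable() above: kernel threads carry PF_NOFREEZE by default, so a thread that wants try_to_freeze_tasks(false) to catch it opts in with set_freezable() and polls try_to_freeze() in its loop. A minimal sketch with an invented thread function:

	#include <linux/freezer.h>
	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int example_thread_fn(void *unused)
	{
		set_freezable();	/* clear PF_NOFREEZE so the freezer sees us */

		while (!kthread_should_stop()) {
			try_to_freeze();	/* park in the refrigerator if asked */

			/* ... one unit of work ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}

	/* started elsewhere with: kthread_run(example_thread_fn, NULL, "example"); */
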
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
new file mode 100644
index 0000000..5d2ab83
--- /dev/null
+++ b/kernel/power/snapshot.c
@@ -0,0 +1,1974 @@
+/*
+ * linux/kernel/power/snapshot.c
+ *
+ * This file provides system snapshot/restore functionality for swsusp.
+ *
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#include "power.h"
+
+static int swsusp_page_is_free(struct page *);
+static void swsusp_set_page_forbidden(struct page *);
+static void swsusp_unset_page_forbidden(struct page *);
+
+/* List of PBEs needed for restoring the pages that were allocated before
+ * the suspend and included in the suspend image, but have also been
+ * allocated by the "resume" kernel, so their contents cannot be written
+ * directly to their "original" page frames.
+ */
+struct pbe *restore_pblist;
+
+/* Pointer to an auxiliary buffer (1 page) */
+static void *buffer;
+
+/**
+ * @safe_needed - on resume, for storing the PBE list and the image,
+ * we can only use memory pages that do not conflict with the pages
+ * used before suspend. The unsafe pages have PageNosaveFree set
+ * and we count them using unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree
+ * so that swsusp_free() can release it.
+ */
+
+#define PG_ANY 0
+#define PG_SAFE 1
+#define PG_UNSAFE_CLEAR 1
+#define PG_UNSAFE_KEEP 0
+
+static unsigned int allocated_unsafe_pages;
+
+static void *get_image_page(gfp_t gfp_mask, int safe_needed)
+{
+ void *res;
+
+ res = (void *)get_zeroed_page(gfp_mask);
+ if (safe_needed)
+ while (res && swsusp_page_is_free(virt_to_page(res))) {
+ /* The page is unsafe, mark it for swsusp_free() */
+ swsusp_set_page_forbidden(virt_to_page(res));
+ allocated_unsafe_pages++;
+ res = (void *)get_zeroed_page(gfp_mask);
+ }
+ if (res) {
+ swsusp_set_page_forbidden(virt_to_page(res));
+ swsusp_set_page_free(virt_to_page(res));
+ }
+ return res;
+}
+
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+ return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+}
+
+static struct page *alloc_image_page(gfp_t gfp_mask)
+{
+ struct page *page;
+
+ page = alloc_page(gfp_mask);
+ if (page) {
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
+ }
+ return page;
+}
+
+/**
+ * free_image_page - free page represented by @addr, allocated with
+ * get_image_page (page flags set by it must be cleared)
+ */
+
+static inline void free_image_page(void *addr, int clear_nosave_free)
+{
+ struct page *page;
+
+ BUG_ON(!virt_addr_valid(addr));
+
+ page = virt_to_page(addr);
+
+ swsusp_unset_page_forbidden(page);
+ if (clear_nosave_free)
+ swsusp_unset_page_free(page);
+
+ __free_page(page);
+}
+
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __attribute__((packed));
+
+static inline void
+free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+{
+ while (list) {
+ struct linked_page *lp = list->next;
+
+ free_image_page(list, clear_page_nosave);
+ list = lp;
+ }
+}
+
+/**
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
+
+struct chain_allocator {
+ struct linked_page *chain; /* the chain */
+ unsigned int used_space; /* total size of objects allocated out
+ * of the current page
+ */
+ gfp_t gfp_mask; /* mask for allocating pages */
+ int safe_needed; /* if set, only "safe" pages are allocated */
+};
+
+static void
+chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+{
+ ca->chain = NULL;
+ ca->used_space = LINKED_PAGE_DATA_SIZE;
+ ca->gfp_mask = gfp_mask;
+ ca->safe_needed = safe_needed;
+}
+
+static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
+{
+ void *ret;
+
+ if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
+ struct linked_page *lp;
+
+ lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+ if (!lp)
+ return NULL;
+
+ lp->next = ca->chain;
+ ca->chain = lp;
+ ca->used_space = 0;
+ }
+ ret = ca->chain->data + ca->used_space;
+ ca->used_space += size;
+ return ret;
+}
+
+static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
+{
+ free_list_of_pages(ca->chain, clear_page_nosave);
+ memset(ca, 0, sizeof(struct chain_allocator));
+}
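
A sketch of how the chain allocator is meant to be used (written as if inside this file, since the helpers are static): the example_node type is invented; memory_bm_create() below allocates its zone_bitmap and bm_block objects in the same pattern.

	struct example_node {
		struct example_node *next;
		int payload;
	};

	static int example_chain_usage(gfp_t gfp_mask, int safe_needed)
	{
		struct chain_allocator ca;
		struct example_node *node;

		chain_init(&ca, gfp_mask, safe_needed);

		node = chain_alloc(&ca, sizeof(*node));
		if (!node)
			return -ENOMEM;
		node->next = NULL;
		node->payload = 42;

		/* objects cannot be freed one by one; drop the whole chain */
		chain_free(&ca, PG_UNSAFE_CLEAR);
		return 0;
	}
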
+
+/**
+ * Data types related to memory bitmaps.
+ *
+ * A memory bitmap is a structure consisting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent the blocks of the bitmap in which the information is stored.
+ *
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
+ *
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with an arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
+ *
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * pfns that correspond to the start and end of the represented zone.
+ *
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap)
+ * It also contains the pfns that correspond to the start and end of
+ * the represented memory area.
+ */
+
+#define BM_END_OF_MAP (~0UL)
+
+#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
+
+struct bm_block {
+ struct bm_block *next; /* next element of the list */
+ unsigned long start_pfn; /* pfn represented by the first bit */
+ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
+ unsigned long *data; /* bitmap representing pages */
+};
+
+static inline unsigned long bm_block_bits(struct bm_block *bb)
+{
+ return bb->end_pfn - bb->start_pfn;
+}
+
+struct zone_bitmap {
+ struct zone_bitmap *next; /* next element of the list */
+ unsigned long start_pfn; /* minimal pfn in this zone */
+ unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
+ struct bm_block *bm_blocks; /* list of bitmap blocks */
+ struct bm_block *cur_block; /* recently used bitmap block */
+};
+
+/* struct bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+ struct zone_bitmap *zone_bm;
+ struct bm_block *block;
+ int bit;
+};
+
+struct memory_bitmap {
+ struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
+ struct linked_page *p_list; /* list of pages used to store zone
+ * bitmap objects and bitmap block
+ * objects
+ */
+ struct bm_position cur; /* most recently used bit position */
+};
+
+/* Functions that operate on memory bitmaps */
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ struct zone_bitmap *zone_bm;
+
+ zone_bm = bm->zone_bm_list;
+ bm->cur.zone_bm = zone_bm;
+ bm->cur.block = zone_bm->bm_blocks;
+ bm->cur.bit = 0;
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
+/**
+ * create_bm_block_list - create a list of block bitmap objects
+ */
+
+static inline struct bm_block *
+create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
+{
+ struct bm_block *bblist = NULL;
+
+ while (nr_blocks-- > 0) {
+ struct bm_block *bb;
+
+ bb = chain_alloc(ca, sizeof(struct bm_block));
+ if (!bb)
+ return NULL;
+
+ bb->next = bblist;
+ bblist = bb;
+ }
+ return bblist;
+}
+
+/**
+ * create_zone_bm_list - create a list of zone bitmap objects
+ */
+
+static inline struct zone_bitmap *
+create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
+{
+ struct zone_bitmap *zbmlist = NULL;
+
+ while (nr_zones-- > 0) {
+ struct zone_bitmap *zbm;
+
+ zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
+ if (!zbm)
+ return NULL;
+
+ zbm->next = zbmlist;
+ zbmlist = zbm;
+ }
+ return zbmlist;
+}
+
+/**
+ * memory_bm_create - allocate memory for a memory bitmap
+ */
+
+static int
+memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+{
+ struct chain_allocator ca;
+ struct zone *zone;
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+ unsigned int nr;
+
+ chain_init(&ca, gfp_mask, safe_needed);
+
+ /* Compute the number of zones */
+ nr = 0;
+ for_each_zone(zone)
+ if (populated_zone(zone))
+ nr++;
+
+ /* Allocate the list of zones bitmap objects */
+ zone_bm = create_zone_bm_list(nr, &ca);
+ bm->zone_bm_list = zone_bm;
+ if (!zone_bm) {
+ chain_free(&ca, PG_UNSAFE_CLEAR);
+ return -ENOMEM;
+ }
+
+ /* Initialize the zone bitmap objects */
+ for_each_zone(zone) {
+ unsigned long pfn;
+
+ if (!populated_zone(zone))
+ continue;
+
+ zone_bm->start_pfn = zone->zone_start_pfn;
+ zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ /* Allocate the list of bitmap block objects */
+ nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ bb = create_bm_block_list(nr, &ca);
+ zone_bm->bm_blocks = bb;
+ zone_bm->cur_block = bb;
+ if (!bb)
+ goto Free;
+
+ nr = zone->spanned_pages;
+ pfn = zone->zone_start_pfn;
+ /* Initialize the bitmap block objects */
+ while (bb) {
+ unsigned long *ptr;
+
+ ptr = get_image_page(gfp_mask, safe_needed);
+ bb->data = ptr;
+ if (!ptr)
+ goto Free;
+
+ bb->start_pfn = pfn;
+ if (nr >= BM_BITS_PER_BLOCK) {
+ pfn += BM_BITS_PER_BLOCK;
+ nr -= BM_BITS_PER_BLOCK;
+ } else {
+ /* This is executed only once in the loop */
+ pfn += nr;
+ }
+ bb->end_pfn = pfn;
+ bb = bb->next;
+ }
+ zone_bm = zone_bm->next;
+ }
+ bm->p_list = ca.chain;
+ memory_bm_position_reset(bm);
+ return 0;
+
+ Free:
+ bm->p_list = ca.chain;
+ memory_bm_free(bm, PG_UNSAFE_CLEAR);
+ return -ENOMEM;
+}
+
+/**
+ * memory_bm_free - free memory occupied by the memory bitmap @bm
+ */
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
+{
+ struct zone_bitmap *zone_bm;
+
+ /* Free the list of bit blocks for each zone_bitmap object */
+ zone_bm = bm->zone_bm_list;
+ while (zone_bm) {
+ struct bm_block *bb;
+
+ bb = zone_bm->bm_blocks;
+ while (bb) {
+ if (bb->data)
+ free_image_page(bb->data, clear_nosave_free);
+ bb = bb->next;
+ }
+ zone_bm = zone_bm->next;
+ }
+ free_list_of_pages(bm->p_list, clear_nosave_free);
+ bm->zone_bm_list = NULL;
+}
+
+/**
+ * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
+ * to the given pfn. The cur.zone_bm member of @bm and the cur_block
+ * member of the zone bitmap it points to are updated.
+ */
+
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+ void **addr, unsigned int *bit_nr)
+{
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+
+ /* Check if the pfn is from the current zone */
+ zone_bm = bm->cur.zone_bm;
+ if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+ zone_bm = bm->zone_bm_list;
+ /* We don't assume that the zones are sorted by pfns */
+ while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+ zone_bm = zone_bm->next;
+
+ if (!zone_bm)
+ return -EFAULT;
+ }
+ bm->cur.zone_bm = zone_bm;
+ }
+ /* Check if the pfn corresponds to the current bitmap block */
+ bb = zone_bm->cur_block;
+ if (pfn < bb->start_pfn)
+ bb = zone_bm->bm_blocks;
+
+ while (pfn >= bb->end_pfn) {
+ bb = bb->next;
+
+ BUG_ON(!bb);
+ }
+ zone_bm->cur_block = bb;
+ pfn -= bb->start_pfn;
+ *bit_nr = pfn;
+ *addr = bb->data;
+ return 0;
+}
+
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
+ set_bit(bit, addr);
+}
+
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ if (!error)
+ set_bit(bit, addr);
+ return error;
+}
+
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
+ clear_bit(bit, addr);
+}
+
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
+ return test_bit(bit, addr);
+}
+
+/**
+ * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
+ * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
+ * returned.
+ *
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function.
+ */
+
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
+{
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+ int bit;
+
+ do {
+ bb = bm->cur.block;
+ do {
+ bit = bm->cur.bit;
+ bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
+ if (bit < bm_block_bits(bb))
+ goto Return_pfn;
+
+ bb = bb->next;
+ bm->cur.block = bb;
+ bm->cur.bit = 0;
+ } while (bb);
+ zone_bm = bm->cur.zone_bm->next;
+ if (zone_bm) {
+ bm->cur.zone_bm = zone_bm;
+ bm->cur.block = zone_bm->bm_blocks;
+ bm->cur.bit = 0;
+ }
+ } while (zone_bm);
+ memory_bm_position_reset(bm);
+ return BM_END_OF_MAP;
+
+ Return_pfn:
+ bm->cur.bit = bit + 1;
+ return bb->start_pfn + bit;
+}
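+
+/*
+ * Typical (illustrative) use of the iterator above: reset the position
+ * first, then pull pfns until BM_END_OF_MAP is returned:
+ *
+ *	memory_bm_position_reset(bm);
+ *	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
+ *	     pfn = memory_bm_next_pfn(bm))
+ *		handle_pfn(pfn);
+ *
+ * (handle_pfn() is a hypothetical placeholder; copy_data_pages() and
+ * duplicate_memory_bitmap() below follow this pattern.)
+ */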
+
+/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
+
+/**
+ * register_nosave_region - register a range of page frames the contents
+ * of which should not be saved during the suspend (to be used in the early
+ * initialization code)
+ */
+
+void __init
+__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
+ int use_kmalloc)
+{
+ struct nosave_region *region;
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ if (!list_empty(&nosave_regions)) {
+ /* Try to extend the previous region (they should be sorted) */
+ region = list_entry(nosave_regions.prev,
+ struct nosave_region, list);
+ if (region->end_pfn == start_pfn) {
+ region->end_pfn = end_pfn;
+ goto Report;
+ }
+ }
+ if (use_kmalloc) {
+ /* during init, this shouldn't fail */
+ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
+ BUG_ON(!region);
+ } else
+ /* This allocation cannot fail */
+ region = alloc_bootmem_low(sizeof(struct nosave_region));
+ region->start_pfn = start_pfn;
+ region->end_pfn = end_pfn;
+ list_add_tail(&region->list, &nosave_regions);
+ Report:
+ printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n",
+ start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+/*
+ * Set bits in this map correspond to the page frames the contents of which
+ * should not be saved during the suspend.
+ */
+static struct memory_bitmap *forbidden_pages_map;
+
+/* Set bits in this map correspond to free page frames. */
+static struct memory_bitmap *free_pages_map;
+
+/*
+ * Each page frame allocated for creating the image is marked by setting the
+ * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
+ */
+
+void swsusp_set_page_free(struct page *page)
+{
+ if (free_pages_map)
+ memory_bm_set_bit(free_pages_map, page_to_pfn(page));
+}
+
+static int swsusp_page_is_free(struct page *page)
+{
+ return free_pages_map ?
+ memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
+}
+
+void swsusp_unset_page_free(struct page *page)
+{
+ if (free_pages_map)
+ memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
+}
+
+static void swsusp_set_page_forbidden(struct page *page)
+{
+ if (forbidden_pages_map)
+ memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
+}
+
+int swsusp_page_is_forbidden(struct page *page)
+{
+ return forbidden_pages_map ?
+ memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
+}
+
+static void swsusp_unset_page_forbidden(struct page *page)
+{
+ if (forbidden_pages_map)
+ memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
+}
+
+/**
+ * mark_nosave_pages - set, in the given bitmap, the bits corresponding
+ * to the page frames whose contents should not be saved.
+ */
+
+static void mark_nosave_pages(struct memory_bitmap *bm)
+{
+ struct nosave_region *region;
+
+ if (list_empty(&nosave_regions))
+ return;
+
+ list_for_each_entry(region, &nosave_regions, list) {
+ unsigned long pfn;
+
+ pr_debug("PM: Marking nosave pages: %016lx - %016lx\n",
+ region->start_pfn << PAGE_SHIFT,
+ region->end_pfn << PAGE_SHIFT);
+
+ for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
+ }
+}
+
+/**
+ * create_basic_memory_bitmaps - create bitmaps needed for marking page
+ * frames that should not be saved and free page frames. The pointers
+ * forbidden_pages_map and free_pages_map are only modified if everything
+ * goes well, because we don't want the bits to be used before both bitmaps
+ * are set up.
+ */
+
+int create_basic_memory_bitmaps(void)
+{
+ struct memory_bitmap *bm1, *bm2;
+ int error = 0;
+
+ BUG_ON(forbidden_pages_map || free_pages_map);
+
+ bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm1)
+ return -ENOMEM;
+
+ error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
+ if (error)
+ goto Free_first_object;
+
+ bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm2)
+ goto Free_first_bitmap;
+
+ error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
+ if (error)
+ goto Free_second_object;
+
+ forbidden_pages_map = bm1;
+ free_pages_map = bm2;
+ mark_nosave_pages(forbidden_pages_map);
+
+ pr_debug("PM: Basic memory bitmaps created\n");
+
+ return 0;
+
+ Free_second_object:
+ kfree(bm2);
+ Free_first_bitmap:
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ Free_first_object:
+ kfree(bm1);
+ return -ENOMEM;
+}
+
+/**
+ * free_basic_memory_bitmaps - free memory bitmaps allocated by
+ * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
+ * so that the bitmaps themselves are not referred to while they are being
+ * freed.
+ */
+
+void free_basic_memory_bitmaps(void)
+{
+ struct memory_bitmap *bm1, *bm2;
+
+ BUG_ON(!(forbidden_pages_map && free_pages_map));
+
+ bm1 = forbidden_pages_map;
+ bm2 = free_pages_map;
+ forbidden_pages_map = NULL;
+ free_pages_map = NULL;
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ kfree(bm1);
+ memory_bm_free(bm2, PG_UNSAFE_CLEAR);
+ kfree(bm2);
+
+ pr_debug("PM: Basic memory bitmaps freed\n");
+}
+
+/**
+ * snapshot_additional_pages - estimate the number of additional pages
+ * that will be needed for setting up the suspend image data structures
+ * for the given zone (usually the returned value is greater than the
+ * exact number)
+ */
+
+unsigned int snapshot_additional_pages(struct zone *zone)
+{
+ unsigned int res;
+
+ res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
+ return 2 * res;
+}
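+
+/*
+ * Worked example (illustrative, assuming 4 KiB pages and a ~32-byte
+ * struct bm_block): a zone spanning 1 GiB has 262144 page frames, which
+ * need 8 bitmap blocks; the block objects themselves fit in one extra
+ * page, so snapshot_additional_pages() returns 2 * (8 + 1) = 18 pages
+ * for the two bitmaps created by swsusp_alloc().
+ */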
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_free_highmem_pages - compute the total number of free highmem
+ * pages, system-wide.
+ */
+
+static unsigned int count_free_highmem_pages(void)
+{
+ struct zone *zone;
+ unsigned int cnt = 0;
+
+ for_each_zone(zone)
+ if (populated_zone(zone) && is_highmem(zone))
+ cnt += zone_page_state(zone, NR_FREE_PAGES);
+
+ return cnt;
+}
+
+/**
+ * saveable_highmem_page - Determine whether a highmem page should be
+ * included in the suspend image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't a part of a free chunk of pages.
+ */
+
+static struct page *saveable_highmem_page(unsigned long pfn)
+{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+
+ BUG_ON(!PageHighMem(page));
+
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
+ PageReserved(page))
+ return NULL;
+
+ return page;
+}
+
+/**
+ * count_highmem_pages - compute the total number of saveable highmem
+ * pages.
+ */
+
+unsigned int count_highmem_pages(void)
+{
+ struct zone *zone;
+ unsigned int n = 0;
+
+ for_each_zone(zone) {
+ unsigned long pfn, max_zone_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_highmem_page(pfn))
+ n++;
+ }
+ return n;
+}
+#else
+static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * saveable_page - Determine whether a non-highmem page should be included
+ * in the suspend image.
+ *
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't a part of
+ * a free chunk of pages.
+ */
+
+static struct page *saveable_page(unsigned long pfn)
+{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+
+ BUG_ON(PageHighMem(page));
+
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page)
+ && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
+ return NULL;
+
+ return page;
+}
+
+/**
+ * count_data_pages - compute the total number of saveable non-highmem
+ * pages.
+ */
+
+unsigned int count_data_pages(void)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+ unsigned int n = 0;
+
+ for_each_zone(zone) {
+ if (is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_page(pfn))
+ n++;
+ }
+ return n;
+}
+
+/* This is needed, because copy_page and memcpy are not usable for copying
+ * task structs.
+ */
+static inline void do_copy_page(long *dst, long *src)
+{
+ int n;
+
+ for (n = PAGE_SIZE / sizeof(long); n; n--)
+ *dst++ = *src++;
+}
+
+
+/**
+ * safe_copy_page - check if the page we are going to copy is marked as
+ * present in the kernel page tables (this always is the case if
+ * CONFIG_DEBUG_PAGEALLOC is not set and in that case
+ * kernel_page_present() always returns 'true').
+ */
+static void safe_copy_page(void *dst, struct page *s_page)
+{
+ if (kernel_page_present(s_page)) {
+ do_copy_page(dst, page_address(s_page));
+ } else {
+ kernel_map_pages(s_page, 1, 1);
+ do_copy_page(dst, page_address(s_page));
+ kernel_map_pages(s_page, 1, 0);
+ }
+}
+
+
+#ifdef CONFIG_HIGHMEM
+static inline struct page *
+page_is_saveable(struct zone *zone, unsigned long pfn)
+{
+ return is_highmem(zone) ?
+ saveable_highmem_page(pfn) : saveable_page(pfn);
+}
+
+static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+ struct page *s_page, *d_page;
+ void *src, *dst;
+
+ s_page = pfn_to_page(src_pfn);
+ d_page = pfn_to_page(dst_pfn);
+ if (PageHighMem(s_page)) {
+ src = kmap_atomic(s_page, KM_USER0);
+ dst = kmap_atomic(d_page, KM_USER1);
+ do_copy_page(dst, src);
+ kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ } else {
+ if (PageHighMem(d_page)) {
+ /* Page pointed to by src may contain some kernel
+ * data modified by kmap_atomic()
+ */
+ safe_copy_page(buffer, s_page);
+ dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
+ memcpy(dst, buffer, PAGE_SIZE);
+ kunmap_atomic(dst, KM_USER0);
+ } else {
+ safe_copy_page(page_address(d_page), s_page);
+ }
+ }
+}
+#else
+#define page_is_saveable(zone, pfn) saveable_page(pfn)
+
+static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+ safe_copy_page(page_address(pfn_to_page(dst_pfn)),
+ pfn_to_page(src_pfn));
+}
+#endif /* CONFIG_HIGHMEM */
+
+static void
+copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ for_each_zone(zone) {
+ unsigned long max_zone_pfn;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (page_is_saveable(zone, pfn))
+ memory_bm_set_bit(orig_bm, pfn);
+ }
+ memory_bm_position_reset(orig_bm);
+ memory_bm_position_reset(copy_bm);
+ for (;;) {
+ pfn = memory_bm_next_pfn(orig_bm);
+ if (unlikely(pfn == BM_END_OF_MAP))
+ break;
+ copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
+ }
+}
+
+/* Total number of image pages */
+static unsigned int nr_copy_pages;
+/* Number of pages needed for saving the original pfns of the image pages */
+static unsigned int nr_meta_pages;
+
+/**
+ * swsusp_free - free pages allocated for the suspend.
+ *
+ * Suspend pages are allocated before the atomic copy is made, so we
+ * need to release them after the resume.
+ */
+
+void swsusp_free(void)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+
+ for_each_zone(zone) {
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (swsusp_page_is_forbidden(page) &&
+ swsusp_page_is_free(page)) {
+ swsusp_unset_page_forbidden(page);
+ swsusp_unset_page_free(page);
+ __free_page(page);
+ }
+ }
+ }
+ nr_copy_pages = 0;
+ nr_meta_pages = 0;
+ restore_pblist = NULL;
+ buffer = NULL;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_pages_for_highmem - compute the number of non-highmem pages
+ * that will be necessary for creating copies of highmem pages.
+ */
+
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
+{
+ unsigned int free_highmem = count_free_highmem_pages();
+
+ if (free_highmem >= nr_highmem)
+ nr_highmem = 0;
+ else
+ nr_highmem -= free_highmem;
+
+ return nr_highmem;
+}
+#else
+static unsigned int
+count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * enough_free_mem - Make sure we have enough free memory for the
+ * snapshot image.
+ */
+
+static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
+{
+ struct zone *zone;
+ unsigned int free = 0, meta = 0;
+
+ for_each_zone(zone) {
+ meta += snapshot_additional_pages(zone);
+ if (!is_highmem(zone))
+ free += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ nr_pages += count_pages_for_highmem(nr_highmem);
+ pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, meta, free);
+
+ return free > nr_pages + PAGES_FOR_IO + meta;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * get_highmem_buffer - if there are some highmem pages in the suspend
+ * image, we may need the buffer to copy them and/or load their data.
+ */
+
+static inline int get_highmem_buffer(int safe_needed)
+{
+ buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ return buffer ? 0 : -ENOMEM;
+}
+
+/**
+ * alloc_highmem_image_pages - allocate some highmem pages for the image.
+ * Try to allocate as many pages as needed, but if the number of free
+ * highmem pages is smaller than that, allocate them all.
+ */
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+{
+ unsigned int to_alloc = count_free_highmem_pages();
+
+ if (to_alloc > nr_highmem)
+ to_alloc = nr_highmem;
+
+ nr_highmem -= to_alloc;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_image_page(__GFP_HIGHMEM);
+ memory_bm_set_bit(bm, page_to_pfn(page));
+ }
+ return nr_highmem;
+}
+#else
+static inline int get_highmem_buffer(int safe_needed) { return 0; }
+
+static inline unsigned int
+alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * swsusp_alloc - allocate memory for the suspend image
+ *
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
+ *
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
+ */
+
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
+{
+ int error;
+
+ error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+ if (error)
+ goto Free;
+
+ error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+ if (error)
+ goto Free;
+
+ if (nr_highmem > 0) {
+ error = get_highmem_buffer(PG_ANY);
+ if (error)
+ goto Free;
+
+ nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
+ }
+ while (nr_pages-- > 0) {
+ struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+
+ if (!page)
+ goto Free;
+
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
+ }
+ return 0;
+
+ Free:
+ swsusp_free();
+ return -ENOMEM;
+}
+
+/* Memory bitmap used for marking saveable pages (during suspend) or the
+ * suspend image pages (during resume)
+ */
+static struct memory_bitmap orig_bm;
+/* Memory bitmap used on suspend for marking allocated pages that will contain
+ * the copies of saveable pages. During resume it is initially used for
+ * marking the suspend image pages, but then its set bits are duplicated in
+ * @orig_bm and it is released. Next, on systems with high memory, it may be
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
+ */
+static struct memory_bitmap copy_bm;
+
+asmlinkage int swsusp_save(void)
+{
+ unsigned int nr_pages, nr_highmem;
+
+ printk(KERN_INFO "PM: Creating hibernation image: \n");
+
+ drain_local_pages(NULL);
+ nr_pages = count_data_pages();
+ nr_highmem = count_highmem_pages();
+ printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+
+ if (!enough_free_mem(nr_pages, nr_highmem)) {
+ printk(KERN_ERR "PM: Not enough free memory\n");
+ return -ENOMEM;
+ }
+
+ if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+ printk(KERN_ERR "PM: Memory allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /* While allocating the suspend pagedir, new cold pages may appear.
+ * Kill them.
+ */
+ drain_local_pages(NULL);
+ copy_data_pages(&copy_bm, &orig_bm);
+
+ /*
+ * End of critical section. From now on, we can write to memory,
+ * but we should not touch disk. This specially means we must _not_
+ * touch swap space! Except we must write out our image of course.
+ */
+
+ nr_pages += nr_highmem;
+ nr_copy_pages = nr_pages;
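+ /* Illustrative arithmetic: each meta page stores PAGE_SIZE /
+ * sizeof(long) pfns (512 with 4 KiB pages and 64-bit longs), so for
+ * example 100000 copied pages need 196 meta pages.
+ */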
+ nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
+
+ printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
+ nr_pages);
+
+ return 0;
+}
+
+#ifndef CONFIG_ARCH_HIBERNATION_HEADER
+static int init_header_complete(struct swsusp_info *info)
+{
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
+ info->version_code = LINUX_VERSION_CODE;
+ return 0;
+}
+
+static char *check_image_kernel(struct swsusp_info *info)
+{
+ if (info->version_code != LINUX_VERSION_CODE)
+ return "kernel version";
+ if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ return "system type";
+ if (strcmp(info->uts.release,init_utsname()->release))
+ return "kernel release";
+ if (strcmp(info->uts.version,init_utsname()->version))
+ return "version";
+ if (strcmp(info->uts.machine,init_utsname()->machine))
+ return "machine";
+ return NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+
+unsigned long snapshot_get_image_size(void)
+{
+ return nr_copy_pages + nr_meta_pages + 1;
+}
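+
+/*
+ * Illustrative example: an image with 100000 data pages and 196 meta
+ * pages reports 100197 here, i.e. the data pages plus the meta pages
+ * plus one page for the header.
+ */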
+
+static int init_header(struct swsusp_info *info)
+{
+ memset(info, 0, sizeof(struct swsusp_info));
+ info->num_physpages = num_physpages;
+ info->image_pages = nr_copy_pages;
+ info->pages = snapshot_get_image_size();
+ info->size = info->pages;
+ info->size <<= PAGE_SHIFT;
+ return init_header_complete(info);
+}
+
+/**
+ * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
+ * are stored in the array @buf[] (1 page at a time)
+ */
+
+static inline void
+pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+{
+ int j;
+
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ buf[j] = memory_bm_next_pfn(bm);
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
+ }
+}
+
+/**
+ * snapshot_read_next - used for reading the system memory snapshot.
+ *
+ * On the first call to it @handle should point to a zeroed
+ * snapshot_handle structure. The structure gets updated and a pointer
+ * to it should be passed to this function every next time.
+ *
+ * The @count parameter should contain the number of bytes the caller
+ * wants to read from the snapshot. It must not be zero.
+ *
+ * On success the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro. The number returned
+ * may be smaller than @count, but this only happens if the read would
+ * cross a page boundary otherwise.
+ *
+ * The function returns 0 to indicate the end of data stream condition,
+ * and a negative number is returned on error. In such cases the
+ * structure pointed to by @handle is not updated and should not be used
+ * any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+{
+ if (handle->cur > nr_meta_pages + nr_copy_pages)
+ return 0;
+
+ if (!buffer) {
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+ if (!buffer)
+ return -ENOMEM;
+ }
+ if (!handle->offset) {
+ int error;
+
+ error = init_header((struct swsusp_info *)buffer);
+ if (error)
+ return error;
+ handle->buffer = buffer;
+ memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&copy_bm);
+ }
+ if (handle->prev < handle->cur) {
+ if (handle->cur <= nr_meta_pages) {
+ memset(buffer, 0, PAGE_SIZE);
+ pack_pfns(buffer, &orig_bm);
+ } else {
+ struct page *page;
+
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
+ if (PageHighMem(page)) {
+ /* Highmem pages are copied to the buffer,
+ * because we can't return with a kmapped
+ * highmem page (we may not be called again).
+ */
+ void *kaddr;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(buffer, kaddr, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ handle->buffer = buffer;
+ } else {
+ handle->buffer = page_address(page);
+ }
+ }
+ handle->prev = handle->cur;
+ }
+ handle->buf_offset = handle->cur_offset;
+ if (handle->cur_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->cur_offset;
+ handle->cur_offset = 0;
+ handle->cur++;
+ } else {
+ handle->cur_offset += count;
+ }
+ handle->offset += count;
+ return count;
+}
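+
+/*
+ * Illustrative caller sketch (the real caller is save_image() in
+ * kernel/power/swap.c): request one page worth of data at a time and
+ * stop on 0 (end of image) or a negative error code:
+ *
+ *	while ((n = snapshot_read_next(&snapshot, PAGE_SIZE)) > 0)
+ *		write_out(data_of(snapshot), n);
+ *
+ * (write_out() is a hypothetical placeholder for whatever consumes the
+ * data, e.g. swap_write_page() in the swap writer.)
+ */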
+
+/**
+ * mark_unsafe_pages - mark the pages that cannot be used for storing
+ * the image during resume, because they conflict with the pages that
+ * had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct memory_bitmap *bm)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+
+ /* Clear page flags */
+ for_each_zone(zone) {
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn))
+ swsusp_unset_page_free(pfn_to_page(pfn));
+ }
+
+ /* Mark pages that correspond to the "original" pfns as "unsafe" */
+ memory_bm_position_reset(bm);
+ do {
+ pfn = memory_bm_next_pfn(bm);
+ if (likely(pfn != BM_END_OF_MAP)) {
+ if (likely(pfn_valid(pfn)))
+ swsusp_set_page_free(pfn_to_page(pfn));
+ else
+ return -EFAULT;
+ }
+ } while (pfn != BM_END_OF_MAP);
+
+ allocated_unsafe_pages = 0;
+
+ return 0;
+}
+
+static void
+duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
+{
+ unsigned long pfn;
+
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
+ }
+}
+
+static int check_header(struct swsusp_info *info)
+{
+ char *reason;
+
+ reason = check_image_kernel(info);
+ if (!reason && info->num_physpages != num_physpages)
+ reason = "memory size";
+ if (reason) {
+ printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+ return -EPERM;
+ }
+ return 0;
+}
+
+/**
+ * load_header - check the image header and copy the data from it
+ */
+
+static int
+load_header(struct swsusp_info *info)
+{
+ int error;
+
+ restore_pblist = NULL;
+ error = check_header(info);
+ if (!error) {
+ nr_copy_pages = info->image_pages;
+ nr_meta_pages = info->pages - info->image_pages - 1;
+ }
+ return error;
+}
+
+/**
+ * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
+ * the corresponding bit in the memory bitmap @bm
+ */
+
+static inline void
+unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+{
+ int j;
+
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
+
+ memory_bm_set_bit(bm, buf[j]);
+ }
+}
+
+/* List of "safe" pages that may be used to store data loaded from the suspend
+ * image
+ */
+static struct linked_page *safe_pages_list;
+
+#ifdef CONFIG_HIGHMEM
+/* struct highmem_pbe is used for creating the list of highmem pages that
+ * should be restored atomically during the resume from disk, because the page
+ * frames they have occupied before the suspend are in use.
+ */
+struct highmem_pbe {
+ struct page *copy_page; /* data is here now */
+ struct page *orig_page; /* data was here before the suspend */
+ struct highmem_pbe *next;
+};
+
+/* List of highmem PBEs needed for restoring the highmem pages that were
+ * allocated before the suspend and included in the suspend image, but have
+ * also been allocated by the "resume" kernel, so their contents cannot be
+ * written directly to their "original" page frames.
+ */
+static struct highmem_pbe *highmem_pblist;
+
+/**
+ * count_highmem_image_pages - compute the number of highmem pages in the
+ * suspend image. The bits in the memory bitmap @bm that correspond to the
+ * image pages are assumed to be set.
+ */
+
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
+{
+ unsigned long pfn;
+ unsigned int cnt = 0;
+
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (PageHighMem(pfn_to_page(pfn)))
+ cnt++;
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ return cnt;
+}
+
+/**
+ * prepare_highmem_image - try to allocate as many highmem pages as
+ * there are highmem image pages (@nr_highmem_p points to the variable
+ * containing the number of highmem image pages). The pages that are
+ * "safe" (ie. will not be overwritten when the suspend image is
+ * restored) have the corresponding bits set in @bm (it must be
+ * uninitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem
+ * image pages.
+ */
+
+static unsigned int safe_highmem_pages;
+
+static struct memory_bitmap *safe_highmem_bm;
+
+static int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ unsigned int to_alloc;
+
+ if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
+ return -ENOMEM;
+
+ if (get_highmem_buffer(PG_SAFE))
+ return -ENOMEM;
+
+ to_alloc = count_free_highmem_pages();
+ if (to_alloc > *nr_highmem_p)
+ to_alloc = *nr_highmem_p;
+ else
+ *nr_highmem_p = to_alloc;
+
+ safe_highmem_pages = 0;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_page(__GFP_HIGHMEM);
+ if (!swsusp_page_is_free(page)) {
+ /* The page is "safe", set its bit the bitmap */
+ memory_bm_set_bit(bm, page_to_pfn(page));
+ safe_highmem_pages++;
+ }
+ /* Mark the page as allocated */
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
+ }
+ memory_bm_position_reset(bm);
+ safe_highmem_bm = bm;
+ return 0;
+}
+
+/**
+ * get_highmem_page_buffer - for given highmem image page find the buffer
+ * that suspend_write_next() should set for its caller to write to.
+ *
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ struct highmem_pbe *pbe;
+ void *kaddr;
+
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
+ */
+ last_highmem_page = page;
+ return buffer;
+ }
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
+ */
+ pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+ if (!pbe) {
+ swsusp_free();
+ return NULL;
+ }
+ pbe->orig_page = page;
+ if (safe_highmem_pages > 0) {
+ struct page *tmp;
+
+ /* Copy of the page will be stored in high memory */
+ kaddr = buffer;
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+ safe_highmem_pages--;
+ last_highmem_page = tmp;
+ pbe->copy_page = tmp;
+ } else {
+ /* Copy of the page will be stored in normal memory */
+ kaddr = safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->copy_page = virt_to_page(kaddr);
+ }
+ pbe->next = highmem_pblist;
+ highmem_pblist = pbe;
+ return kaddr;
+}
+
+/**
+ * copy_last_highmem_page - copy the contents of a highmem image page
+ * from @buffer, where the caller of snapshot_write_next() has placed
+ * them, to the right location represented by @last_highmem_page.
+ */
+
+static void copy_last_highmem_page(void)
+{
+ if (last_highmem_page) {
+ void *dst;
+
+ dst = kmap_atomic(last_highmem_page, KM_USER0);
+ memcpy(dst, buffer, PAGE_SIZE);
+ kunmap_atomic(dst, KM_USER0);
+ last_highmem_page = NULL;
+ }
+}
+
+static inline int last_highmem_page_copied(void)
+{
+ return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+ if (safe_highmem_bm)
+ memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+ if (buffer)
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static inline int get_safe_write_buffer(void) { return 0; }
+
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ return NULL;
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * prepare_image - use the memory bitmap @bm to mark the pages that will
+ * be overwritten in the process of restoring the system memory state
+ * from the suspend image ("unsafe" pages) and allocate memory for the
+ * image.
+ *
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for the image data, but not to assign these
+ * pages to specific tasks initially. Instead, we just mark them as
+ * allocated and create a list of "safe" pages that will be used
+ * later. On systems with high memory a list of "safe" highmem pages is
+ * also created.
+ */
+
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+{
+ unsigned int nr_pages, nr_highmem;
+ struct linked_page *sp_list, *lp;
+ int error;
+
+ /* If there is no highmem, the buffer will not be necessary */
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+ buffer = NULL;
+
+ nr_highmem = count_highmem_image_pages(bm);
+ error = mark_unsafe_pages(bm);
+ if (error)
+ goto Free;
+
+ error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(new_bm, bm);
+ memory_bm_free(bm, PG_UNSAFE_KEEP);
+ if (nr_highmem > 0) {
+ error = prepare_highmem_image(bm, &nr_highmem);
+ if (error)
+ goto Free;
+ }
+ /* Reserve some safe pages for potential later use.
+ *
+ * NOTE: This way we make sure there will be enough safe pages for the
+ * chain_alloc() in get_buffer(). It is a bit wasteful, but
+ * nr_copy_pages cannot be greater than 50% of the memory anyway.
+ */
+ sp_list = NULL;
+ /* nr_copy_pages cannot be less than allocated_unsafe_pages */
+ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+ while (nr_pages > 0) {
+ lp = get_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!lp) {
+ error = -ENOMEM;
+ goto Free;
+ }
+ lp->next = sp_list;
+ sp_list = lp;
+ nr_pages--;
+ }
+ /* Preallocate memory for the image */
+ safe_pages_list = NULL;
+ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ while (nr_pages > 0) {
+ lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+ if (!lp) {
+ error = -ENOMEM;
+ goto Free;
+ }
+ if (!swsusp_page_is_free(virt_to_page(lp))) {
+ /* The page is "safe", add it to the list */
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
+ }
+ /* Mark the page as allocated */
+ swsusp_set_page_forbidden(virt_to_page(lp));
+ swsusp_set_page_free(virt_to_page(lp));
+ nr_pages--;
+ }
+ /* Free the reserved safe pages so that chain_alloc() can use them */
+ while (sp_list) {
+ lp = sp_list->next;
+ free_image_page(sp_list, PG_UNSAFE_CLEAR);
+ sp_list = lp;
+ }
+ return 0;
+
+ Free:
+ swsusp_free();
+ return error;
+}
+
+/**
+ * get_buffer - compute the address that snapshot_write_next() should
+ * set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
+{
+ struct pbe *pbe;
+ struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
+
+ if (PageHighMem(page))
+ return get_highmem_page_buffer(page, ca);
+
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
+ */
+ return page_address(page);
+
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
+ */
+ pbe = chain_alloc(ca, sizeof(struct pbe));
+ if (!pbe) {
+ swsusp_free();
+ return NULL;
+ }
+ pbe->orig_address = page_address(page);
+ pbe->address = safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->next = restore_pblist;
+ restore_pblist = pbe;
+ return pbe->address;
+}
+
+/**
+ * snapshot_write_next - used for writing the system memory snapshot.
+ *
+ * On the first call to it @handle should point to a zeroed
+ * snapshot_handle structure. The structure gets updated and a pointer
+ * to it should be passed to this function every next time.
+ *
+ * The @count parameter should contain the number of bytes the caller
+ * wants to write to the image. It must not be zero.
+ *
+ * On success the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro. The number returned
+ * may be smaller than @count, but this only happens if the write would
+ * cross a page boundary otherwise.
+ *
+ * The function returns 0 to indicate the "end of file" condition,
+ * and a negative number is returned on error. In such cases the
+ * structure pointed to by @handle is not updated and should not be used
+ * any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+{
+ static struct chain_allocator ca;
+ int error = 0;
+
+ /* Check if we have already loaded the entire image */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
+ return 0;
+
+ if (handle->offset == 0) {
+ if (!buffer)
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
+ if (!buffer)
+ return -ENOMEM;
+
+ handle->buffer = buffer;
+ }
+ handle->sync_read = 1;
+ if (handle->prev < handle->cur) {
+ if (handle->prev == 0) {
+ error = load_header(buffer);
+ if (error)
+ return error;
+
+ error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
+ if (error)
+ return error;
+
+ } else if (handle->prev <= nr_meta_pages) {
+ unpack_orig_pfns(buffer, &copy_bm);
+ if (handle->prev == nr_meta_pages) {
+ error = prepare_image(&orig_bm, &copy_bm);
+ if (error)
+ return error;
+
+ chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+ memory_bm_position_reset(&orig_bm);
+ restore_pblist = NULL;
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ handle->sync_read = 0;
+ if (!handle->buffer)
+ return -ENOMEM;
+ }
+ } else {
+ copy_last_highmem_page();
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ if (handle->buffer != buffer)
+ handle->sync_read = 0;
+ }
+ handle->prev = handle->cur;
+ }
+ handle->buf_offset = handle->cur_offset;
+ if (handle->cur_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->cur_offset;
+ handle->cur_offset = 0;
+ handle->cur++;
+ } else {
+ handle->cur_offset += count;
+ }
+ handle->offset += count;
+ return count;
+}
+
+/**
+ * snapshot_write_finalize - must be called after the last call to
+ * snapshot_write_next() in case the last page in the image happens
+ * to be a highmem page and its contents should be stored in the
+ * highmem. Additionally, it releases the memory that will not be
+ * used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+ copy_last_highmem_page();
+ /* Free only if we have loaded the image entirely */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
+ memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ free_highmem_data();
+ }
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+ return !(!nr_copy_pages || !last_highmem_page_copied() ||
+ handle->cur <= nr_meta_pages + nr_copy_pages);
+}
+
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+{
+ void *kaddr1, *kaddr2;
+
+ kaddr1 = kmap_atomic(p1, KM_USER0);
+ kaddr2 = kmap_atomic(p2, KM_USER1);
+ memcpy(buf, kaddr1, PAGE_SIZE);
+ memcpy(kaddr1, kaddr2, PAGE_SIZE);
+ memcpy(kaddr2, buf, PAGE_SIZE);
+ kunmap_atomic(kaddr1, KM_USER0);
+ kunmap_atomic(kaddr2, KM_USER1);
+}
+
+/**
+ * restore_highmem - for each highmem page that was allocated before
+ * the suspend and included in the suspend image, and also has been
+ * allocated by the "resume" kernel swap its current (ie. "before
+ * resume") contents with the previous (ie. "before suspend") one.
+ *
+ * If the resume eventually fails, we can call this function once
+ * again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+ struct highmem_pbe *pbe = highmem_pblist;
+ void *buf;
+
+ if (!pbe)
+ return 0;
+
+ buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!buf)
+ return -ENOMEM;
+
+ while (pbe) {
+ swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+ pbe = pbe->next;
+ }
+ free_image_page(buf, PG_UNSAFE_CLEAR);
+ return 0;
+}
+#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 0000000..6da1435
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,647 @@
+/*
+ * linux/kernel/power/swap.c
+ *
+ * This file provides functions for reading the suspend image from
+ * and writing it to a swap partition.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/genhd.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+
+#include "power.h"
+
+#define SWSUSP_SIG "S1SUSPEND"
+
+struct swsusp_header {
+ char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
+ sector_t image;
+ unsigned int flags; /* Flags to pass to the "boot" kernel */
+ char orig_sig[10];
+ char sig[10];
+} __attribute__((packed));
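+
+/*
+ * Layout note (illustrative): the reserved[] padding is sized so that
+ * the whole structure spans exactly one page, with the image sector,
+ * the flags and the two 10-byte signatures occupying the final bytes
+ * of the page, the same bytes in which mkswap stores the swap
+ * signature that mark_swapfiles() temporarily replaces.
+ */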
+
+static struct swsusp_header *swsusp_header;
+
+/*
+ * General things
+ */
+
+static unsigned short root_swap = 0xffff;
+static struct block_device *resume_bdev;
+
+/**
+ * submit - submit BIO request.
+ * @rw: READ or WRITE.
+ * @page_off: physical offset of the page.
+ * @page: page we're reading or writing.
+ * @bio_chain: list of pending bios (for asynchronous reading)
+ *
+ * Straight from the textbook - allocate and initialize the bio.
+ * If we're reading, make sure the page is marked as dirty.
+ * Then submit it and, if @bio_chain == NULL, wait.
+ */
+static int submit(int rw, pgoff_t page_off, struct page *page,
+ struct bio **bio_chain)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ if (!bio)
+ return -ENOMEM;
+ bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+ bio->bi_bdev = resume_bdev;
+ bio->bi_end_io = end_swap_bio_read;
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
+ page_off);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ lock_page(page);
+ bio_get(bio);
+
+ if (bio_chain == NULL) {
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ wait_on_page_locked(page);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ bio_put(bio);
+ } else {
+ if (rw == READ)
+ get_page(page); /* These pages are freed later */
+ bio->bi_private = *bio_chain;
+ *bio_chain = bio;
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ }
+ return 0;
+}
+
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+ return submit(READ, page_off, virt_to_page(addr), bio_chain);
+}
+
+static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+ return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
+}
+
+static int wait_on_bio_chain(struct bio **bio_chain)
+{
+ struct bio *bio;
+ struct bio *next_bio;
+ int ret = 0;
+
+ if (bio_chain == NULL)
+ return 0;
+
+ bio = *bio_chain;
+ if (bio == NULL)
+ return 0;
+ while (bio) {
+ struct page *page;
+
+ next_bio = bio->bi_private;
+ page = bio->bi_io_vec[0].bv_page;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page) || PageError(page))
+ ret = -EIO;
+ put_page(page);
+ bio_put(bio);
+ bio = next_bio;
+ }
+ *bio_chain = NULL;
+ return ret;
+}
+
+/*
+ * Saving part
+ */
+
+static int mark_swapfiles(sector_t start, unsigned int flags)
+{
+ int error;
+
+ bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
+ !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
+ memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
+ swsusp_header->image = start;
+ swsusp_header->flags = flags;
+ error = bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ printk(KERN_ERR "PM: Swap header not found!\n");
+ error = -ENODEV;
+ }
+ return error;
+}
+
+/**
+ * swsusp_swap_check - check if the resume device is a swap device
+ * and get its index (if so)
+ */
+
+static int swsusp_swap_check(void) /* This is called before saving image */
+{
+ int res;
+
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
+ &resume_bdev);
+ if (res < 0)
+ return res;
+
+ root_swap = res;
+ res = blkdev_get(resume_bdev, FMODE_WRITE);
+ if (res)
+ return res;
+
+ res = set_blocksize(resume_bdev, PAGE_SIZE);
+ if (res < 0)
+ blkdev_put(resume_bdev, FMODE_WRITE);
+
+ return res;
+}
+
+/**
+ * write_page - Write one page to given swap location.
+ * @buf: Address we're writing.
+ * @offset: Offset of the swap page we're writing to.
+ * @bio_chain: Link the next write BIO here
+ */
+
+static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+{
+ void *src;
+
+ if (!offset)
+ return -ENOSPC;
+
+ if (bio_chain) {
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ if (src) {
+ memcpy(src, buf, PAGE_SIZE);
+ } else {
+ WARN_ON_ONCE(1);
+ bio_chain = NULL; /* Go synchronous */
+ src = buf;
+ }
+ } else {
+ src = buf;
+ }
+ return bio_write_page(offset, src, bio_chain);
+}
+
+/*
+ * The swap map is a data structure used for keeping track of each page
+ * written to a swap partition. It consists of many swap_map_page
+ * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
+ * These structures are stored on the swap and linked together with the
+ * help of the .next_swap member.
+ *
+ * The swap map is created during suspend. The swap map pages are
+ * allocated and populated one at a time, so we only need one memory
+ * page to set up the entire structure.
+ *
+ * During resume we also only need to use one swap_map_page structure
+ * at a time.
+ */
+
+#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
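+
+/*
+ * Illustrative arithmetic: with 4 KiB pages and an 8-byte sector_t,
+ * MAP_PAGE_ENTRIES is 511, so one swap_map_page indexes 511 data pages
+ * (about 2 MiB of image) before chaining to the next map page through
+ * its next_swap field.
+ */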
+
+struct swap_map_page {
+ sector_t entries[MAP_PAGE_ENTRIES];
+ sector_t next_swap;
+};
+
+/**
+ * The swap_map_handle structure is used for handling swap in
+ * a file-like way
+ */
+
+struct swap_map_handle {
+ struct swap_map_page *cur;
+ sector_t cur_swap;
+ unsigned int k;
+};
+
+static void release_swap_writer(struct swap_map_handle *handle)
+{
+ if (handle->cur)
+ free_page((unsigned long)handle->cur);
+ handle->cur = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+ handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+ if (!handle->cur)
+ return -ENOMEM;
+ handle->cur_swap = alloc_swapdev_block(root_swap);
+ if (!handle->cur_swap) {
+ release_swap_writer(handle);
+ return -ENOSPC;
+ }
+ handle->k = 0;
+ return 0;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
+{
+ int error = 0;
+ sector_t offset;
+
+ if (!handle->cur)
+ return -EINVAL;
+ offset = alloc_swapdev_block(root_swap);
+ error = write_page(buf, offset, bio_chain);
+ if (error)
+ return error;
+ handle->cur->entries[handle->k++] = offset;
+ if (handle->k >= MAP_PAGE_ENTRIES) {
+ error = wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ offset = alloc_swapdev_block(root_swap);
+ if (!offset)
+ return -ENOSPC;
+ handle->cur->next_swap = offset;
+ error = write_page(handle->cur, handle->cur_swap, NULL);
+ if (error)
+ goto out;
+ memset(handle->cur, 0, PAGE_SIZE);
+ handle->cur_swap = offset;
+ handle->k = 0;
+ }
+ out:
+ return error;
+}
+
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+ if (handle->cur && handle->cur_swap)
+ return write_page(handle->cur, handle->cur_swap, NULL);
+ else
+ return -EINVAL;
+}
+
+/**
+ * save_image - save the suspend image data
+ */
+
+static int save_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_write)
+{
+ unsigned int m;
+ int ret;
+ int error = 0;
+ int nr_pages;
+ int err2;
+ struct bio *bio;
+ struct timeval start;
+ struct timeval stop;
+
+ printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
+ nr_to_write);
+ m = nr_to_write / 100;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ bio = NULL;
+ do_gettimeofday(&start);
+ do {
+ ret = snapshot_read_next(snapshot, PAGE_SIZE);
+ if (ret > 0) {
+ error = swap_write_page(handle, data_of(*snapshot),
+ &bio);
+ if (error)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
+ } while (ret > 0);
+ err2 = wait_on_bio_chain(&bio);
+ do_gettimeofday(&stop);
+ if (!error)
+ error = err2;
+ if (!error)
+ printk("\b\b\b\bdone\n");
+ swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ return error;
+}
+
+/**
+ * enough_swap - Make sure we have enough swap to save the image.
+ *
+ * Returns TRUE or FALSE after checking the total amount of swap
+ * space available from the resume partition.
+ */
+
+static int enough_swap(unsigned int nr_pages)
+{
+ unsigned int free_swap = count_swap_pages(root_swap, 1);
+
+ pr_debug("PM: Free swap pages: %u\n", free_swap);
+ return free_swap > nr_pages + PAGES_FOR_IO;
+}
+
+/**
+ * swsusp_write - Write entire image and metadata.
+ * @flags: flags to pass to the "boot" kernel in the image header
+ *
+ * It is important _NOT_ to umount filesystems at this point. We want
+ * them synced (in case something goes wrong) but we DO NOT want to mark
+ * the filesystem clean: it is not. (And it does not matter, if we resume
+ * correctly, we'll mark the system clean, anyway.)
+ */
+
+int swsusp_write(unsigned int flags)
+{
+ struct swap_map_handle handle;
+ struct snapshot_handle snapshot;
+ struct swsusp_info *header;
+ int error;
+
+ error = swsusp_swap_check();
+ if (error) {
+ printk(KERN_ERR "PM: Cannot find swap device, try "
+ "swapon -a.\n");
+ return error;
+ }
+ memset(&snapshot, 0, sizeof(struct snapshot_handle));
+ error = snapshot_read_next(&snapshot, PAGE_SIZE);
+ if (error < PAGE_SIZE) {
+ if (error >= 0)
+ error = -EFAULT;
+
+ goto out;
+ }
+ header = (struct swsusp_info *)data_of(snapshot);
+ if (!enough_swap(header->pages)) {
+ printk(KERN_ERR "PM: Not enough free swap\n");
+ error = -ENOSPC;
+ goto out;
+ }
+ error = get_swap_writer(&handle);
+ if (!error) {
+ sector_t start = handle.cur_swap;
+
+ error = swap_write_page(&handle, header, NULL);
+ if (!error)
+ error = save_image(&handle, &snapshot,
+ header->pages - 1);
+
+ if (!error) {
+ flush_swap_writer(&handle);
+ printk(KERN_INFO "PM: S");
+ error = mark_swapfiles(start, flags);
+ printk("|\n");
+ }
+ }
+ if (error)
+ free_all_swap_pages(root_swap);
+
+ release_swap_writer(&handle);
+ out:
+ swsusp_close(FMODE_WRITE);
+ return error;
+}
+
+/**
+ * The following functions allow us to read data using a swap map
+ * in a file-alike way
+ */
+
+static void release_swap_reader(struct swap_map_handle *handle)
+{
+ if (handle->cur)
+ free_page((unsigned long)handle->cur);
+ handle->cur = NULL;
+}
+
+static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
+{
+ int error;
+
+ if (!start)
+ return -EINVAL;
+
+ handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
+ if (!handle->cur)
+ return -ENOMEM;
+
+ error = bio_read_page(start, handle->cur, NULL);
+ if (error) {
+ release_swap_reader(handle);
+ return error;
+ }
+ handle->k = 0;
+ return 0;
+}
+
+static int swap_read_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
+{
+ sector_t offset;
+ int error;
+
+ if (!handle->cur)
+ return -EINVAL;
+ offset = handle->cur->entries[handle->k];
+ if (!offset)
+ return -EFAULT;
+ error = bio_read_page(offset, buf, bio_chain);
+ if (error)
+ return error;
+ if (++handle->k >= MAP_PAGE_ENTRIES) {
+ error = wait_on_bio_chain(bio_chain);
+ handle->k = 0;
+ offset = handle->cur->next_swap;
+ if (!offset)
+ release_swap_reader(handle);
+ else if (!error)
+ error = bio_read_page(offset, handle->cur, NULL);
+ }
+ return error;
+}
+
+/**
+ * load_image - load the image using the swap map handle
+ * @handle and the snapshot handle @snapshot
+ * (assume there are @nr_to_read pages to load)
+ */
+
+static int load_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_read)
+{
+ unsigned int m;
+ int error = 0;
+ struct timeval start;
+ struct timeval stop;
+ struct bio *bio;
+ int err2;
+ unsigned nr_pages;
+
+ printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
+ nr_to_read);
+ m = nr_to_read / 100;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ bio = NULL;
+ do_gettimeofday(&start);
+ for ( ; ; ) {
+ error = snapshot_write_next(snapshot, PAGE_SIZE);
+ if (error <= 0)
+ break;
+ error = swap_read_page(handle, data_of(*snapshot), &bio);
+ if (error)
+ break;
+ if (snapshot->sync_read)
+ error = wait_on_bio_chain(&bio);
+ if (error)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
+ err2 = wait_on_bio_chain(&bio);
+ do_gettimeofday(&stop);
+ if (!error)
+ error = err2;
+ if (!error) {
+ printk("\b\b\b\bdone\n");
+ snapshot_write_finalize(snapshot);
+ if (!snapshot_image_loaded(snapshot))
+ error = -ENODATA;
+ }
+ swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ return error;
+}
+
+/**
+ * swsusp_read - read the hibernation image.
+ * @flags_p: flags passed by the "frozen" kernel in the image header should
+ * be written into this memory location
+ */
+
+int swsusp_read(unsigned int *flags_p)
+{
+ int error;
+ struct swap_map_handle handle;
+ struct snapshot_handle snapshot;
+ struct swsusp_info *header;
+
+ *flags_p = swsusp_header->flags;
+ if (IS_ERR(resume_bdev)) {
+ pr_debug("PM: Image device not initialised\n");
+ return PTR_ERR(resume_bdev);
+ }
+
+ memset(&snapshot, 0, sizeof(struct snapshot_handle));
+ error = snapshot_write_next(&snapshot, PAGE_SIZE);
+ if (error < PAGE_SIZE)
+ return error < 0 ? error : -EFAULT;
+ header = (struct swsusp_info *)data_of(snapshot);
+ error = get_swap_reader(&handle, swsusp_header->image);
+ if (!error)
+ error = swap_read_page(&handle, header, NULL);
+ if (!error)
+ error = load_image(&handle, &snapshot, header->pages - 1);
+ release_swap_reader(&handle);
+
+ blkdev_put(resume_bdev, FMODE_READ);
+
+ if (!error)
+ pr_debug("PM: Image successfully loaded\n");
+ else
+ pr_debug("PM: Error %d resuming\n", error);
+ return error;
+}
+
+/**
+ * swsusp_check - Check for swsusp signature in the resume device
+ */
+
+int swsusp_check(void)
+{
+ int error;
+
+ resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+ if (!IS_ERR(resume_bdev)) {
+ set_blocksize(resume_bdev, PAGE_SIZE);
+ memset(swsusp_header, 0, PAGE_SIZE);
+ error = bio_read_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ if (error)
+ return error;
+
+ if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
+ /* Reset swap signature now */
+ error = bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ return -EINVAL;
+ }
+ if (error)
+ blkdev_put(resume_bdev, FMODE_READ);
+ else
+ pr_debug("PM: Signature found, resuming\n");
+ } else {
+ error = PTR_ERR(resume_bdev);
+ }
+
+ if (error)
+ pr_debug("PM: Error %d checking image file\n", error);
+
+ return error;
+}
+
+/**
+ * swsusp_close - close swap device.
+ */
+
+void swsusp_close(fmode_t mode)
+{
+ if (IS_ERR(resume_bdev)) {
+ pr_debug("PM: Image device not initialised\n");
+ return;
+ }
+
+ blkdev_put(resume_bdev, mode);
+}
+
+static int swsusp_header_init(void)
+{
+ swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
+ if (!swsusp_header)
+ panic("Could not allocate memory for swsusp_header\n");
+ return 0;
+}
+
+core_initcall(swsusp_header_init);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
new file mode 100644
index 0000000..023ff2a
--- /dev/null
+++ b/kernel/power/swsusp.c
@@ -0,0 +1,264 @@
+/*
+ * linux/kernel/power/swsusp.c
+ *
+ * This file provides code to write suspend image to swap and read it back.
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ * I'd like to thank the following people for their work:
+ *
+ * Pavel Machek <pavel@ucw.cz>:
+ * Modifications, pointing out defects, being with me at the very beginning,
+ * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
+ *
+ * Steve Doddi <dirk@loth.demon.co.uk>:
+ * Support the possibility of hardware state restoring.
+ *
+ * Raph <grey.havens@earthling.net>:
+ * Support for preserving states of network devices and virtual console
+ * (including X and svgatextmode)
+ *
+ * Kurt Garloff <garloff@suse.de>:
+ * Straightened the critical function in order to prevent compilers from
+ * playing tricks with local variables.
+ *
+ * Andreas Mohr <a.mohr@mailto.de>
+ *
+ * Alex Badea <vampire@go.ro>:
+ * Fixed runaway init
+ *
+ * Rafael J. Wysocki <rjw@sisk.pl>
+ * Reworked the freeing of memory and the handling of swap
+ *
+ * More state savers are welcome. Especially for the scsi layer...
+ *
+ * For TODOs and FIXMEs, also look in Documentation/power/swsusp.txt
+ */
+
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/swap.h>
+#include <linux/pm.h>
+#include <linux/swapops.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/rbtree.h>
+
+#include "power.h"
+
+/*
+ * Preferred image size in bytes (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N bytes, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned long image_size = 500 * 1024 * 1024;
+
+int in_suspend __nosavedata = 0;
+
+/**
+ * The following functions are used for tracking the allocated
+ * swap pages, so that they can be freed in case of an error.
+ */
+
+struct swsusp_extent {
+ struct rb_node node;
+ unsigned long start;
+ unsigned long end;
+};
+
+static struct rb_root swsusp_extents = RB_ROOT;
+
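+/*
+ * Record @swap_offset in the tree of allocated extents, merging it with
+ * an adjacent extent whenever possible.  For example, registering the
+ * offsets 5, 6 and 7 in turn leaves a single [5, 7] extent instead of
+ * three single-entry nodes.
+ */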
+static int swsusp_extents_insert(unsigned long swap_offset)
+{
+ struct rb_node **new = &(swsusp_extents.rb_node);
+ struct rb_node *parent = NULL;
+ struct swsusp_extent *ext;
+
+ /* Figure out where to put the new node */
+ while (*new) {
+ ext = container_of(*new, struct swsusp_extent, node);
+ parent = *new;
+ if (swap_offset < ext->start) {
+ /* Try to merge */
+ if (swap_offset == ext->start - 1) {
+ ext->start--;
+ return 0;
+ }
+ new = &((*new)->rb_left);
+ } else if (swap_offset > ext->end) {
+ /* Try to merge */
+ if (swap_offset == ext->end + 1) {
+ ext->end++;
+ return 0;
+ }
+ new = &((*new)->rb_right);
+ } else {
+ /* It already is in the tree */
+ return -EINVAL;
+ }
+ }
+ /* Add the new node and rebalance the tree. */
+ ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
+ if (!ext)
+ return -ENOMEM;
+
+ ext->start = swap_offset;
+ ext->end = swap_offset;
+ rb_link_node(&ext->node, parent, new);
+ rb_insert_color(&ext->node, &swsusp_extents);
+ return 0;
+}
+
+/**
+ * alloc_swapdev_block - allocate a swap page and register that it has
+ * been allocated, so that it can be freed in case of an error.
+ */
+
+sector_t alloc_swapdev_block(int swap)
+{
+ unsigned long offset;
+
+ offset = swp_offset(get_swap_page_of_type(swap));
+ if (offset) {
+ if (swsusp_extents_insert(offset))
+ swap_free(swp_entry(swap, offset));
+ else
+ return swapdev_block(swap, offset);
+ }
+ return 0;
+}
+
+/**
+ * free_all_swap_pages - free swap pages allocated for saving image data.
+ * It also frees the extents used to register which swap entries had been
+ * allocated.
+ */
+
+void free_all_swap_pages(int swap)
+{
+ struct rb_node *node;
+
+ while ((node = swsusp_extents.rb_node)) {
+ struct swsusp_extent *ext;
+ unsigned long offset;
+
+ ext = container_of(node, struct swsusp_extent, node);
+ rb_erase(node, &swsusp_extents);
+ for (offset = ext->start; offset <= ext->end; offset++)
+ swap_free(swp_entry(swap, offset));
+
+ kfree(ext);
+ }
+}
+
+int swsusp_swap_in_use(void)
+{
+ return (swsusp_extents.rb_node != NULL);
+}
+
+/**
+ * swsusp_show_speed - print the time elapsed between the two events
+ * represented by @start and @stop
+ *
+ * @nr_pages: number of pages processed between @start and @stop
+ * @msg: introductory message to print
+ */
+
+void swsusp_show_speed(struct timeval *start, struct timeval *stop,
+ unsigned nr_pages, char *msg)
+{
+ s64 elapsed_centisecs64;
+ int centisecs;
+ int k;
+ int kps;
+
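+	/* Convert the elapsed time from nanoseconds to centiseconds. */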
+ elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ centisecs = elapsed_centisecs64;
+ if (centisecs == 0)
+ centisecs = 1; /* avoid div-by-zero */
+ k = nr_pages * (PAGE_SIZE / 1024);
+ kps = (k * 100) / centisecs;
+ printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
+ msg, k,
+ centisecs / 100, centisecs % 100,
+ kps / 1000, (kps % 1000) / 10);
+}
+
+/**
+ * swsusp_shrink_memory - Try to free as much memory as needed
+ *
+ * ... but do not OOM-kill anyone
+ *
+ * Note: all userland tasks should be stopped before this is called, or
+ * livelock is possible.
+ */
+
+#define SHRINK_BITE 10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+ if (tmp > SHRINK_BITE)
+ tmp = SHRINK_BITE;
+ return shrink_all_memory(tmp);
+}
+
+int swsusp_shrink_memory(void)
+{
+ long tmp;
+ struct zone *zone;
+ unsigned long pages = 0;
+ unsigned int i = 0;
+ char *p = "-\\|/";
+ struct timeval start, stop;
+
+ printk(KERN_INFO "PM: Shrinking memory... ");
+ do_gettimeofday(&start);
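+	/*
+	 * Each pass estimates how many more pages must be freed to store
+	 * the image: data pages plus I/O, spare and per-zone snapshot
+	 * metadata pages, minus the pages that are already free.  Memory
+	 * is reclaimed in chunks of at most SHRINK_BITE pages, and
+	 * shrinking also continues while the estimated image size still
+	 * exceeds image_size.
+	 */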
+ do {
+ long size, highmem_size;
+
+ highmem_size = count_highmem_pages();
+ size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
+ tmp = size;
+ size += highmem_size;
+ for_each_zone (zone)
+ if (populated_zone(zone)) {
+ tmp += snapshot_additional_pages(zone);
+ if (is_highmem(zone)) {
+ highmem_size -=
+ zone_page_state(zone, NR_FREE_PAGES);
+ } else {
+ tmp -= zone_page_state(zone, NR_FREE_PAGES);
+ tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ }
+ }
+
+ if (highmem_size < 0)
+ highmem_size = 0;
+
+ tmp += highmem_size;
+ if (tmp > 0) {
+ tmp = __shrink_memory(tmp);
+ if (!tmp)
+ return -ENOMEM;
+ pages += tmp;
+ } else if (size > image_size / PAGE_SIZE) {
+ tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
+ pages += tmp;
+ }
+ printk("\b%c", p[i++%4]);
+ } while (tmp > 0);
+ do_gettimeofday(&stop);
+ printk("\bdone (%lu pages freed)\n", pages);
+ swsusp_show_speed(&start, &stop, pages, "Freed");
+
+ return 0;
+}
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 0000000..005b93d
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,443 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/smp_lock.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+/*
+ * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
+ * will be removed in the future. They are only preserved here for
+ * compatibility with existing userland utilities.
+ */
+#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
+#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
+
+#define PMOPS_PREPARE 1
+#define PMOPS_ENTER 2
+#define PMOPS_FINISH 3
+
+/*
+ * NOTE: The following ioctl definitions are wrong and have been replaced with
+ * correct ones. They are only preserved here for compatibility with existing
+ * userland utilities and will be removed in the future.
+ */
+#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
+#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
+#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
+#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
+
+
+#define SNAPSHOT_MINOR 231
+
+static struct snapshot_data {
+ struct snapshot_handle handle;
+ int swap;
+ int mode;
+ char frozen;
+ char ready;
+ char platform_support;
+} snapshot_state;
+
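+/* Only one process may have the snapshot device open at a time. */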
+atomic_t snapshot_device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+ int error;
+
+ mutex_lock(&pm_mutex);
+
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+
+ if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
+ atomic_inc(&snapshot_device_available);
+ error = -ENOSYS;
+ goto Unlock;
+ }
+	if (create_basic_memory_bitmaps()) {
+ atomic_inc(&snapshot_device_available);
+ error = -ENOMEM;
+ goto Unlock;
+ }
+ nonseekable_open(inode, filp);
+ data = &snapshot_state;
+ filp->private_data = data;
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+ data->swap = swsusp_resume_device ?
+ swap_type_of(swsusp_resume_device, 0, NULL) : -1;
+ data->mode = O_RDONLY;
+ error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ if (error)
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ } else {
+ data->swap = -1;
+ data->mode = O_WRONLY;
+ error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ if (error)
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ }
+ if (error)
+ atomic_inc(&snapshot_device_available);
+ data->frozen = 0;
+ data->ready = 0;
+ data->platform_support = 0;
+
+ Unlock:
+ mutex_unlock(&pm_mutex);
+
+ return error;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+
+ mutex_lock(&pm_mutex);
+
+ swsusp_free();
+ free_basic_memory_bitmaps();
+ data = filp->private_data;
+ free_all_swap_pages(data->swap);
+ if (data->frozen)
+ thaw_processes();
+ pm_notifier_call_chain(data->mode == O_WRONLY ?
+ PM_POST_HIBERNATION : PM_POST_RESTORE);
+ atomic_inc(&snapshot_device_available);
+
+ mutex_unlock(&pm_mutex);
+
+ return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+
+ mutex_lock(&pm_mutex);
+
+ data = filp->private_data;
+ if (!data->ready) {
+ res = -ENODATA;
+ goto Unlock;
+ }
+ res = snapshot_read_next(&data->handle, count);
+ if (res > 0) {
+ if (copy_to_user(buf, data_of(data->handle), res))
+ res = -EFAULT;
+ else
+ *offp = data->handle.offset;
+ }
+
+ Unlock:
+ mutex_unlock(&pm_mutex);
+
+ return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+
+ mutex_lock(&pm_mutex);
+
+ data = filp->private_data;
+ res = snapshot_write_next(&data->handle, count);
+ if (res > 0) {
+ if (copy_from_user(data_of(data->handle), buf, res))
+ res = -EFAULT;
+ else
+ *offp = data->handle.offset;
+ }
+
+ mutex_unlock(&pm_mutex);
+
+ return res;
+}
+
+static long snapshot_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ int error = 0;
+ struct snapshot_data *data;
+ loff_t size;
+ sector_t offset;
+
+ if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+ return -ENOTTY;
+ if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+ return -ENOTTY;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ data = filp->private_data;
+
+ switch (cmd) {
+
+ case SNAPSHOT_FREEZE:
+ if (data->frozen)
+ break;
+
+ printk("Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ error = usermodehelper_disable();
+ if (error)
+ break;
+
+ error = freeze_processes();
+ if (error) {
+ thaw_processes();
+ usermodehelper_enable();
+ }
+ if (!error)
+ data->frozen = 1;
+ break;
+
+ case SNAPSHOT_UNFREEZE:
+ if (!data->frozen || data->ready)
+ break;
+ thaw_processes();
+ usermodehelper_enable();
+ data->frozen = 0;
+ break;
+
+ case SNAPSHOT_CREATE_IMAGE:
+ case SNAPSHOT_ATOMIC_SNAPSHOT:
+ if (data->mode != O_RDONLY || !data->frozen || data->ready) {
+ error = -EPERM;
+ break;
+ }
+ error = hibernation_snapshot(data->platform_support);
+ if (!error)
+ error = put_user(in_suspend, (int __user *)arg);
+ if (!error)
+ data->ready = 1;
+ break;
+
+ case SNAPSHOT_ATOMIC_RESTORE:
+ snapshot_write_finalize(&data->handle);
+ if (data->mode != O_WRONLY || !data->frozen ||
+ !snapshot_image_loaded(&data->handle)) {
+ error = -EPERM;
+ break;
+ }
+ error = hibernation_restore(data->platform_support);
+ break;
+
+ case SNAPSHOT_FREE:
+ swsusp_free();
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ data->ready = 0;
+ break;
+
+ case SNAPSHOT_PREF_IMAGE_SIZE:
+ case SNAPSHOT_SET_IMAGE_SIZE:
+ image_size = arg;
+ break;
+
+ case SNAPSHOT_GET_IMAGE_SIZE:
+ if (!data->ready) {
+ error = -ENODATA;
+ break;
+ }
+ size = snapshot_get_image_size();
+ size <<= PAGE_SHIFT;
+ error = put_user(size, (loff_t __user *)arg);
+ break;
+
+ case SNAPSHOT_AVAIL_SWAP_SIZE:
+ case SNAPSHOT_AVAIL_SWAP:
+ size = count_swap_pages(data->swap, 1);
+ size <<= PAGE_SHIFT;
+ error = put_user(size, (loff_t __user *)arg);
+ break;
+
+ case SNAPSHOT_ALLOC_SWAP_PAGE:
+ case SNAPSHOT_GET_SWAP_PAGE:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ offset = alloc_swapdev_block(data->swap);
+ if (offset) {
+ offset <<= PAGE_SHIFT;
+ error = put_user(offset, (loff_t __user *)arg);
+ } else {
+ error = -ENOSPC;
+ }
+ break;
+
+ case SNAPSHOT_FREE_SWAP_PAGES:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ free_all_swap_pages(data->swap);
+ break;
+
+ case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
+ if (!swsusp_swap_in_use()) {
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ if (old_decode_dev(arg)) {
+ data->swap = swap_type_of(old_decode_dev(arg),
+ 0, NULL);
+ if (data->swap < 0)
+ error = -ENODEV;
+ } else {
+ data->swap = -1;
+ error = -EINVAL;
+ }
+ } else {
+ error = -EPERM;
+ }
+ break;
+
+ case SNAPSHOT_S2RAM:
+ if (!data->frozen) {
+ error = -EPERM;
+ break;
+ }
+ /*
+ * Tasks are frozen and the notifiers have been called with
+ * PM_HIBERNATION_PREPARE
+ */
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ break;
+
+ case SNAPSHOT_PLATFORM_SUPPORT:
+ data->platform_support = !!arg;
+ break;
+
+ case SNAPSHOT_POWER_OFF:
+ if (data->platform_support)
+ error = hibernation_platform_enter();
+ break;
+
+ case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
+ error = -EINVAL;
+
+ switch (arg) {
+
+ case PMOPS_PREPARE:
+ data->platform_support = 1;
+ error = 0;
+ break;
+
+ case PMOPS_ENTER:
+ if (data->platform_support)
+ error = hibernation_platform_enter();
+ break;
+
+ case PMOPS_FINISH:
+ if (data->platform_support)
+ error = 0;
+ break;
+
+ default:
+ printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
+
+ }
+ break;
+
+ case SNAPSHOT_SET_SWAP_AREA:
+ if (swsusp_swap_in_use()) {
+ error = -EPERM;
+ } else {
+ struct resume_swap_area swap_area;
+ dev_t swdev;
+
+ error = copy_from_user(&swap_area, (void __user *)arg,
+ sizeof(struct resume_swap_area));
+ if (error) {
+ error = -EFAULT;
+ break;
+ }
+
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ swdev = old_decode_dev(swap_area.dev);
+ if (swdev) {
+ offset = swap_area.offset;
+ data->swap = swap_type_of(swdev, offset, NULL);
+ if (data->swap < 0)
+ error = -ENODEV;
+ } else {
+ data->swap = -1;
+ error = -EINVAL;
+ }
+ }
+ break;
+
+ default:
+ error = -ENOTTY;
+
+ }
+
+ mutex_unlock(&pm_mutex);
+
+ return error;
+}
+
+static const struct file_operations snapshot_fops = {
+ .open = snapshot_open,
+ .release = snapshot_release,
+ .read = snapshot_read,
+ .write = snapshot_write,
+ .llseek = no_llseek,
+ .unlocked_ioctl = snapshot_ioctl,
+};
+
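+/*
+ * The interface is registered as a misc character device (typically
+ * /dev/snapshot); userspace hibernation tools are expected to drive
+ * the hibernation process through the ioctls above.
+ */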
+static struct miscdevice snapshot_device = {
+ .minor = SNAPSHOT_MINOR,
+ .name = "snapshot",
+ .fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+ return misc_register(&snapshot_device);
+}
+
+device_initcall(snapshot_device_init);
diff --git a/kernel/printk.c b/kernel/printk.c
new file mode 100644
index 0000000..b84eec7
--- /dev/null
+++ b/kernel/printk.c
@@ -0,0 +1,1299 @@
+/*
+ * linux/kernel/printk.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Modified to make sys_syslog() more flexible: added commands to
+ * return the last 4k of kernel messages, regardless of whether
+ * they've been read or not. Added option to suppress kernel printk's
+ * to the console. Added hook for sending the console messages
+ * elsewhere, in preparation for a serial line console (someday).
+ * Ted Ts'o, 2/11/93.
+ * Modified for sysctl support, 1/8/97, Chris Horn.
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
+ * manfred@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ * 01Mar01 Andrew Morton
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h> /* For in_interrupt() */
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Architectures can override it:
+ */
+void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
+{
+}
+
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+
+/* printk's without a loglevel use this.. */
+#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+
+/* We show everything that is MORE important than this.. */
+#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
+#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
+
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+
+int console_printk[4] = {
+ DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
+ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
+ MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
+ DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
+};
+
+/*
+ * Low level drivers may need that to know if they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+int oops_in_progress;
+EXPORT_SYMBOL(oops_in_progress);
+
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DECLARE_MUTEX(console_sem);
+static DECLARE_MUTEX(secondary_console_sem);
+struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track of whether we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it and are racing), but it helps track down those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console semaphore held.
+ */
+static int console_locked, console_suspended;
+
+/*
+ * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars.
+ * It is also used in interesting ways to provide interlocking in
+ * release_console_sem().
+ */
+static DEFINE_SPINLOCK(logbuf_lock);
+
+#define LOG_BUF_MASK (log_buf_len-1)
+#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+
+/*
+ * The indices into log_buf are not constrained to log_buf_len - they
+ * must be masked before subscripting
+ */
+static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
+static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
+static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
+
+/*
+ * Array of consoles built from command line options (console=)
+ */
+struct console_cmdline
+{
+ char name[8]; /* Name of the driver */
+ int index; /* Minor dev. to use */
+ char *options; /* Options for the driver */
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ char *brl_options; /* Options for braille driver */
+#endif
+};
+
+#define MAX_CMDLINECONSOLES 8
+
+static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+static int selected_console = -1;
+static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
+
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
+#ifdef CONFIG_PRINTK
+
+static char __log_buf[__LOG_BUF_LEN];
+static char *log_buf = __log_buf;
+static int log_buf_len = __LOG_BUF_LEN;
+static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+
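+/*
+ * Resize the log buffer from the "log_buf_len=" boot parameter, e.g.
+ * "log_buf_len=1M"; the requested size is rounded up to a power of two
+ * and the old contents are copied into the new buffer.
+ */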
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+ unsigned long flags;
+
+ if (size)
+ size = roundup_pow_of_two(size);
+ if (size > log_buf_len) {
+ unsigned start, dest_idx, offset;
+ char *new_log_buf;
+
+ new_log_buf = alloc_bootmem(size);
+ if (!new_log_buf) {
+ printk(KERN_WARNING "log_buf_len: allocation failed\n");
+ goto out;
+ }
+
+ spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = size;
+ log_buf = new_log_buf;
+
+ offset = start = min(con_start, log_start);
+ dest_idx = 0;
+ while (start != log_end) {
+ log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
+ start++;
+ dest_idx++;
+ }
+ log_start -= offset;
+ con_start -= offset;
+ log_end -= offset;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
+ }
+out:
+ return 1;
+}
+
+__setup("log_buf_len=", log_buf_len_setup);
+
+#ifdef CONFIG_BOOT_PRINTK_DELAY
+
+static unsigned int boot_delay; /* msecs delay after each printk during bootup */
+static unsigned long long printk_delay_msec; /* busy-wait loops per msec, derived from lpj */
+
+static int __init boot_delay_setup(char *str)
+{
+ unsigned long lpj;
+ unsigned long long loops_per_msec;
+
+ lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
+ loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
+
+ get_option(&str, &boot_delay);
+ if (boot_delay > 10 * 1000)
+ boot_delay = 0;
+
+ printk_delay_msec = loops_per_msec;
+ printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
+ "HZ: %d, printk_delay_msec: %llu\n",
+ boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
+ return 1;
+}
+__setup("boot_delay=", boot_delay_setup);
+
+static void boot_delay_msec(void)
+{
+ unsigned long long k;
+ unsigned long timeout;
+
+ if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ return;
+
+ k = (unsigned long long)printk_delay_msec * boot_delay;
+
+ timeout = jiffies + msecs_to_jiffies(boot_delay);
+ while (k) {
+ k--;
+ cpu_relax();
+ /*
+ * use (volatile) jiffies to prevent
+ * compiler reduction; loop termination via jiffies
+ * is secondary and may or may not happen.
+ */
+ if (time_after(jiffies, timeout))
+ break;
+ touch_nmi_watchdog();
+ }
+}
+#else
+static inline void boot_delay_msec(void)
+{
+}
+#endif
+
+/*
+ * Commands to do_syslog:
+ *
+ * 0 -- Close the log. Currently a NOP.
+ * 1 -- Open the log. Currently a NOP.
+ * 2 -- Read from the log.
+ * 3 -- Read all messages remaining in the ring buffer.
+ * 4 -- Read and clear all messages remaining in the ring buffer.
+ * 5 -- Clear ring buffer.
+ * 6 -- Disable printk's to console.
+ * 7 -- Enable printk's to console.
+ * 8 -- Set level of messages printed to console.
+ * 9 -- Return number of unread characters in the log buffer.
+ * 10 -- Return size of the log buffer.
+ */
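+/*
+ * Userspace normally reaches this through the syslog(2) system call
+ * (glibc's klogctl()); for example, type 3 with a buffer reads the whole
+ * ring buffer and type 8 with len set to a level adjusts the console
+ * loglevel.
+ */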
+int do_syslog(int type, char __user *buf, int len)
+{
+ unsigned i, j, limit, count;
+ int do_clear = 0;
+ char c;
+ int error = 0;
+
+ error = security_syslog(type);
+ if (error)
+ return error;
+
+ switch (type) {
+ case 0: /* Close log */
+ break;
+ case 1: /* Open log */
+ break;
+ case 2: /* Read from log */
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = wait_event_interruptible(log_wait,
+ (log_start - log_end));
+ if (error)
+ goto out;
+ i = 0;
+ spin_lock_irq(&logbuf_lock);
+ while (!error && (log_start != log_end) && i < len) {
+ c = LOG_BUF(log_start);
+ log_start++;
+ spin_unlock_irq(&logbuf_lock);
+ error = __put_user(c,buf);
+ buf++;
+ i++;
+ cond_resched();
+ spin_lock_irq(&logbuf_lock);
+ }
+ spin_unlock_irq(&logbuf_lock);
+ if (!error)
+ error = i;
+ break;
+ case 4: /* Read/clear last kernel messages */
+ do_clear = 1;
+ /* FALL THRU */
+ case 3: /* Read last kernel messages */
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ count = len;
+ if (count > log_buf_len)
+ count = log_buf_len;
+ spin_lock_irq(&logbuf_lock);
+ if (count > logged_chars)
+ count = logged_chars;
+ if (do_clear)
+ logged_chars = 0;
+ limit = log_end;
+ /*
+ * __put_user() could sleep, and while we sleep
+ * printk() could overwrite the messages
+ * we try to copy to user space. Therefore
+ * the messages are copied in reverse. <manfreds>
+ */
+ for (i = 0; i < count && !error; i++) {
+ j = limit-1-i;
+ if (j + log_buf_len < log_end)
+ break;
+ c = LOG_BUF(j);
+ spin_unlock_irq(&logbuf_lock);
+ error = __put_user(c,&buf[count-1-i]);
+ cond_resched();
+ spin_lock_irq(&logbuf_lock);
+ }
+ spin_unlock_irq(&logbuf_lock);
+ if (error)
+ break;
+ error = i;
+ if (i != count) {
+ int offset = count-error;
+ /* buffer overflow during copy, correct user buffer. */
+ for (i = 0; i < error; i++) {
+ if (__get_user(c,&buf[i+offset]) ||
+ __put_user(c,&buf[i])) {
+ error = -EFAULT;
+ break;
+ }
+ cond_resched();
+ }
+ }
+ break;
+ case 5: /* Clear ring buffer */
+ logged_chars = 0;
+ break;
+ case 6: /* Disable logging to console */
+ console_loglevel = minimum_console_loglevel;
+ break;
+ case 7: /* Enable logging to console */
+ console_loglevel = default_console_loglevel;
+ break;
+ case 8: /* Set level of messages printed to console */
+ error = -EINVAL;
+ if (len < 1 || len > 8)
+ goto out;
+ if (len < minimum_console_loglevel)
+ len = minimum_console_loglevel;
+ console_loglevel = len;
+ error = 0;
+ break;
+ case 9: /* Number of chars in the log buffer */
+ error = log_end - log_start;
+ break;
+ case 10: /* Size of the log buffer */
+ error = log_buf_len;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
+{
+ return do_syslog(type, buf, len);
+}
+
+/*
+ * Call the console drivers on a range of log_buf
+ */
+static void __call_console_drivers(unsigned start, unsigned end)
+{
+ struct console *con;
+
+ for (con = console_drivers; con; con = con->next) {
+ if ((con->flags & CON_ENABLED) && con->write &&
+ (cpu_online(smp_processor_id()) ||
+ (con->flags & CON_ANYTIME)))
+ con->write(con, &LOG_BUF(start), end - start);
+ }
+}
+
+static int __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+ ignore_loglevel = 1;
+ printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+ return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+
+/*
+ * Write out chars from start to end - 1 inclusive
+ */
+static void _call_console_drivers(unsigned start,
+ unsigned end, int msg_log_level)
+{
+ if ((msg_log_level < console_loglevel || ignore_loglevel) &&
+ console_drivers && start != end) {
+ if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
+ /* wrapped write */
+ __call_console_drivers(start & LOG_BUF_MASK,
+ log_buf_len);
+ __call_console_drivers(0, end & LOG_BUF_MASK);
+ } else {
+ __call_console_drivers(start, end);
+ }
+ }
+}
+
+/*
+ * Call the console drivers, asking them to write out
+ * log_buf[start] to log_buf[end - 1].
+ * The console_sem must be held.
+ */
+static void call_console_drivers(unsigned start, unsigned end)
+{
+ unsigned cur_index, start_print;
+ static int msg_level = -1;
+
+ BUG_ON(((int)(start - end)) > 0);
+
+ cur_index = start;
+ start_print = start;
+ while (cur_index != end) {
+ if (msg_level < 0 && ((end - cur_index) > 2) &&
+ LOG_BUF(cur_index + 0) == '<' &&
+ LOG_BUF(cur_index + 1) >= '0' &&
+ LOG_BUF(cur_index + 1) <= '7' &&
+ LOG_BUF(cur_index + 2) == '>') {
+ msg_level = LOG_BUF(cur_index + 1) - '0';
+ cur_index += 3;
+ start_print = cur_index;
+ }
+ while (cur_index != end) {
+ char c = LOG_BUF(cur_index);
+
+ cur_index++;
+ if (c == '\n') {
+ if (msg_level < 0) {
+ /*
+ * printk() has already given us loglevel tags in
+ * the buffer. This code is here in case the
+ * log buffer has wrapped right round and scribbled
+ * on those tags
+ */
+ msg_level = default_message_loglevel;
+ }
+ _call_console_drivers(start_print, cur_index, msg_level);
+ msg_level = -1;
+ start_print = cur_index;
+ break;
+ }
+ }
+ }
+ _call_console_drivers(start_print, end, msg_level);
+}
+
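+/*
+ * Append one character to the ring buffer.  The log indices only ever
+ * grow and are reduced modulo log_buf_len by LOG_BUF() on access, so
+ * the wrap-around handling below works on plain index differences.
+ */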
+static void emit_log_char(char c)
+{
+ LOG_BUF(log_end) = c;
+ log_end++;
+ if (log_end - log_start > log_buf_len)
+ log_start = log_end - log_buf_len;
+ if (log_end - con_start > log_buf_len)
+ con_start = log_end - log_buf_len;
+ if (logged_chars < log_buf_len)
+ logged_chars++;
+}
+
+/*
+ * Zap console related locks when oopsing. Only zap at most once
+ * every 30 seconds, to leave time for slow consoles to print a
+ * full oops.
+ */
+static void zap_locks(void)
+{
+ static unsigned long oops_timestamp;
+
+ if (time_after_eq(jiffies, oops_timestamp) &&
+ !time_after(jiffies, oops_timestamp + 30 * HZ))
+ return;
+
+ oops_timestamp = jiffies;
+
+ /* If a crash is occurring, make sure we can't deadlock */
+ spin_lock_init(&logbuf_lock);
+ /* And make sure that we print immediately */
+ init_MUTEX(&console_sem);
+}
+
+#if defined(CONFIG_PRINTK_TIME)
+static int printk_time = 1;
+#else
+static int printk_time = 0;
+#endif
+module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+
+/* Check if we have any console registered that can be called early in boot. */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for (con = console_drivers; con; con = con->next)
+ if (con->flags & CON_ANYTIME)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
+ * This is printk(). It can be called from any context. We want it to work.
+ *
+ * We try to grab the console_sem. If we succeed, it's easy - we log the output and
+ * call the console drivers. If we fail to get the semaphore we place the output
+ * into the log buffer and return. The current holder of the console_sem will
+ * notice the new output in release_console_sem() and will send it to the
+ * consoles before releasing the semaphore.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
+ */
+
+asmlinkage int printk(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk(fmt, args);
+ va_end(args);
+
+ return r;
+}
+
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int printk_cpu = UINT_MAX;
+
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
+ */
+static inline int can_use_console(unsigned int cpu)
+{
+ return cpu_online(cpu) || have_callable_console();
+}
+
+/*
+ * Try to get console ownership to actually show the kernel
+ * messages from a 'printk'. Return true (and with the
+ * console_semaphore held, and 'console_locked' set) if it
+ * is successful, false otherwise.
+ *
+ * This gets called with the 'logbuf_lock' spinlock held and
+ * interrupts disabled. It should return with 'logbuf_lock'
+ * released but interrupts still disabled.
+ */
+static int acquire_console_semaphore_for_printk(unsigned int cpu)
+{
+ int retval = 0;
+
+ if (!try_acquire_console_sem()) {
+ retval = 1;
+
+ /*
+ * If we can't use the console, we need to release
+ * the console semaphore by hand to avoid flushing
+ * the buffer. We need to hold the console semaphore
+ * in order to do this test safely.
+ */
+ if (!can_use_console(cpu)) {
+ console_locked = 0;
+ up(&console_sem);
+ retval = 0;
+ }
+ }
+ printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);
+ return retval;
+}
+static const char recursion_bug_msg[] =
+	KERN_CRIT "BUG: recent printk recursion!\n";
+static int recursion_bug;
+static int new_text_line = 1;
+static char printk_buf[1024];
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ int printed_len = 0;
+ int current_log_level = default_message_loglevel;
+ unsigned long flags;
+ int this_cpu;
+ char *p;
+
+ boot_delay_msec();
+
+ preempt_disable();
+ /* This stops the holder of console_sem just where we want him */
+ raw_local_irq_save(flags);
+ this_cpu = smp_processor_id();
+
+ /*
+ * Ouch, printk recursed into itself!
+ */
+ if (unlikely(printk_cpu == this_cpu)) {
+ /*
+ * If a crash is occurring during printk() on this CPU,
+ * then try to get the crash message out but make sure
+ * we can't deadlock. Otherwise just return to avoid the
+ * recursion and return - but flag the recursion so that
+ * it can be printed at the next appropriate moment:
+ */
+ if (!oops_in_progress) {
+ recursion_bug = 1;
+ goto out_restore_irqs;
+ }
+ zap_locks();
+ }
+
+ lockdep_off();
+ spin_lock(&logbuf_lock);
+ printk_cpu = this_cpu;
+
+ if (recursion_bug) {
+ recursion_bug = 0;
+ strcpy(printk_buf, recursion_bug_msg);
+		printed_len = strlen(recursion_bug_msg);
+ }
+ /* Emit the output into the temporary buffer */
+ printed_len += vscnprintf(printk_buf + printed_len,
+ sizeof(printk_buf) - printed_len, fmt, args);
+
+
+ /*
+ * Copy the output into log_buf. If the caller didn't provide
+ * appropriate log level tags, we insert them here
+ */
+ for (p = printk_buf; *p; p++) {
+ if (new_text_line) {
+ /* If a token, set current_log_level and skip over */
+ if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
+ p[2] == '>') {
+ current_log_level = p[1] - '0';
+ p += 3;
+ printed_len -= 3;
+ }
+
+ /* Always output the token */
+ emit_log_char('<');
+ emit_log_char(current_log_level + '0');
+ emit_log_char('>');
+ printed_len += 3;
+ new_text_line = 0;
+
+ if (printk_time) {
+ /* Follow the token with the time */
+ char tbuf[50], *tp;
+ unsigned tlen;
+ unsigned long long t;
+ unsigned long nanosec_rem;
+
+ t = cpu_clock(printk_cpu);
+ nanosec_rem = do_div(t, 1000000000);
+ tlen = sprintf(tbuf, "[%5lu.%06lu] ",
+ (unsigned long) t,
+ nanosec_rem / 1000);
+
+ for (tp = tbuf; tp < tbuf + tlen; tp++)
+ emit_log_char(*tp);
+ printed_len += tlen;
+ }
+
+ if (!*p)
+ break;
+ }
+
+ emit_log_char(*p);
+ if (*p == '\n')
+ new_text_line = 1;
+ }
+
+ /*
+ * Try to acquire and then immediately release the
+ * console semaphore. The release will do all the
+ * actual magic (print out buffers, wake up klogd,
+ * etc).
+ *
+ * The acquire_console_semaphore_for_printk() function
+ * will release 'logbuf_lock' regardless of whether it
+ * actually gets the semaphore or not.
+ */
+ if (acquire_console_semaphore_for_printk(this_cpu))
+ release_console_sem();
+
+ lockdep_on();
+out_restore_irqs:
+ raw_local_irq_restore(flags);
+
+ preempt_enable();
+ return printed_len;
+}
+EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(vprintk);
+
+#else
+
+static void call_console_drivers(unsigned start, unsigned end)
+{
+}
+
+#endif
+
+static int __add_preferred_console(char *name, int idx, char *options,
+ char *brl_options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ /*
+ * See if this tty is not yet registered, and
+ * if we have a slot free.
+ */
+ for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+ if (strcmp(console_cmdline[i].name, name) == 0 &&
+ console_cmdline[i].index == idx) {
+ if (!brl_options)
+ selected_console = i;
+ return 0;
+ }
+ if (i == MAX_CMDLINECONSOLES)
+ return -E2BIG;
+ if (!brl_options)
+ selected_console = i;
+ c = &console_cmdline[i];
+ strlcpy(c->name, name, sizeof(c->name));
+ c->options = options;
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ c->brl_options = brl_options;
+#endif
+ c->index = idx;
+ return 0;
+}
+/*
+ * Set up a list of consoles. Called from init/main.c
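+ * For example, "console=ttyS0,115200" is parsed below into the driver
+ * name "ttyS", index 0 and options "115200".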
+ */
+static int __init console_setup(char *str)
+{
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
+ char *s, *options, *brl_options = NULL;
+ int idx;
+
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ if (!memcmp(str, "brl,", 4)) {
+ brl_options = "";
+ str += 4;
+ } else if (!memcmp(str, "brl=", 4)) {
+ brl_options = str + 4;
+ str = strchr(brl_options, ',');
+ if (!str) {
+ printk(KERN_ERR "need port name after brl=\n");
+ return 1;
+ }
+ *(str++) = 0;
+ }
+#endif
+
+ /*
+ * Decode str into name, index, options.
+ */
+ if (str[0] >= '0' && str[0] <= '9') {
+ strcpy(buf, "ttyS");
+ strncpy(buf + 4, str, sizeof(buf) - 5);
+ } else {
+ strncpy(buf, str, sizeof(buf) - 1);
+ }
+ buf[sizeof(buf) - 1] = 0;
+ if ((options = strchr(str, ',')) != NULL)
+ *(options++) = 0;
+#ifdef __sparc__
+ if (!strcmp(str, "ttya"))
+ strcpy(buf, "ttyS0");
+ if (!strcmp(str, "ttyb"))
+ strcpy(buf, "ttyS1");
+#endif
+ for (s = buf; *s; s++)
+ if ((*s >= '0' && *s <= '9') || *s == ',')
+ break;
+ idx = simple_strtoul(s, NULL, 10);
+ *s = 0;
+
+ __add_preferred_console(buf, idx, options, brl_options);
+ console_set_on_cmdline = 1;
+ return 1;
+}
+__setup("console=", console_setup);
+
+/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ * @name: device name
+ * @idx: device index
+ * @options: options for this console
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init. Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (e.g. from PROM variables) when
+ * the user has not supplied one.
+ */
+int add_preferred_console(char *name, int idx, char *options)
+{
+ return __add_preferred_console(name, idx, options, NULL);
+}
+
+int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+ if (strcmp(console_cmdline[i].name, name) == 0 &&
+ console_cmdline[i].index == idx) {
+ c = &console_cmdline[i];
+ strlcpy(c->name, name_new, sizeof(c->name));
+ c->name[sizeof(c->name) - 1] = 0;
+ c->options = options;
+ c->index = idx_new;
+ return i;
+ }
+ /* not found */
+ return -1;
+}
+
+int console_suspend_enabled = 1;
+EXPORT_SYMBOL(console_suspend_enabled);
+
+static int __init console_suspend_disable(char *str)
+{
+ console_suspend_enabled = 0;
+ return 1;
+}
+__setup("no_console_suspend", console_suspend_disable);
+
+/**
+ * suspend_console - suspend the console subsystem
+ *
+ * This disables printk() while we go into suspend states
+ */
+void suspend_console(void)
+{
+ if (!console_suspend_enabled)
+ return;
+ printk("Suspending console(s) (use no_console_suspend to debug)\n");
+ acquire_console_sem();
+ console_suspended = 1;
+}
+
+void resume_console(void)
+{
+ if (!console_suspend_enabled)
+ return;
+ console_suspended = 0;
+ release_console_sem();
+}
+
+/**
+ * acquire_console_sem - lock the console system for exclusive use.
+ *
+ * Acquires a semaphore which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * Can sleep, returns nothing.
+ */
+void acquire_console_sem(void)
+{
+ BUG_ON(in_interrupt());
+ if (console_suspended) {
+ down(&secondary_console_sem);
+ return;
+ }
+ down(&console_sem);
+ console_locked = 1;
+ console_may_schedule = 1;
+}
+EXPORT_SYMBOL(acquire_console_sem);
+
+int try_acquire_console_sem(void)
+{
+ if (down_trylock(&console_sem))
+ return -1;
+ console_locked = 1;
+ console_may_schedule = 0;
+ return 0;
+}
+EXPORT_SYMBOL(try_acquire_console_sem);
+
+int is_console_locked(void)
+{
+ return console_locked;
+}
+
+static DEFINE_PER_CPU(int, printk_pending);
+
+void printk_tick(void)
+{
+ if (__get_cpu_var(printk_pending)) {
+ __get_cpu_var(printk_pending) = 0;
+ wake_up_interruptible(&log_wait);
+ }
+}
+
+int printk_needs_cpu(int cpu)
+{
+ return per_cpu(printk_pending, cpu);
+}
+
+void wake_up_klogd(void)
+{
+ if (waitqueue_active(&log_wait))
+ __raw_get_cpu_var(printk_pending) = 1;
+}
+
+/**
+ * release_console_sem - unlock the console system
+ *
+ * Releases the semaphore which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the semaphore was held, console output may have been buffered
+ * by printk(). If this is the case, release_console_sem() emits
+ * the output prior to releasing the semaphore.
+ *
+ * If there is output waiting for klogd, we wake it up.
+ *
+ * release_console_sem() may be called from any context.
+ */
+void release_console_sem(void)
+{
+ unsigned long flags;
+ unsigned _con_start, _log_end;
+ unsigned wake_klogd = 0;
+
+ if (console_suspended) {
+ up(&secondary_console_sem);
+ return;
+ }
+
+ console_may_schedule = 0;
+
+ for ( ; ; ) {
+ spin_lock_irqsave(&logbuf_lock, flags);
+ wake_klogd |= log_start - log_end;
+ if (con_start == log_end)
+ break; /* Nothing to print */
+ _con_start = con_start;
+ _log_end = log_end;
+ con_start = log_end; /* Flush */
+ spin_unlock(&logbuf_lock);
+ stop_critical_timings(); /* don't trace print latency */
+ call_console_drivers(_con_start, _log_end);
+ start_critical_timings();
+ local_irq_restore(flags);
+ }
+ console_locked = 0;
+ up(&console_sem);
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (wake_klogd)
+ wake_up_klogd();
+}
+EXPORT_SYMBOL(release_console_sem);
+
+/**
+ * console_conditional_schedule - yield the CPU if required
+ *
+ * If the console code is currently allowed to sleep, and
+ * if this CPU should yield the CPU to another task, do
+ * so here.
+ *
+ * Must be called within acquire_console_sem().
+ */
+void __sched console_conditional_schedule(void)
+{
+ if (console_may_schedule)
+ cond_resched();
+}
+EXPORT_SYMBOL(console_conditional_schedule);
+
+void console_print(const char *s)
+{
+ printk(KERN_EMERG "%s", s);
+}
+EXPORT_SYMBOL(console_print);
+
+void console_unblank(void)
+{
+ struct console *c;
+
+ /*
+ * console_unblank can no longer be called in interrupt context unless
+	 * oops_in_progress is set to 1.
+ */
+ if (oops_in_progress) {
+ if (down_trylock(&console_sem) != 0)
+ return;
+ } else
+ acquire_console_sem();
+
+ console_locked = 1;
+ console_may_schedule = 0;
+ for (c = console_drivers; c != NULL; c = c->next)
+ if ((c->flags & CON_ENABLED) && c->unblank)
+ c->unblank();
+ release_console_sem();
+}
+
+/*
+ * Return the console tty driver structure and its associated index
+ */
+struct tty_driver *console_device(int *index)
+{
+ struct console *c;
+ struct tty_driver *driver = NULL;
+
+ acquire_console_sem();
+ for (c = console_drivers; c != NULL; c = c->next) {
+ if (!c->device)
+ continue;
+ driver = c->device(c, index);
+ if (driver)
+ break;
+ }
+ release_console_sem();
+ return driver;
+}
+
+/*
+ * Prevent further output on the passed console device so that (for example)
+ * serial drivers can disable console output before suspending a port, and can
+ * re-enable output afterwards.
+ */
+void console_stop(struct console *console)
+{
+ acquire_console_sem();
+ console->flags &= ~CON_ENABLED;
+ release_console_sem();
+}
+EXPORT_SYMBOL(console_stop);
+
+void console_start(struct console *console)
+{
+ acquire_console_sem();
+ console->flags |= CON_ENABLED;
+ release_console_sem();
+}
+EXPORT_SYMBOL(console_start);
+
+/*
+ * The console driver calls this routine during kernel initialization
+ * to register the console printing procedure with printk() and to
+ * print any messages that were printed by the kernel before the
+ * console driver was initialized.
+ */
+void register_console(struct console *console)
+{
+ int i;
+ unsigned long flags;
+ struct console *bootconsole = NULL;
+
+ if (console_drivers) {
+ if (console->flags & CON_BOOT)
+ return;
+ if (console_drivers->flags & CON_BOOT)
+ bootconsole = console_drivers;
+ }
+
+ if (preferred_console < 0 || bootconsole || !console_drivers)
+ preferred_console = selected_console;
+
+ if (console->early_setup)
+ console->early_setup();
+
+ /*
+ * See if we want to use this console driver. If we
+ * didn't select a console we take the first one
+ * that registers here.
+ */
+ if (preferred_console < 0) {
+ if (console->index < 0)
+ console->index = 0;
+ if (console->setup == NULL ||
+ console->setup(console, NULL) == 0) {
+ console->flags |= CON_ENABLED;
+ if (console->device) {
+ console->flags |= CON_CONSDEV;
+ preferred_console = 0;
+ }
+ }
+ }
+
+ /*
+ * See if this console matches one we selected on
+ * the command line.
+ */
+ for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
+ i++) {
+ if (strcmp(console_cmdline[i].name, console->name) != 0)
+ continue;
+ if (console->index >= 0 &&
+ console->index != console_cmdline[i].index)
+ continue;
+ if (console->index < 0)
+ console->index = console_cmdline[i].index;
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ if (console_cmdline[i].brl_options) {
+ console->flags |= CON_BRL;
+ braille_register_console(console,
+ console_cmdline[i].index,
+ console_cmdline[i].options,
+ console_cmdline[i].brl_options);
+ return;
+ }
+#endif
+ if (console->setup &&
+ console->setup(console, console_cmdline[i].options) != 0)
+ break;
+ console->flags |= CON_ENABLED;
+ console->index = console_cmdline[i].index;
+ if (i == selected_console) {
+ console->flags |= CON_CONSDEV;
+ preferred_console = selected_console;
+ }
+ break;
+ }
+
+ if (!(console->flags & CON_ENABLED))
+ return;
+
+ if (bootconsole && (console->flags & CON_CONSDEV)) {
+ printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
+ bootconsole->name, bootconsole->index,
+ console->name, console->index);
+ unregister_console(bootconsole);
+ console->flags &= ~CON_PRINTBUFFER;
+ } else {
+ printk(KERN_INFO "console [%s%d] enabled\n",
+ console->name, console->index);
+ }
+
+ /*
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
+ */
+ acquire_console_sem();
+ if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
+ console->next = console_drivers;
+ console_drivers = console;
+ if (console->next)
+ console->next->flags &= ~CON_CONSDEV;
+ } else {
+ console->next = console_drivers->next;
+ console_drivers->next = console;
+ }
+ if (console->flags & CON_PRINTBUFFER) {
+ /*
+ * release_console_sem() will print out the buffered messages
+ * for us.
+ */
+ spin_lock_irqsave(&logbuf_lock, flags);
+ con_start = log_start;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+ }
+ release_console_sem();
+}
+EXPORT_SYMBOL(register_console);
+
+int unregister_console(struct console *console)
+{
+ struct console *a, *b;
+ int res = 1;
+
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ if (console->flags & CON_BRL)
+ return braille_unregister_console(console);
+#endif
+
+ acquire_console_sem();
+	if (console_drivers == console) {
+		console_drivers = console->next;
+		res = 0;
+	} else if (console_drivers) {
+		for (a = console_drivers->next, b = console_drivers;
+		     a; b = a, a = b->next) {
+ if (a == console) {
+ b->next = a->next;
+ res = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If this isn't the last console and it has CON_CONSDEV set, we
+ * need to set it on the next preferred console.
+ */
+ if (console_drivers != NULL && console->flags & CON_CONSDEV)
+ console_drivers->flags |= CON_CONSDEV;
+
+ release_console_sem();
+ return res;
+}
+EXPORT_SYMBOL(unregister_console);
+
+static int __init disable_boot_consoles(void)
+{
+ if (console_drivers != NULL) {
+ if (console_drivers->flags & CON_BOOT) {
+ printk(KERN_INFO "turn off boot console %s%d\n",
+ console_drivers->name, console_drivers->index);
+ return unregister_console(console_drivers);
+ }
+ }
+ return 0;
+}
+late_initcall(disable_boot_consoles);
+
+#if defined CONFIG_PRINTK
+
+/*
+ * printk rate limiting, lifted from the networking subsystem.
+ *
+ * This enforces a rate limit: not more than 10 kernel messages
+ * every 5s to make a denial-of-service attack impossible.
+ */
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
+
+int printk_ratelimit(void)
+{
+ return __ratelimit(&printk_ratelimit_state);
+}
+EXPORT_SYMBOL(printk_ratelimit);
+
+/**
+ * printk_timed_ratelimit - caller-controlled printk ratelimiting
+ * @caller_jiffies: pointer to caller's state
+ * @interval_msecs: minimum interval between prints
+ *
+ * printk_timed_ratelimit() returns true if more than @interval_msecs
+ * milliseconds have elapsed since the last time printk_timed_ratelimit()
+ * returned true.
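+ *
+ * For example, a caller can keep a static unsigned long and do
+ *
+ *	static unsigned long last;
+ *
+ *	if (printk_timed_ratelimit(&last, 1000))
+ *		printk(KERN_WARNING "something happened\n");
+ *
+ * to emit the message at most once per second.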
+ */
+bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+ unsigned int interval_msecs)
+{
+ if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
+ *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(printk_timed_ratelimit);
+#endif
diff --git a/kernel/profile.c b/kernel/profile.c
new file mode 100644
index 0000000..dc41827
--- /dev/null
+++ b/kernel/profile.c
@@ -0,0 +1,616 @@
+/*
+ * linux/kernel/profile.c
+ * Simple profiling. Manages a direct-mapped profile hit count buffer,
+ * with configurable resolution, support for restricting the cpus on
+ * which profiling is done, and switching between cpu time and
+ * schedule() calls via kernel command line parameters passed at boot.
+ *
+ * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
+ * Red Hat, July 2004
+ * Consolidation of architecture support code for profiling,
+ * William Irwin, Oracle, July 2004
+ * Amortized hit count accounting via per-cpu open-addressed hashtables
+ * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
+ */
+
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/sections.h>
+#include <asm/irq_regs.h>
+#include <asm/ptrace.h>
+
+struct profile_hit {
+ u32 pc, hits;
+};
+#define PROFILE_GRPSHIFT 3
+#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
+#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
+#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
+
+/* Oprofile timer tick hook */
+static int (*timer_hook)(struct pt_regs *) __read_mostly;
+
+static atomic_t *prof_buffer;
+static unsigned long prof_len, prof_shift;
+
+int prof_on __read_mostly;
+EXPORT_SYMBOL_GPL(prof_on);
+
+static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
+static DEFINE_PER_CPU(int, cpu_profile_flip);
+static DEFINE_MUTEX(profile_flip_mutex);
+#endif /* CONFIG_SMP */
+
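+/*
+ * Parse the "profile=" boot parameter, e.g. "profile=2" for CPU-time
+ * profiling with a shift of 2, or "profile=schedule,2", "profile=sleep,2"
+ * (CONFIG_SCHEDSTATS only) and "profile=kvm,2" for the variants handled
+ * below.
+ */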
+int profile_setup(char *str)
+{
+ static char schedstr[] = "schedule";
+ static char sleepstr[] = "sleep";
+ static char kvmstr[] = "kvm";
+ int par;
+
+ if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+#ifdef CONFIG_SCHEDSTATS
+ prof_on = SLEEP_PROFILING;
+ if (str[strlen(sleepstr)] == ',')
+ str += strlen(sleepstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ printk(KERN_INFO
+ "kernel sleep profiling enabled (shift: %ld)\n",
+ prof_shift);
+#else
+ printk(KERN_WARNING
+ "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
+#endif /* CONFIG_SCHEDSTATS */
+ } else if (!strncmp(str, schedstr, strlen(schedstr))) {
+ prof_on = SCHED_PROFILING;
+ if (str[strlen(schedstr)] == ',')
+ str += strlen(schedstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ printk(KERN_INFO
+ "kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
+ prof_on = KVM_PROFILING;
+ if (str[strlen(kvmstr)] == ',')
+ str += strlen(kvmstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ printk(KERN_INFO
+ "kernel KVM profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (get_option(&str, &par)) {
+ prof_shift = par;
+ prof_on = CPU_PROFILING;
+ printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
+ prof_shift);
+ }
+ return 1;
+}
+__setup("profile=", profile_setup);
+
+
+int __ref profile_init(void)
+{
+ int buffer_bytes;
+ if (!prof_on)
+ return 0;
+
+ /* only text is profiled */
+ prof_len = (_etext - _stext) >> prof_shift;
+ buffer_bytes = prof_len*sizeof(atomic_t);
+ if (!slab_is_available()) {
+ prof_buffer = alloc_bootmem(buffer_bytes);
+ return 0;
+ }
+
+ prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
+ if (prof_buffer)
+ return 0;
+
+ prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
+ if (prof_buffer)
+ return 0;
+
+ prof_buffer = vmalloc(buffer_bytes);
+ if (prof_buffer)
+ return 0;
+
+ return -ENOMEM;
+}
+
+/* Profile event notifications */
+
+static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
+
+void profile_task_exit(struct task_struct *task)
+{
+ blocking_notifier_call_chain(&task_exit_notifier, 0, task);
+}
+
+int profile_handoff_task(struct task_struct *task)
+{
+ int ret;
+ ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
+ return (ret == NOTIFY_OK) ? 1 : 0;
+}
+
+void profile_munmap(unsigned long addr)
+{
+ blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
+}
+
+int task_handoff_register(struct notifier_block *n)
+{
+ return atomic_notifier_chain_register(&task_free_notifier, n);
+}
+EXPORT_SYMBOL_GPL(task_handoff_register);
+
+int task_handoff_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&task_free_notifier, n);
+}
+EXPORT_SYMBOL_GPL(task_handoff_unregister);
+
+int profile_event_register(enum profile_type type, struct notifier_block *n)
+{
+ int err = -EINVAL;
+
+ switch (type) {
+ case PROFILE_TASK_EXIT:
+ err = blocking_notifier_chain_register(
+ &task_exit_notifier, n);
+ break;
+ case PROFILE_MUNMAP:
+ err = blocking_notifier_chain_register(
+ &munmap_notifier, n);
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(profile_event_register);
+
+int profile_event_unregister(enum profile_type type, struct notifier_block *n)
+{
+ int err = -EINVAL;
+
+ switch (type) {
+ case PROFILE_TASK_EXIT:
+ err = blocking_notifier_chain_unregister(
+ &task_exit_notifier, n);
+ break;
+ case PROFILE_MUNMAP:
+ err = blocking_notifier_chain_unregister(
+ &munmap_notifier, n);
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(profile_event_unregister);
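+
+/*
+ * Illustrative usage sketch for the registration helpers above; the
+ * callback and its name are hypothetical, not part of this file:
+ *
+ *	static int my_exit_notify(struct notifier_block *nb,
+ *				  unsigned long val, void *data)
+ *	{
+ *		struct task_struct *task = data;
+ *		// inspect 'task' as it exits
+ *		return NOTIFY_OK;
+ *	}
+ *
+ *	static struct notifier_block my_exit_nb = {
+ *		.notifier_call = my_exit_notify,
+ *	};
+ *
+ *	profile_event_register(PROFILE_TASK_EXIT, &my_exit_nb);
+ *	...
+ *	profile_event_unregister(PROFILE_TASK_EXIT, &my_exit_nb);
+ */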
+
+int register_timer_hook(int (*hook)(struct pt_regs *))
+{
+ if (timer_hook)
+ return -EBUSY;
+ timer_hook = hook;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_timer_hook);
+
+void unregister_timer_hook(int (*hook)(struct pt_regs *))
+{
+ WARN_ON(hook != timer_hook);
+ timer_hook = NULL;
+ /* make sure all CPUs see the NULL hook */
+ synchronize_sched(); /* Allow ongoing interrupts to complete. */
+}
+EXPORT_SYMBOL_GPL(unregister_timer_hook);
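+
+/*
+ * Illustrative sketch of a timer-hook client (hypothetical names, in the
+ * style of what oprofile does); the hook runs from profile_tick() on each
+ * profiling timer tick until it is unregistered:
+ *
+ *	static int my_timer_notify(struct pt_regs *regs)
+ *	{
+ *		// record a sample based on 'regs'
+ *		return 0;
+ *	}
+ *
+ *	if (register_timer_hook(my_timer_notify))
+ *		return -EBUSY;	// another hook is already registered
+ *	...
+ *	unregister_timer_hook(my_timer_notify);
+ */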
+
+
+#ifdef CONFIG_SMP
+/*
+ * Each cpu has a pair of open-addressed hashtables for pending
+ * profile hits. read_profile() IPIs all cpus to request that they
+ * flip buffers, then flushes their contents to prof_buffer itself.
+ * Flip requests are serialized by the profile_flip_mutex. The sole
+ * purpose of the second hashtable is to avoid the cacheline
+ * contention that the flushes of pending profile hits (required for
+ * the accuracy of reported profile hits) would otherwise cause,
+ * which would resurrect the interrupt livelock issue.
+ *
+ * The open-addressed hashtables are indexed by profile buffer slot
+ * and hold the number of pending hits to that profile buffer slot on
+ * a cpu in an entry. When the hashtable overflows, all pending hits
+ * are accounted to their corresponding profile buffer slots with
+ * atomic_add() and the hashtable emptied. As numerous pending hits
+ * may be accounted to a profile buffer slot in a hashtable entry,
+ * this amortizes a number of atomic profile buffer increments likely
+ * to be far larger than the number of entries in the hashtable,
+ * particularly given that the number of distinct profile buffer
+ * positions to which hits are accounted during short intervals (e.g.
+ * several seconds) is usually very small. Exclusion from buffer
+ * flipping is provided by interrupt disablement (note that for
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
+ * The hash function is meant to be lightweight as opposed to strong,
+ * and was vaguely inspired by ppc64 firmware-supported inverted
+ * pagetable hash functions, but uses a full table of finite
+ * collision chains, not just pairs of them.
+ *
+ * -- wli
+ */
+static void __profile_flip_buffers(void *unused)
+{
+ int cpu = smp_processor_id();
+
+ per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
+}
+
+static void profile_flip_buffers(void)
+{
+ int i, j, cpu;
+
+ mutex_lock(&profile_flip_mutex);
+ j = per_cpu(cpu_profile_flip, get_cpu());
+ put_cpu();
+ on_each_cpu(__profile_flip_buffers, NULL, 1);
+ for_each_online_cpu(cpu) {
+ struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
+ for (i = 0; i < NR_PROFILE_HIT; ++i) {
+ if (!hits[i].hits) {
+ if (hits[i].pc)
+ hits[i].pc = 0;
+ continue;
+ }
+ atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+ hits[i].hits = hits[i].pc = 0;
+ }
+ }
+ mutex_unlock(&profile_flip_mutex);
+}
+
+static void profile_discard_flip_buffers(void)
+{
+ int i, cpu;
+
+ mutex_lock(&profile_flip_mutex);
+ i = per_cpu(cpu_profile_flip, get_cpu());
+ put_cpu();
+ on_each_cpu(__profile_flip_buffers, NULL, 1);
+ for_each_online_cpu(cpu) {
+ struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
+ memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
+ }
+ mutex_unlock(&profile_flip_mutex);
+}
+
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
+ int i, j, cpu;
+ struct profile_hit *hits;
+
+ if (prof_on != type || !prof_buffer)
+ return;
+ pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
+ i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+ secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+ cpu = get_cpu();
+ hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
+ if (!hits) {
+ put_cpu();
+ return;
+ }
+ /*
+	 * We buffer pending hits for the global profile buffer in a
+	 * per-CPU queue and thus reduce the number of global (and
+	 * possibly NUMA-alien) accesses. The write-queue is self-coalescing:
+ */
+ local_irq_save(flags);
+ do {
+ for (j = 0; j < PROFILE_GRPSZ; ++j) {
+ if (hits[i + j].pc == pc) {
+ hits[i + j].hits += nr_hits;
+ goto out;
+ } else if (!hits[i + j].hits) {
+ hits[i + j].pc = pc;
+ hits[i + j].hits = nr_hits;
+ goto out;
+ }
+ }
+ i = (i + secondary) & (NR_PROFILE_HIT - 1);
+ } while (i != primary);
+
+ /*
+ * Add the current hit(s) and flush the write-queue out
+ * to the global buffer:
+ */
+ atomic_add(nr_hits, &prof_buffer[pc]);
+ for (i = 0; i < NR_PROFILE_HIT; ++i) {
+ atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+ hits[i].pc = hits[i].hits = 0;
+ }
+out:
+ local_irq_restore(flags);
+ put_cpu();
+}
+
+static int __cpuinit profile_cpu_callback(struct notifier_block *info,
+ unsigned long action, void *__cpu)
+{
+ int node, cpu = (unsigned long)__cpu;
+ struct page *page;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ node = cpu_to_node(cpu);
+ per_cpu(cpu_profile_flip, cpu) = 0;
+ if (!per_cpu(cpu_profile_hits, cpu)[1]) {
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO,
+ 0);
+ if (!page)
+ return NOTIFY_BAD;
+ per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
+ }
+ if (!per_cpu(cpu_profile_hits, cpu)[0]) {
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO,
+ 0);
+ if (!page)
+ goto out_free;
+ per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
+ }
+ break;
+out_free:
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+ per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+ __free_page(page);
+ return NOTIFY_BAD;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ cpu_set(cpu, prof_cpu_mask);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ cpu_clear(cpu, prof_cpu_mask);
+ if (per_cpu(cpu_profile_hits, cpu)[0]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+ per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ __free_page(page);
+ }
+ if (per_cpu(cpu_profile_hits, cpu)[1]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+ per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+ __free_page(page);
+ }
+ break;
+ }
+ return NOTIFY_OK;
+}
+#else /* !CONFIG_SMP */
+#define profile_flip_buffers() do { } while (0)
+#define profile_discard_flip_buffers() do { } while (0)
+#define profile_cpu_callback NULL
+
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ unsigned long pc;
+
+ if (prof_on != type || !prof_buffer)
+ return;
+ pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
+ atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
+}
+#endif /* !CONFIG_SMP */
+EXPORT_SYMBOL_GPL(profile_hits);
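+
+/*
+ * Illustrative caller sketch: subsystems feed addresses into the profiler
+ * through profile_hit()/profile_hits(); for instance, a hypervisor exit
+ * path could account the guest instruction pointer roughly like this
+ * (hypothetical variable name):
+ *
+ *	if (unlikely(prof_on == KVM_PROFILING))
+ *		profile_hit(KVM_PROFILING, (void *)guest_rip);
+ */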
+
+void profile_tick(int type)
+{
+ struct pt_regs *regs = get_irq_regs();
+
+ if (type == CPU_PROFILING && timer_hook)
+ timer_hook(regs);
+ if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
+ profile_hit(type, (void *)profile_pc(regs));
+}
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <asm/ptrace.h>
+
+static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+ if (count - len < 2)
+ return -EINVAL;
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+static int prof_cpu_mask_write_proc(struct file *file,
+ const char __user *buffer, unsigned long count, void *data)
+{
+ cpumask_t *mask = (cpumask_t *)data;
+ unsigned long full_count = count, err;
+ cpumask_t new_value;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ return err;
+
+ *mask = new_value;
+ return full_count;
+}
+
+void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
+{
+ struct proc_dir_entry *entry;
+
+ /* create /proc/irq/prof_cpu_mask */
+ entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
+ if (!entry)
+ return;
+ entry->data = (void *)&prof_cpu_mask;
+ entry->read_proc = prof_cpu_mask_read_proc;
+ entry->write_proc = prof_cpu_mask_write_proc;
+}
+
+/*
+ * This function accesses profiling information. The returned data is
+ * binary: the sampling step and the actual contents of the profile
+ * buffer. Use the readprofile(1) program to extract meaningful
+ * information from this data.
+ */
+static ssize_t
+read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos;
+ ssize_t read;
+ char *pnt;
+ unsigned int sample_step = 1 << prof_shift;
+
+ profile_flip_buffers();
+ if (p >= (prof_len+1)*sizeof(unsigned int))
+ return 0;
+ if (count > (prof_len+1)*sizeof(unsigned int) - p)
+ count = (prof_len+1)*sizeof(unsigned int) - p;
+ read = 0;
+
+ while (p < sizeof(unsigned int) && count > 0) {
+ if (put_user(*((char *)(&sample_step)+p), buf))
+ return -EFAULT;
+ buf++; p++; count--; read++;
+ }
+ pnt = (char *)prof_buffer + p - sizeof(atomic_t);
+ if (copy_to_user(buf, (void *)pnt, count))
+ return -EFAULT;
+ read += count;
+ *ppos += read;
+ return read;
+}
+
+/*
+ * Writing to /proc/profile resets the counters.
+ *
+ * Writing a 'profiling multiplier' value into it also resets the profiling
+ * interrupt frequency, on architectures that support this.
+ */
+static ssize_t write_profile(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+#ifdef CONFIG_SMP
+ extern int setup_profiling_timer(unsigned int multiplier);
+
+ if (count == sizeof(int)) {
+ unsigned int multiplier;
+
+ if (copy_from_user(&multiplier, buf, sizeof(int)))
+ return -EFAULT;
+
+ if (setup_profiling_timer(multiplier))
+ return -EINVAL;
+ }
+#endif
+ profile_discard_flip_buffers();
+ memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
+ return count;
+}
+
+static const struct file_operations proc_profile_operations = {
+ .read = read_profile,
+ .write = write_profile,
+};
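+
+/*
+ * Layout seen by readers of /proc/profile, summarized from read_profile()
+ * above: the first sizeof(unsigned int) bytes hold the sample step
+ * (1 << prof_shift), followed by prof_len hit counters, one per step-sized
+ * slice of kernel text.  A typical (illustrative) way to consume it from
+ * userspace:
+ *
+ *	readprofile -m /boot/System.map | sort -nr | head
+ *
+ * Writing any data to /proc/profile clears the counters, e.g.:
+ *
+ *	echo > /proc/profile
+ */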
+
+#ifdef CONFIG_SMP
+static inline void profile_nop(void *unused)
+{
+}
+
+static int create_hash_tables(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ int node = cpu_to_node(cpu);
+ struct page *page;
+
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
+ if (!page)
+ goto out_cleanup;
+ per_cpu(cpu_profile_hits, cpu)[1]
+ = (struct profile_hit *)page_address(page);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
+ if (!page)
+ goto out_cleanup;
+ per_cpu(cpu_profile_hits, cpu)[0]
+ = (struct profile_hit *)page_address(page);
+ }
+ return 0;
+out_cleanup:
+ prof_on = 0;
+ smp_mb();
+ on_each_cpu(profile_nop, NULL, 1);
+ for_each_online_cpu(cpu) {
+ struct page *page;
+
+ if (per_cpu(cpu_profile_hits, cpu)[0]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+ per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ __free_page(page);
+ }
+ if (per_cpu(cpu_profile_hits, cpu)[1]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+ per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+ __free_page(page);
+ }
+ }
+ return -1;
+}
+#else
+#define create_hash_tables() ({ 0; })
+#endif
+
+int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
+{
+ struct proc_dir_entry *entry;
+
+ if (!prof_on)
+ return 0;
+ if (create_hash_tables())
+ return -ENOMEM;
+ entry = proc_create("profile", S_IWUSR | S_IRUGO,
+ NULL, &proc_profile_operations);
+ if (!entry)
+ return 0;
+ entry->size = (1+prof_len) * sizeof(atomic_t);
+ hotcpu_notifier(profile_cpu_callback, 0);
+ return 0;
+}
+module_init(create_proc_profile);
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
new file mode 100644
index 0000000..9234a16
--- /dev/null
+++ b/kernel/ptrace.c
@@ -0,0 +1,712 @@
+/*
+ * linux/kernel/ptrace.c
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Common interfaces for "ptrace()" which we do not want
+ * to continually duplicate across every architecture.
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/signal.h>
+#include <linux/audit.h>
+#include <linux/pid_namespace.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+
+/*
+ * ptrace a task: make the debugger its new parent and
+ * move it to the ptrace list.
+ *
+ * Must be called with the tasklist lock write-held.
+ */
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+}
+
+/*
+ * Turn a tracing stop into a normal stop now, since with no tracer there
+ * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
+ * signal sent that would resume the child, but didn't because it was in
+ * TASK_TRACED, resume it now.
+ * Requires that irqs be disabled.
+ */
+static void ptrace_untrace(struct task_struct *child)
+{
+ spin_lock(&child->sighand->siglock);
+ if (task_is_traced(child)) {
+ if (child->signal->flags & SIGNAL_STOP_STOPPED) {
+ __set_task_state(child, TASK_STOPPED);
+ } else {
+ signal_wake_up(child, 1);
+ }
+ }
+ spin_unlock(&child->sighand->siglock);
+}
+
+/*
+ * unptrace a task: move it back to its original parent and
+ * remove it from the ptrace list.
+ *
+ * Must be called with the tasklist lock write-held.
+ */
+void __ptrace_unlink(struct task_struct *child)
+{
+ BUG_ON(!child->ptrace);
+
+ child->ptrace = 0;
+ child->parent = child->real_parent;
+ list_del_init(&child->ptrace_entry);
+
+ if (task_is_traced(child))
+ ptrace_untrace(child);
+}
+
+/*
+ * Check that we have indeed attached to the thing..
+ */
+int ptrace_check_attach(struct task_struct *child, int kill)
+{
+ int ret = -ESRCH;
+
+ /*
+ * We take the read lock around doing both checks to close a
+ * possible race where someone else was tracing our child and
+ * detached between these two checks. After this locked check,
+ * we are sure that this is our traced child and that can only
+ * be changed by us so it's not changing right after this.
+ */
+ read_lock(&tasklist_lock);
+ if ((child->ptrace & PT_PTRACED) && child->parent == current) {
+ ret = 0;
+ /*
+ * child->sighand can't be NULL, release_task()
+ * does ptrace_unlink() before __exit_signal().
+ */
+ spin_lock_irq(&child->sighand->siglock);
+ if (task_is_stopped(child))
+ child->state = TASK_TRACED;
+ else if (!task_is_traced(child) && !kill)
+ ret = -ESRCH;
+ spin_unlock_irq(&child->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+
+ if (!ret && !kill)
+ ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
+
+ /* All systems go.. */
+ return ret;
+}
+
+int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+{
+ /* May we inspect the given task?
+ * This check is used both for attaching with ptrace
+ * and for allowing access to sensitive information in /proc.
+ *
+ * ptrace_attach denies several cases that /proc allows
+ * because setting up the necessary parent/child relationship
+ * or halting the specified task is impossible.
+ */
+ int dumpable = 0;
+ /* Don't let security modules deny introspection */
+ if (task == current)
+ return 0;
+ if (((current->uid != task->euid) ||
+ (current->uid != task->suid) ||
+ (current->uid != task->uid) ||
+ (current->gid != task->egid) ||
+ (current->gid != task->sgid) ||
+ (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+ smp_rmb();
+ if (task->mm)
+ dumpable = get_dumpable(task->mm);
+ if (!dumpable && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+
+ return security_ptrace_may_access(task, mode);
+}
+
+bool ptrace_may_access(struct task_struct *task, unsigned int mode)
+{
+ int err;
+ task_lock(task);
+ err = __ptrace_may_access(task, mode);
+ task_unlock(task);
+	return !err;
+}
+
+int ptrace_attach(struct task_struct *task)
+{
+ int retval;
+ unsigned long flags;
+
+ audit_ptrace(task);
+
+ retval = -EPERM;
+ if (same_thread_group(task, current))
+ goto out;
+
+repeat:
+ /*
+ * Nasty, nasty.
+ *
+ * We want to hold both the task-lock and the
+ * tasklist_lock for writing at the same time.
+ * But that's against the rules (tasklist_lock
+ * is taken for reading by interrupts on other
+ * cpu's that may have task_lock).
+ */
+ task_lock(task);
+ if (!write_trylock_irqsave(&tasklist_lock, flags)) {
+ task_unlock(task);
+ do {
+ cpu_relax();
+ } while (!write_can_lock(&tasklist_lock));
+ goto repeat;
+ }
+
+ if (!task->mm)
+ goto bad;
+ /* the same process cannot be attached many times */
+ if (task->ptrace & PT_PTRACED)
+ goto bad;
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ if (retval)
+ goto bad;
+
+ /* Go */
+ task->ptrace |= PT_PTRACED;
+ if (capable(CAP_SYS_PTRACE))
+ task->ptrace |= PT_PTRACE_CAP;
+
+ __ptrace_link(task, current);
+
+ send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+bad:
+ write_unlock_irqrestore(&tasklist_lock, flags);
+ task_unlock(task);
+out:
+ return retval;
+}
+
+static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
+{
+ child->exit_code = data;
+ /* .. re-parent .. */
+ __ptrace_unlink(child);
+ /* .. and wake it up. */
+ if (child->exit_state != EXIT_ZOMBIE)
+ wake_up_process(child);
+}
+
+int ptrace_detach(struct task_struct *child, unsigned int data)
+{
+ if (!valid_signal(data))
+ return -EIO;
+
+ /* Architecture-specific hardware disable .. */
+ ptrace_disable(child);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
+ write_lock_irq(&tasklist_lock);
+ /* protect against de_thread()->release_task() */
+ if (child->ptrace)
+ __ptrace_detach(child, data);
+ write_unlock_irq(&tasklist_lock);
+
+ return 0;
+}
+
+int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
+{
+ int copied = 0;
+
+ while (len > 0) {
+ char buf[128];
+ int this_len, retval;
+
+ this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+ retval = access_process_vm(tsk, src, buf, this_len, 0);
+ if (!retval) {
+ if (copied)
+ break;
+ return -EIO;
+ }
+ if (copy_to_user(dst, buf, retval))
+ return -EFAULT;
+ copied += retval;
+ src += retval;
+ dst += retval;
+ len -= retval;
+ }
+ return copied;
+}
+
+int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
+{
+ int copied = 0;
+
+ while (len > 0) {
+ char buf[128];
+ int this_len, retval;
+
+ this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+ if (copy_from_user(buf, src, this_len))
+ return -EFAULT;
+ retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ if (!retval) {
+ if (copied)
+ break;
+ return -EIO;
+ }
+ copied += retval;
+ src += retval;
+ dst += retval;
+ len -= retval;
+ }
+ return copied;
+}
+
+static int ptrace_setoptions(struct task_struct *child, long data)
+{
+ child->ptrace &= ~PT_TRACE_MASK;
+
+ if (data & PTRACE_O_TRACESYSGOOD)
+ child->ptrace |= PT_TRACESYSGOOD;
+
+ if (data & PTRACE_O_TRACEFORK)
+ child->ptrace |= PT_TRACE_FORK;
+
+ if (data & PTRACE_O_TRACEVFORK)
+ child->ptrace |= PT_TRACE_VFORK;
+
+ if (data & PTRACE_O_TRACECLONE)
+ child->ptrace |= PT_TRACE_CLONE;
+
+ if (data & PTRACE_O_TRACEEXEC)
+ child->ptrace |= PT_TRACE_EXEC;
+
+ if (data & PTRACE_O_TRACEVFORKDONE)
+ child->ptrace |= PT_TRACE_VFORK_DONE;
+
+ if (data & PTRACE_O_TRACEEXIT)
+ child->ptrace |= PT_TRACE_EXIT;
+
+ return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
+}
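+
+/*
+ * Illustrative tracer-side use of the option bits handled above (userspace
+ * sketch, not kernel code); 'pid' is an already-attached, stopped tracee:
+ *
+ *	ptrace(PTRACE_SETOPTIONS, pid, 0,
+ *	       PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | PTRACE_O_TRACEEXIT);
+ */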
+
+static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
+{
+ int error = -ESRCH;
+
+ read_lock(&tasklist_lock);
+ if (likely(child->sighand != NULL)) {
+ error = -EINVAL;
+ spin_lock_irq(&child->sighand->siglock);
+ if (likely(child->last_siginfo != NULL)) {
+ *info = *child->last_siginfo;
+ error = 0;
+ }
+ spin_unlock_irq(&child->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+ return error;
+}
+
+static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
+{
+ int error = -ESRCH;
+
+ read_lock(&tasklist_lock);
+ if (likely(child->sighand != NULL)) {
+ error = -EINVAL;
+ spin_lock_irq(&child->sighand->siglock);
+ if (likely(child->last_siginfo != NULL)) {
+ *child->last_siginfo = *info;
+ error = 0;
+ }
+ spin_unlock_irq(&child->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+ return error;
+}
+
+
+#ifdef PTRACE_SINGLESTEP
+#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
+#else
+#define is_singlestep(request) 0
+#endif
+
+#ifdef PTRACE_SINGLEBLOCK
+#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
+#else
+#define is_singleblock(request) 0
+#endif
+
+#ifdef PTRACE_SYSEMU
+#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
+#else
+#define is_sysemu_singlestep(request) 0
+#endif
+
+static int ptrace_resume(struct task_struct *child, long request, long data)
+{
+ if (!valid_signal(data))
+ return -EIO;
+
+ if (request == PTRACE_SYSCALL)
+ set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ else
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
+#ifdef TIF_SYSCALL_EMU
+ if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
+ set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ else
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+
+ if (is_singleblock(request)) {
+ if (unlikely(!arch_has_block_step()))
+ return -EIO;
+ user_enable_block_step(child);
+ } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
+ if (unlikely(!arch_has_single_step()))
+ return -EIO;
+ user_enable_single_step(child);
+ }
+ else
+ user_disable_single_step(child);
+
+ child->exit_code = data;
+ wake_up_process(child);
+
+ return 0;
+}
+
+int ptrace_request(struct task_struct *child, long request,
+ long addr, long data)
+{
+ int ret = -EIO;
+ siginfo_t siginfo;
+
+ switch (request) {
+ case PTRACE_PEEKTEXT:
+ case PTRACE_PEEKDATA:
+ return generic_ptrace_peekdata(child, addr, data);
+ case PTRACE_POKETEXT:
+ case PTRACE_POKEDATA:
+ return generic_ptrace_pokedata(child, addr, data);
+
+#ifdef PTRACE_OLDSETOPTIONS
+ case PTRACE_OLDSETOPTIONS:
+#endif
+ case PTRACE_SETOPTIONS:
+ ret = ptrace_setoptions(child, data);
+ break;
+ case PTRACE_GETEVENTMSG:
+ ret = put_user(child->ptrace_message, (unsigned long __user *) data);
+ break;
+
+ case PTRACE_GETSIGINFO:
+ ret = ptrace_getsiginfo(child, &siginfo);
+ if (!ret)
+ ret = copy_siginfo_to_user((siginfo_t __user *) data,
+ &siginfo);
+ break;
+
+ case PTRACE_SETSIGINFO:
+ if (copy_from_user(&siginfo, (siginfo_t __user *) data,
+ sizeof siginfo))
+ ret = -EFAULT;
+ else
+ ret = ptrace_setsiginfo(child, &siginfo);
+ break;
+
+ case PTRACE_DETACH: /* detach a process that was attached. */
+ ret = ptrace_detach(child, data);
+ break;
+
+#ifdef PTRACE_SINGLESTEP
+ case PTRACE_SINGLESTEP:
+#endif
+#ifdef PTRACE_SINGLEBLOCK
+ case PTRACE_SINGLEBLOCK:
+#endif
+#ifdef PTRACE_SYSEMU
+ case PTRACE_SYSEMU:
+ case PTRACE_SYSEMU_SINGLESTEP:
+#endif
+ case PTRACE_SYSCALL:
+ case PTRACE_CONT:
+ return ptrace_resume(child, request, data);
+
+ case PTRACE_KILL:
+ if (child->exit_state) /* already dead */
+ return 0;
+ return ptrace_resume(child, request, SIGKILL);
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/**
+ * ptrace_traceme -- helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+int ptrace_traceme(void)
+{
+ int ret = -EPERM;
+
+ /*
+ * Are we already being traced?
+ */
+repeat:
+ task_lock(current);
+ if (!(current->ptrace & PT_PTRACED)) {
+ /*
+ * See ptrace_attach() comments about the locking here.
+ */
+ unsigned long flags;
+ if (!write_trylock_irqsave(&tasklist_lock, flags)) {
+ task_unlock(current);
+ do {
+ cpu_relax();
+ } while (!write_can_lock(&tasklist_lock));
+ goto repeat;
+ }
+
+ ret = security_ptrace_traceme(current->parent);
+
+ /*
+ * Set the ptrace bit in the process ptrace flags.
+ * Then link us on our parent's ptraced list.
+ */
+ if (!ret) {
+ current->ptrace |= PT_PTRACED;
+ __ptrace_link(current, current->real_parent);
+ }
+
+ write_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ task_unlock(current);
+ return ret;
+}
+
+/**
+ * ptrace_get_task_struct -- grab a task struct reference for ptrace
+ * @pid: process id to grab a task_struct reference of
+ *
+ * This function is a helper for ptrace implementations. It checks
+ * permissions and then grabs a task struct for use of the actual
+ * ptrace implementation.
+ *
+ * Returns the task_struct for @pid or an ERR_PTR() on failure.
+ */
+struct task_struct *ptrace_get_task_struct(pid_t pid)
+{
+ struct task_struct *child;
+
+ read_lock(&tasklist_lock);
+ child = find_task_by_vpid(pid);
+ if (child)
+ get_task_struct(child);
+
+ read_unlock(&tasklist_lock);
+ if (!child)
+ return ERR_PTR(-ESRCH);
+ return child;
+}
+
+#ifndef arch_ptrace_attach
+#define arch_ptrace_attach(child) do { } while (0)
+#endif
+
+SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
+{
+ struct task_struct *child;
+ long ret;
+
+ /*
+ * This lock_kernel fixes a subtle race with suid exec
+ */
+ lock_kernel();
+ if (request == PTRACE_TRACEME) {
+ ret = ptrace_traceme();
+ if (!ret)
+ arch_ptrace_attach(current);
+ goto out;
+ }
+
+ child = ptrace_get_task_struct(pid);
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
+ goto out;
+ }
+
+ if (request == PTRACE_ATTACH) {
+ ret = ptrace_attach(child);
+ /*
+ * Some architectures need to do book-keeping after
+ * a ptrace attach.
+ */
+ if (!ret)
+ arch_ptrace_attach(child);
+ goto out_put_task_struct;
+ }
+
+ ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ if (ret < 0)
+ goto out_put_task_struct;
+
+ ret = arch_ptrace(child, request, addr, data);
+ if (ret < 0)
+ goto out_put_task_struct;
+
+ out_put_task_struct:
+ put_task_struct(child);
+ out:
+ unlock_kernel();
+ return ret;
+}
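+
+/*
+ * Illustrative userspace view of the syscall above (sketch only; error
+ * handling omitted): attach to 'pid', wait for it to stop, peek one word,
+ * then detach without delivering a signal:
+ *
+ *	ptrace(PTRACE_ATTACH, pid, NULL, NULL);
+ *	waitpid(pid, &status, 0);
+ *	long word = ptrace(PTRACE_PEEKDATA, pid, (void *)addr, NULL);
+ *	ptrace(PTRACE_DETACH, pid, NULL, (void *)0);
+ */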
+
+int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
+{
+ unsigned long tmp;
+ int copied;
+
+ copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ if (copied != sizeof(tmp))
+ return -EIO;
+ return put_user(tmp, (unsigned long __user *)data);
+}
+
+int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
+{
+ int copied;
+
+ copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ return (copied == sizeof(data)) ? 0 : -EIO;
+}
+
+#if defined CONFIG_COMPAT
+#include <linux/compat.h>
+
+int compat_ptrace_request(struct task_struct *child, compat_long_t request,
+ compat_ulong_t addr, compat_ulong_t data)
+{
+ compat_ulong_t __user *datap = compat_ptr(data);
+ compat_ulong_t word;
+ siginfo_t siginfo;
+ int ret;
+
+ switch (request) {
+ case PTRACE_PEEKTEXT:
+ case PTRACE_PEEKDATA:
+ ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ if (ret != sizeof(word))
+ ret = -EIO;
+ else
+ ret = put_user(word, datap);
+ break;
+
+ case PTRACE_POKETEXT:
+ case PTRACE_POKEDATA:
+ ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = (ret != sizeof(data) ? -EIO : 0);
+ break;
+
+ case PTRACE_GETEVENTMSG:
+ ret = put_user((compat_ulong_t) child->ptrace_message, datap);
+ break;
+
+ case PTRACE_GETSIGINFO:
+ ret = ptrace_getsiginfo(child, &siginfo);
+ if (!ret)
+ ret = copy_siginfo_to_user32(
+ (struct compat_siginfo __user *) datap,
+ &siginfo);
+ break;
+
+ case PTRACE_SETSIGINFO:
+ memset(&siginfo, 0, sizeof siginfo);
+ if (copy_siginfo_from_user32(
+ &siginfo, (struct compat_siginfo __user *) datap))
+ ret = -EFAULT;
+ else
+ ret = ptrace_setsiginfo(child, &siginfo);
+ break;
+
+ default:
+ ret = ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+
+asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
+ compat_long_t addr, compat_long_t data)
+{
+ struct task_struct *child;
+ long ret;
+
+ /*
+ * This lock_kernel fixes a subtle race with suid exec
+ */
+ lock_kernel();
+ if (request == PTRACE_TRACEME) {
+ ret = ptrace_traceme();
+ goto out;
+ }
+
+ child = ptrace_get_task_struct(pid);
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
+ goto out;
+ }
+
+ if (request == PTRACE_ATTACH) {
+ ret = ptrace_attach(child);
+ /*
+ * Some architectures need to do book-keeping after
+ * a ptrace attach.
+ */
+ if (!ret)
+ arch_ptrace_attach(child);
+ goto out_put_task_struct;
+ }
+
+ ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ if (!ret)
+ ret = compat_arch_ptrace(child, request, addr, data);
+
+ out_put_task_struct:
+ put_task_struct(child);
+ out:
+ unlock_kernel();
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
new file mode 100644
index 0000000..37f72e5
--- /dev/null
+++ b/kernel/rcuclassic.c
@@ -0,0 +1,786 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ * Manfred Spraul <manfred@colorfullife.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+ .cur = -300,
+ .completed = -300,
+ .pending = -300,
+ .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
+ .cpumask = CPU_MASK_NONE,
+};
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .cur = -300,
+ .completed = -300,
+ .pending = -300,
+ .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
+ .cpumask = CPU_MASK_NONE,
+};
+
+DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+ struct rcu_ctrlblk *rcp)
+{
+ int cpu;
+ cpumask_t cpumask;
+ unsigned long flags;
+
+ set_need_resched();
+ spin_lock_irqsave(&rcp->lock, flags);
+ if (unlikely(!rcp->signaled)) {
+ rcp->signaled = 1;
+ /*
+ * Don't send IPI to itself. With irqs disabled,
+ * rdp->cpu is the current cpu.
+ *
+ * cpu_online_map is updated by the _cpu_down()
+ * using __stop_machine(). Since we're in irqs disabled
+		 * section, __stop_machine() is not executing, hence
+ * the cpu_online_map is stable.
+ *
+ * However, a cpu might have been offlined _just_ before
+ * we disabled irqs while entering here.
+ * And rcu subsystem might not yet have handled the CPU_DEAD
+ * notification, leading to the offlined cpu's bit
+ * being set in the rcp->cpumask.
+ *
+ * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+		 * sending smp_send_reschedule() to an offlined CPU.
+ */
+ cpus_and(cpumask, rcp->cpumask, cpu_online_map);
+ cpu_clear(rdp->cpu, cpumask);
+ for_each_cpu_mask_nr(cpu, cpumask)
+ smp_send_reschedule(cpu);
+ }
+ spin_unlock_irqrestore(&rcp->lock, flags);
+}
+#else
+static inline void force_quiescent_state(struct rcu_data *rdp,
+ struct rcu_ctrlblk *rcp)
+{
+ set_need_resched();
+}
+#endif
+
+static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
+ struct rcu_data *rdp)
+{
+ long batch;
+
+ head->next = NULL;
+	smp_mb(); /* Read of rcp->cur must happen after any change by caller. */
+
+ /*
+ * Determine the batch number of this callback.
+ *
+	 * ACCESS_ONCE is used to avoid the following error when gcc
+	 * eliminates the local variable "batch" and emits code like this:
+	 *  1) rdp->batch = rcp->cur + 1 # gets old value
+	 *  ......
+	 *  2) rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
+	 * In that case [*nxttail[0], *nxttail[1]) may contain callbacks
+	 * whose batch# == rdp->batch; see the comment of struct rcu_data.
+ */
+ batch = ACCESS_ONCE(rcp->cur) + 1;
+
+ if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
+ /* process callbacks */
+ rdp->nxttail[0] = rdp->nxttail[1];
+ rdp->nxttail[1] = rdp->nxttail[2];
+ if (rcu_batch_after(batch - 1, rdp->batch))
+ rdp->nxttail[0] = rdp->nxttail[2];
+ }
+
+ rdp->batch = batch;
+ *rdp->nxttail[2] = head;
+ rdp->nxttail[2] = &head->next;
+
+ if (unlikely(++rdp->qlen > qhimark)) {
+ rdp->blimit = INT_MAX;
+ force_quiescent_state(rdp, &rcu_ctrlblk);
+ }
+}
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+ rcp->gp_start = jiffies;
+ rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ int cpu;
+ long delta;
+ unsigned long flags;
+
+ /* Only let one CPU complain about others per time interval. */
+
+ spin_lock_irqsave(&rcp->lock, flags);
+ delta = jiffies - rcp->jiffies_stall;
+ if (delta < 2 || rcp->cur != rcp->completed) {
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ return;
+ }
+ rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+
+ /* OK, time to rat on our buddy... */
+
+ printk(KERN_ERR "RCU detected CPU stalls:");
+ for_each_possible_cpu(cpu) {
+ if (cpu_isset(cpu, rcp->cpumask))
+ printk(" %d", cpu);
+ }
+ printk(" (detected by %d, t=%ld jiffies)\n",
+ smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long flags;
+
+ printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+ smp_processor_id(), jiffies,
+ jiffies - rcp->gp_start);
+ dump_stack();
+ spin_lock_irqsave(&rcp->lock, flags);
+ if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+ rcp->jiffies_stall =
+ jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ set_need_resched(); /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ long delta;
+
+ delta = jiffies - rcp->jiffies_stall;
+ if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(rcp);
+
+ } else if (rcp->cur != rcp->completed && delta >= 2) {
+
+ /* They had two seconds to dump stack, so complain. */
+ print_other_cpu_stall(rcp);
+ }
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+
+ head->func = func;
+ local_irq_save(flags);
+ __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
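+
+/*
+ * Canonical usage sketch for call_rcu() (illustrative; 'struct foo' and
+ * the callback name are hypothetical): embed an rcu_head in the protected
+ * object and free it only after a grace period has elapsed:
+ *
+ *	struct foo {
+ *		int data;
+ *		struct rcu_head rcu;
+ *	};
+ *
+ *	static void foo_reclaim(struct rcu_head *head)
+ *	{
+ *		kfree(container_of(head, struct foo, rcu));
+ *	}
+ *
+ *	// updater, after unlinking 'old' from all RCU-visible structures:
+ *	call_rcu(&old->rcu, foo_reclaim);
+ */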
+
+/**
+ * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by rcu_read_lock() and
+ * rcu_read_unlock(), if in interrupt context, or by rcu_read_lock_bh()
+ * and rcu_read_unlock_bh(), if in process context. These may be nested.
+ */
+void call_rcu_bh(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+
+ head->func = func;
+ local_irq_save(flags);
+ __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Return the number of RCU batches processed thus far. Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU batches processed thus far. Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+ return rcu_bh_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/* Raises the softirq for processing rcu_callbacks. */
+static inline void raise_rcu_softirq(void)
+{
+ raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Invoke the completed RCU callbacks. They are expected to be in
+ * a per-cpu list.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_head *next, *list;
+ int count = 0;
+
+ list = rdp->donelist;
+ while (list) {
+ next = list->next;
+ prefetch(next);
+ list->func(list);
+ list = next;
+ if (++count >= rdp->blimit)
+ break;
+ }
+ rdp->donelist = list;
+
+ local_irq_save(flags);
+ rdp->qlen -= count;
+ local_irq_restore(flags);
+ if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+ rdp->blimit = blimit;
+
+ if (!rdp->donelist)
+ rdp->donetail = &rdp->donelist;
+ else
+ raise_rcu_softirq();
+}
+
+/*
+ * Grace period handling:
+ * The grace period handling consists of two steps:
+ * - A new grace period is started.
+ *   This is done by rcu_start_batch. The start is not broadcast to
+ *   all cpus; they must pick it up by comparing rcp->cur with
+ *   rdp->quiescbatch. All cpus are recorded in the
+ *   rcu_ctrlblk.cpumask bitmap.
+ * - All cpus must go through a quiescent state.
+ *   Since the start of the grace period is not broadcast, at least two
+ *   calls to rcu_check_quiescent_state are required:
+ *   The first call just notices that a new grace period is running. The
+ *   following calls check whether there was a quiescent state since the
+ *   beginning of the grace period. If so, rcu_ctrlblk.cpumask is updated.
+ *   Once the bitmap is empty, the grace period is complete.
+ *   rcu_check_quiescent_state then calls rcu_start_batch to start the
+ *   next grace period (if necessary).
+ */
+
+/*
+ * Register a new batch of callbacks, and start it up if there is currently no
+ * active batch and the batch to be registered has not already occurred.
+ * Caller must hold rcu_ctrlblk.lock.
+ */
+static void rcu_start_batch(struct rcu_ctrlblk *rcp)
+{
+ if (rcp->cur != rcp->pending &&
+ rcp->completed == rcp->cur) {
+ rcp->cur++;
+ record_gp_stall_check_time(rcp);
+
+ /*
+ * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
+		 * barrier. Otherwise it can cause tickless idle CPUs to be
+		 * included in rcp->cpumask, which will extend grace periods
+ * unnecessarily.
+ */
+ smp_mb();
+ cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+
+ rcp->signaled = 0;
+ }
+}
+
+/*
+ * cpu went through a quiescent state since the beginning of the grace period.
+ * Clear it from the cpu mask and complete the grace period if it was the last
+ * cpu. Start another grace period if someone has further entries pending
+ */
+static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
+{
+ cpu_clear(cpu, rcp->cpumask);
+ if (cpus_empty(rcp->cpumask)) {
+ /* batch completed ! */
+ rcp->completed = rcp->cur;
+ rcu_start_batch(rcp);
+ }
+}
+
+/*
+ * Check if the cpu has gone through a quiescent state (say context
+ * switch). If so, and if it hasn't already done so in this RCU
+ * quiescent cycle, then indicate that it has done so.
+ */
+static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
+ struct rcu_data *rdp)
+{
+ unsigned long flags;
+
+ if (rdp->quiescbatch != rcp->cur) {
+ /* start new grace period: */
+ rdp->qs_pending = 1;
+ rdp->passed_quiesc = 0;
+ rdp->quiescbatch = rcp->cur;
+ return;
+ }
+
+ /* Grace period already completed for this cpu?
+ * qs_pending is checked instead of the actual bitmap to avoid
+	 * cacheline thrashing.
+ */
+ if (!rdp->qs_pending)
+ return;
+
+ /*
+ * Was there a quiescent state since the beginning of the grace
+ * period? If no, then exit and wait for the next call.
+ */
+ if (!rdp->passed_quiesc)
+ return;
+ rdp->qs_pending = 0;
+
+ spin_lock_irqsave(&rcp->lock, flags);
+ /*
+ * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
+ * during cpu startup. Ignore the quiescent state.
+ */
+ if (likely(rdp->quiescbatch == rcp->cur))
+ cpu_quiet(rdp->cpu, rcp);
+
+ spin_unlock_irqrestore(&rcp->lock, flags);
+}
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Warning! Helper for rcu_offline_cpu; do not use elsewhere without reviewing
+ * the locking requirements. The list it's pulling from has to belong to a cpu
+ * which is dead and hence not processing interrupts.
+ */
+static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
+ struct rcu_head **tail, long batch)
+{
+ unsigned long flags;
+
+ if (list) {
+ local_irq_save(flags);
+ this_rdp->batch = batch;
+ *this_rdp->nxttail[2] = list;
+ this_rdp->nxttail[2] = tail;
+ local_irq_restore(flags);
+ }
+}
+
+static void __rcu_offline_cpu(struct rcu_data *this_rdp,
+ struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+
+ /*
+ * if the cpu going offline owns the grace period
+ * we can block indefinitely waiting for it, so flush
+ * it here
+ */
+ spin_lock_irqsave(&rcp->lock, flags);
+ if (rcp->cur != rcp->completed)
+ cpu_quiet(rdp->cpu, rcp);
+ rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
+ rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
+ spin_unlock(&rcp->lock);
+
+ this_rdp->qlen += rdp->qlen;
+ local_irq_restore(flags);
+}
+
+static void rcu_offline_cpu(int cpu)
+{
+ struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
+ struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
+
+ __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
+ &per_cpu(rcu_data, cpu));
+ __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
+ &per_cpu(rcu_bh_data, cpu));
+ put_cpu_var(rcu_data);
+ put_cpu_var(rcu_bh_data);
+}
+
+#else
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif
+
+/*
+ * This does the RCU processing work from softirq context.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
+ struct rcu_data *rdp)
+{
+ unsigned long flags;
+ long completed_snap;
+
+ if (rdp->nxtlist) {
+ local_irq_save(flags);
+ completed_snap = ACCESS_ONCE(rcp->completed);
+
+ /*
+ * move the other grace-period-completed entries to
+ * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
+ */
+ if (!rcu_batch_before(completed_snap, rdp->batch))
+ rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
+ else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
+ rdp->nxttail[0] = rdp->nxttail[1];
+
+ /*
+ * the grace period for entries in
+ * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
+ * move these entries to donelist
+ */
+ if (rdp->nxttail[0] != &rdp->nxtlist) {
+ *rdp->donetail = rdp->nxtlist;
+ rdp->donetail = rdp->nxttail[0];
+ rdp->nxtlist = *rdp->nxttail[0];
+ *rdp->donetail = NULL;
+
+ if (rdp->nxttail[1] == rdp->nxttail[0])
+ rdp->nxttail[1] = &rdp->nxtlist;
+ if (rdp->nxttail[2] == rdp->nxttail[0])
+ rdp->nxttail[2] = &rdp->nxtlist;
+ rdp->nxttail[0] = &rdp->nxtlist;
+ }
+
+ local_irq_restore(flags);
+
+ if (rcu_batch_after(rdp->batch, rcp->pending)) {
+ unsigned long flags2;
+
+ /* and start it/schedule start if it's a new batch */
+ spin_lock_irqsave(&rcp->lock, flags2);
+ if (rcu_batch_after(rdp->batch, rcp->pending)) {
+ rcp->pending = rdp->batch;
+ rcu_start_batch(rcp);
+ }
+ spin_unlock_irqrestore(&rcp->lock, flags2);
+ }
+ }
+
+ rcu_check_quiescent_state(rcp, rdp);
+ if (rdp->donelist)
+ rcu_do_batch(rdp);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+ /*
+ * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+ */
+
+ smp_mb(); /* See above block comment. */
+
+ __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
+ __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+
+ /*
+ * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+ */
+
+ smp_mb(); /* See above block comment. */
+}
+
+static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+ /* Check for CPU stalls, if enabled. */
+ check_cpu_stall(rcp);
+
+ if (rdp->nxtlist) {
+ long completed_snap = ACCESS_ONCE(rcp->completed);
+
+ /*
+ * This cpu has pending rcu entries and the grace period
+ * for them has completed.
+ */
+ if (!rcu_batch_before(completed_snap, rdp->batch))
+ return 1;
+ if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
+ rdp->nxttail[0] != rdp->nxttail[1])
+ return 1;
+ if (rdp->nxttail[0] != &rdp->nxtlist)
+ return 1;
+
+ /*
+ * This cpu has pending rcu entries and the new batch
+		 * for them hasn't been started nor scheduled to start.
+ */
+ if (rcu_batch_after(rdp->batch, rcp->pending))
+ return 1;
+ }
+
+ /* This cpu has finished callbacks to invoke */
+ if (rdp->donelist)
+ return 1;
+
+ /* The rcu core waits for a quiescent state from the cpu */
+ if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+ return 1;
+
+ /* nothing to do */
+ return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so. This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+ return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
+ __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
+
+ return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
+}
+
+/*
+ * Top-level function driving RCU grace-period detection, normally
+ * invoked from the scheduler-clock interrupt. This function simply
+ * increments counters that are read only from softirq by this same
+ * CPU, so there are no memory barriers required.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+ if (user ||
+ (idle_cpu(cpu) && !in_softirq() &&
+ hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+ /*
+ * Get here if this CPU took its interrupt from user
+ * mode or from the idle loop, and if this is not a
+ * nested interrupt. In this case, the CPU is in
+ * a quiescent state, so count it.
+ *
+ * Also do a memory barrier. This is needed to handle
+ * the case where writes from a preempt-disable section
+ * of code get reordered into schedule() by this CPU's
+ * write buffer. The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+ * by other CPUs to happen after any such write.
+ */
+
+ smp_mb(); /* See above block comment. */
+ rcu_qsctr_inc(cpu);
+ rcu_bh_qsctr_inc(cpu);
+
+ } else if (!in_softirq()) {
+
+ /*
+ * Get here if this CPU did not take its interrupt from
+ * softirq, in other words, if it is not interrupting
+		 * an rcu_bh read-side critical section. This is an _bh
+ * critical section, so count it. The memory barrier
+ * is needed for the same reason as is the above one.
+ */
+
+ smp_mb(); /* See above block comment. */
+ rcu_bh_qsctr_inc(cpu);
+ }
+ raise_rcu_softirq();
+}
+
+static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+ struct rcu_data *rdp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rcp->lock, flags);
+ memset(rdp, 0, sizeof(*rdp));
+ rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
+ rdp->donetail = &rdp->donelist;
+ rdp->quiescbatch = rcp->completed;
+ rdp->qs_pending = 0;
+ rdp->cpu = cpu;
+ rdp->blimit = blimit;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+
+ rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
+ rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ rcu_online_cpu(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ rcu_offline_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+ .notifier_call = rcu_cpu_notify,
+};
+
+/*
+ * Initializes the RCU mechanism.  Assumed to be called early, that is,
+ * before the local timer (SMP) or jiffy timer (uniprocessor) is set up.
+ * Note that rcu_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
+ */
+void __init __rcu_init(void)
+{
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+ /* Register notifier for non-boot CPUs */
+ register_cpu_notifier(&rcu_nb);
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
new file mode 100644
index 0000000..ad63af8
--- /dev/null
+++ b/kernel/rcupdate.c
@@ -0,0 +1,170 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ * Manfred Spraul <manfred@colorfullife.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+
+enum rcu_barrier {
+ RCU_BARRIER_STD,
+ RCU_BARRIER_BH,
+ RCU_BARRIER_SCHED,
+};
+
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
+/*
+ * Awaken the corresponding synchronize_rcu() instance now that a
+ * grace period has elapsed.
+ */
+void wakeme_after_rcu(struct rcu_head *head)
+{
+ struct rcu_synchronize *rcu;
+
+ rcu = container_of(head, struct rcu_synchronize, head);
+ complete(&rcu->completion);
+}
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void); /* Makes kernel-doc tools happy */
+synchronize_rcu_xxx(synchronize_rcu, call_rcu)
+EXPORT_SYMBOL_GPL(synchronize_rcu);
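+
+/*
+ * Illustrative updater-side sketch (hypothetical names): publish a new
+ * version of an RCU-protected pointer, wait for pre-existing readers,
+ * then free the old version:
+ *
+ *	old = gbl_ptr;
+ *	rcu_assign_pointer(gbl_ptr, new);
+ *	synchronize_rcu();	// all pre-existing readers have finished
+ *	kfree(old);
+ */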
+
+static void rcu_barrier_callback(struct rcu_head *notused)
+{
+ if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ complete(&rcu_barrier_completion);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_func(void *type)
+{
+ int cpu = smp_processor_id();
+ struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
+
+ atomic_inc(&rcu_barrier_cpu_count);
+ switch ((enum rcu_barrier)type) {
+ case RCU_BARRIER_STD:
+ call_rcu(head, rcu_barrier_callback);
+ break;
+ case RCU_BARRIER_BH:
+ call_rcu_bh(head, rcu_barrier_callback);
+ break;
+ case RCU_BARRIER_SCHED:
+ call_rcu_sched(head, rcu_barrier_callback);
+ break;
+ }
+}
+
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
+ */
+static void _rcu_barrier(enum rcu_barrier type)
+{
+ BUG_ON(in_interrupt());
+ /* Take cpucontrol mutex to protect against CPU hotplug */
+ mutex_lock(&rcu_barrier_mutex);
+ init_completion(&rcu_barrier_completion);
+ /*
+ * Initialize rcu_barrier_cpu_count to 1, then invoke
+ * rcu_barrier_func() on each CPU, so that each CPU also has
+ * incremented rcu_barrier_cpu_count. Only then is it safe to
+ * decrement rcu_barrier_cpu_count -- otherwise the first CPU
+ * might complete its grace period before all of the other CPUs
+ * did their increment, causing this function to return too
+ * early.
+ */
+ atomic_set(&rcu_barrier_cpu_count, 1);
+ on_each_cpu(rcu_barrier_func, (void *)type, 1);
+ if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ complete(&rcu_barrier_completion);
+ wait_for_completion(&rcu_barrier_completion);
+ mutex_unlock(&rcu_barrier_mutex);
+}
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+ _rcu_barrier(RCU_BARRIER_STD);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
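+
+/*
+ * Typical (illustrative) use of rcu_barrier(): a module that queued
+ * call_rcu() callbacks must wait for all of them to run before it can be
+ * unloaded, otherwise the callbacks would run in vanished module text:
+ *
+ *	static void __exit my_module_exit(void)
+ *	{
+ *		// stop queueing new callbacks first
+ *		rcu_barrier();	// wait for outstanding call_rcu() callbacks
+ *		// now it is safe to free module-private data
+ *	}
+ */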
+
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+ _rcu_barrier(RCU_BARRIER_BH);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+ _rcu_barrier(RCU_BARRIER_SCHED);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
+void __init rcu_init(void)
+{
+ __rcu_init();
+}
+
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 0000000..59236e8
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,1482 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
+ * for pushing me away from locks and towards counters, and
+ * to Suparna Bhattacharya for pushing me completely away
+ * from atomic instructions on the read side.
+ *
+ * - Added handling of Dynamic Ticks
+ * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
+ * - Steven Rostedt <srostedt@redhat.com>
+ *
+ * Papers: http://www.rdrop.com/users/paulmck/RCU
+ *
+ * Design Document: http://lwn.net/Articles/253651/
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/cpumask.h>
+#include <linux/rcupreempt_trace.h>
+#include <asm/byteorder.h>
+
+/*
+ * PREEMPT_RCU data structures.
+ */
+
+/*
+ * GP_STAGES specifies the number of times the state machine has
+ * to go through all of the rcu_try_flip_states (see below)
+ * in a single Grace Period.
+ *
+ * GP in GP_STAGES stands for Grace Period ;)
+ */
+#define GP_STAGES 2
+struct rcu_data {
+ spinlock_t lock; /* Protect rcu_data fields. */
+ long completed; /* Number of last completed batch. */
+ int waitlistcount;
+ struct rcu_head *nextlist;
+ struct rcu_head **nexttail;
+ struct rcu_head *waitlist[GP_STAGES];
+ struct rcu_head **waittail[GP_STAGES];
+ struct rcu_head *donelist; /* from waitlist & waitschedlist */
+ struct rcu_head **donetail;
+ long rcu_flipctr[2];
+ struct rcu_head *nextschedlist;
+ struct rcu_head **nextschedtail;
+ struct rcu_head *waitschedlist;
+ struct rcu_head **waitschedtail;
+ int rcu_sched_sleeping;
+#ifdef CONFIG_RCU_TRACE
+ struct rcupreempt_trace trace;
+#endif /* #ifdef CONFIG_RCU_TRACE */
+};
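+
+/*
+ * Callback flow through the lists above: call_rcu() appends to
+ * ->nextlist.  On each counter flip, __rcu_advance_callbacks() moves
+ * ->nextlist to ->waitlist[0], shifts the older ->waitlist[] stages up,
+ * and moves entries leaving ->waitlist[GP_STAGES - 1] to ->donelist,
+ * from which rcu_process_callbacks() invokes them.  call_rcu_sched()
+ * callbacks instead flow through ->nextschedlist and ->waitschedlist.
+ */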
+
+/*
+ * States for rcu_try_flip() and friends.
+ */
+
+enum rcu_try_flip_states {
+
+ /*
+	 * Stay here if nothing is happening. Flip the counter if something
+	 * starts happening. Denoted by "I".
+ */
+ rcu_try_flip_idle_state,
+
+ /*
+ * Wait here for all CPUs to notice that the counter has flipped. This
+ * prevents the old set of counters from ever being incremented once
+ * we leave this state, which in turn is necessary because we cannot
+ * test any individual counter for zero -- we can only check the sum.
+ * Denoted by "A".
+ */
+ rcu_try_flip_waitack_state,
+
+ /*
+ * Wait here for the sum of the old per-CPU counters to reach zero.
+ * Denoted by "Z".
+ */
+ rcu_try_flip_waitzero_state,
+
+ /*
+ * Wait here for each of the other CPUs to execute a memory barrier.
+ * This is necessary to ensure that these other CPUs really have
+ * completed executing their RCU read-side critical sections, despite
+ * their CPUs wildly reordering memory. Denoted by "M".
+ */
+ rcu_try_flip_waitmb_state,
+};
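+
+/*
+ * The state machine cycles I -> A -> Z -> M and back to I.  Each
+ * complete cycle performs one counter flip; as noted in the comment
+ * preceding rcu_try_flip(), at least GP_STAGES consecutive flips are
+ * needed to make up a full grace period.
+ */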
+
+/*
+ * States for rcu_ctrlblk.rcu_sched_sleep.
+ */
+
+enum rcu_sched_sleep_states {
+ rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
+ rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
+ rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
+};
+
+struct rcu_ctrlblk {
+ spinlock_t fliplock; /* Protect state-machine transitions. */
+ long completed; /* Number of last completed batch. */
+ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
+ the rcu state machine */
+ spinlock_t schedlock; /* Protect rcu_sched sleep state. */
+ enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+ wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
+};
+
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static struct rcu_ctrlblk rcu_ctrlblk = {
+ .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
+ .completed = 0,
+ .rcu_try_flip_state = rcu_try_flip_idle_state,
+ .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+ .sched_sleep = rcu_sched_not_sleeping,
+ .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
+};
+
+static struct task_struct *rcu_sched_grace_period_task;
+
+#ifdef CONFIG_RCU_TRACE
+static char *rcu_try_flip_state_names[] =
+ { "idle", "waitack", "waitzero", "waitmb" };
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has seen
+ * the most recent counter flip.
+ */
+
+enum rcu_flip_flag_values {
+ rcu_flip_seen, /* Steady/initial state, last flip seen. */
+ /* Only GP detector can update. */
+ rcu_flipped /* Flip just completed, need confirmation. */
+ /* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
+ = rcu_flip_seen;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has executed the
+ * needed memory barrier to fence in memory references from its last RCU
+ * read-side critical section in the just-completed grace period.
+ */
+
+enum rcu_mb_flag_values {
+ rcu_mb_done, /* Steady/initial state, no mb()s required. */
+ /* Only GP detector can update. */
+ rcu_mb_needed /* Flip just completed, need an mb(). */
+ /* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
+ = rcu_mb_done;
+
+/*
+ * RCU_DATA_ME: find the current CPU's rcu_data structure.
+ * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
+ */
+#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
+#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is not
+ * cached in a local variable, but where the CPU number is so cached.
+ */
+#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is not
+ * cached in a local variable.
+ */
+#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is pointed
+ * to by a local variable.
+ */
+#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
+
+#define RCU_SCHED_BATCH_TIME (HZ / 50)
+
+/*
+ * Return the number of RCU batches processed thus far. Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+void __rcu_read_lock(void)
+{
+ int idx;
+ struct task_struct *t = current;
+ int nesting;
+
+ nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
+ if (nesting != 0) {
+
+ /* An earlier rcu_read_lock() covers us, just count it. */
+
+ t->rcu_read_lock_nesting = nesting + 1;
+
+ } else {
+ unsigned long flags;
+
+ /*
+ * We disable interrupts for the following reasons:
+		 * - If we get a scheduling clock interrupt here, and we
+ * end up acking the counter flip, it's like a promise
+ * that we will never increment the old counter again.
+ * Thus we will break that promise if that
+ * scheduling clock interrupt happens between the time
+ * we pick the .completed field and the time that we
+ * increment our counter.
+ *
+ * - We don't want to be preempted out here.
+ *
+ * NMIs can still occur, of course, and might themselves
+ * contain rcu_read_lock().
+ */
+
+ local_irq_save(flags);
+
+ /*
+ * Outermost nesting of rcu_read_lock(), so increment
+ * the current counter for the current CPU. Use volatile
+ * casts to prevent the compiler from reordering.
+ */
+
+ idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
+ ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
+
+ /*
+ * Now that the per-CPU counter has been incremented, we
+ * are protected from races with rcu_read_lock() invoked
+ * from NMI handlers on this CPU. We can therefore safely
+ * increment the nesting counter, relieving further NMIs
+ * of the need to increment the per-CPU counter.
+ */
+
+ ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
+
+ /*
+		 * Now that we have prevented any NMIs from storing
+ * to the ->rcu_flipctr_idx, we can safely use it to
+ * remember which counter to decrement in the matching
+ * rcu_read_unlock().
+ */
+
+ ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
+ local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+void __rcu_read_unlock(void)
+{
+ int idx;
+ struct task_struct *t = current;
+ int nesting;
+
+ nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
+ if (nesting > 1) {
+
+ /*
+ * We are still protected by the enclosing rcu_read_lock(),
+ * so simply decrement the counter.
+ */
+
+ t->rcu_read_lock_nesting = nesting - 1;
+
+ } else {
+ unsigned long flags;
+
+ /*
+ * Disable local interrupts to prevent the grace-period
+ * detection state machine from seeing us half-done.
+ * NMIs can still occur, of course, and might themselves
+ * contain rcu_read_lock() and rcu_read_unlock().
+ */
+
+ local_irq_save(flags);
+
+ /*
+ * Outermost nesting of rcu_read_unlock(), so we must
+ * decrement the current counter for the current CPU.
+ * This must be done carefully, because NMIs can
+ * occur at any point in this code, and any rcu_read_lock()
+ * and rcu_read_unlock() pairs in the NMI handlers
+ * must interact non-destructively with this code.
+ * Lots of volatile casts, and -very- careful ordering.
+ *
+ * Changes to this code, including this one, must be
+ * inspected, validated, and tested extremely carefully!!!
+ */
+
+ /*
+ * First, pick up the index.
+ */
+
+ idx = ACCESS_ONCE(t->rcu_flipctr_idx);
+
+ /*
+ * Now that we have fetched the counter index, it is
+ * safe to decrement the per-task RCU nesting counter.
+ * After this, any interrupts or NMIs will increment and
+ * decrement the per-CPU counters.
+ */
+ ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
+
+ /*
+		 * It is now safe to decrement this CPU's per-CPU counter.
+		 * NMIs that occur after the nesting-count update above
+		 * will route their rcu_read_lock() calls through this
+		 * "else" clause, and will thus start incrementing the
+		 * per-CPU counter on their own. They will also clobber
+		 * ->rcu_flipctr_idx,
+ * but that is OK, since we have already fetched it.
+ */
+
+ ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
+ local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
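+
+/*
+ * Invariant relied upon by the grace-period state machine: once every
+ * CPU has acknowledged a counter flip, new readers increment only the
+ * new rcu_flipctr[] element.  The sum of the old element across all
+ * CPUs can then only decrease, reaching zero once every reader that
+ * began before the flip has executed its matching rcu_read_unlock().
+ */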
+
+/*
+ * If a global counter flip has occurred since the last time that we
+ * advanced callbacks, advance them. Hardware interrupts must be
+ * disabled when calling this function.
+ */
+static void __rcu_advance_callbacks(struct rcu_data *rdp)
+{
+ int cpu;
+ int i;
+ int wlc = 0;
+
+ if (rdp->completed != rcu_ctrlblk.completed) {
+ if (rdp->waitlist[GP_STAGES - 1] != NULL) {
+ *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
+ rdp->donetail = rdp->waittail[GP_STAGES - 1];
+ RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
+ }
+ for (i = GP_STAGES - 2; i >= 0; i--) {
+ if (rdp->waitlist[i] != NULL) {
+ rdp->waitlist[i + 1] = rdp->waitlist[i];
+ rdp->waittail[i + 1] = rdp->waittail[i];
+ wlc++;
+ } else {
+ rdp->waitlist[i + 1] = NULL;
+ rdp->waittail[i + 1] =
+ &rdp->waitlist[i + 1];
+ }
+ }
+ if (rdp->nextlist != NULL) {
+ rdp->waitlist[0] = rdp->nextlist;
+ rdp->waittail[0] = rdp->nexttail;
+ wlc++;
+ rdp->nextlist = NULL;
+ rdp->nexttail = &rdp->nextlist;
+ RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
+ } else {
+ rdp->waitlist[0] = NULL;
+ rdp->waittail[0] = &rdp->waitlist[0];
+ }
+ rdp->waitlistcount = wlc;
+ rdp->completed = rcu_ctrlblk.completed;
+ }
+
+ /*
+ * Check to see if this CPU needs to report that it has seen
+ * the most recent counter flip, thereby declaring that all
+ * subsequent rcu_read_lock() invocations will respect this flip.
+ */
+
+ cpu = raw_smp_processor_id();
+ if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
+ smp_mb(); /* Subsequent counter accesses must see new value */
+ per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
+ smp_mb(); /* Subsequent RCU read-side critical sections */
+ /* seen -after- acknowledgement. */
+ }
+}
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+ .dynticks = 1,
+};
+
+#ifdef CONFIG_NO_HZ
+static DEFINE_PER_CPU(int, rcu_update_flag);
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rcu_dyntick_sched.dynticks to let the RCU handling know that the
+ * CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+ int cpu = smp_processor_id();
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ if (per_cpu(rcu_update_flag, cpu))
+ per_cpu(rcu_update_flag, cpu)++;
+
+ /*
+ * Only update if we are coming from a stopped ticks mode
+ * (rcu_dyntick_sched.dynticks is even).
+ */
+ if (!in_interrupt() &&
+ (rdssp->dynticks & 0x1) == 0) {
+ /*
+ * The following might seem like we could have a race
+ * with NMI/SMIs. But this really isn't a problem.
+ * Here we do a read/modify/write, and the race happens
+ * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI, which is the most
+		 * important part.
+		 *
+		 * The only thing is that we would bring the counter back
+		 * to the position it was in during the NMI/SMI.
+ * But the zero bit would be set, so the rest of the
+ * counter would again be ignored.
+ *
+		 * On return from the IRQ, the counter may have its zero
+		 * bit clear and the same value as at the return from
+		 * the NMI/SMI. If the state machine was unlucky enough to
+ * see that, it still doesn't matter, since all
+ * RCU read-side critical sections on this CPU would
+ * have already completed.
+ */
+ rdssp->dynticks++;
+ /*
+ * The following memory barrier ensures that any
+ * rcu_read_lock() primitives in the irq handler
+ * are seen by other CPUs to follow the above
+ * increment to rcu_dyntick_sched.dynticks. This is
+ * required in order for other CPUs to correctly
+ * determine when it is safe to advance the RCU
+ * grace-period state machine.
+ */
+ smp_mb(); /* see above block comment. */
+ /*
+ * Since we can't determine the dynamic tick mode from
+ * the rcu_dyntick_sched.dynticks after this routine,
+ * we use a second flag to acknowledge that we came
+ * from an idle state with ticks stopped.
+ */
+ per_cpu(rcu_update_flag, cpu)++;
+ /*
+ * If we take an NMI/SMI now, they will also increment
+ * the rcu_update_flag, and will not update the
+ * rcu_dyntick_sched.dynticks on exit. That is for
+ * this IRQ to do.
+ */
+ }
+}
+
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates
+ * rcu_dyntick_sched.dynticks to let the RCU handling know that
+ * the CPU is going back to idle with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+ int cpu = smp_processor_id();
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ /*
+ * rcu_update_flag is set if we interrupted the CPU
+ * when it was idle with ticks stopped.
+ * Once this occurs, we keep track of interrupt nesting
+	 * because an NMI/SMI could also come in, and we still
+ * only want the IRQ that started the increment of the
+ * rcu_dyntick_sched.dynticks to be the one that modifies
+ * it on exit.
+ */
+ if (per_cpu(rcu_update_flag, cpu)) {
+ if (--per_cpu(rcu_update_flag, cpu))
+ return;
+
+ /* This must match the interrupt nesting */
+ WARN_ON(in_interrupt());
+
+ /*
+ * If an NMI/SMI happens now we are still
+ * protected by the rcu_dyntick_sched.dynticks being odd.
+ */
+
+ /*
+ * The following memory barrier ensures that any
+ * rcu_read_unlock() primitives in the irq handler
+	 * are seen by other CPUs to precede the following
+ * increment to rcu_dyntick_sched.dynticks. This
+ * is required in order for other CPUs to determine
+ * when it is safe to advance the RCU grace-period
+ * state machine.
+ */
+ smp_mb(); /* see above block comment. */
+ rdssp->dynticks++;
+ WARN_ON(rdssp->dynticks & 0x1);
+ }
+}
+
+static void dyntick_save_progress_counter(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->dynticks_snap = rdssp->dynticks;
+}
+
+static inline int
+rcu_try_flip_waitack_needed(int cpu)
+{
+ long curr;
+ long snap;
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ curr = rdssp->dynticks;
+ snap = rdssp->dynticks_snap;
+ smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+ /*
+ * If the CPU remained in dynticks mode for the entire time
+ * and didn't take any interrupts, NMIs, SMIs, or whatever,
+ * then it cannot be in the middle of an rcu_read_lock(), so
+ * the next rcu_read_lock() it executes must use the new value
+ * of the counter. So we can safely pretend that this CPU
+ * already acknowledged the counter.
+ */
+
+ if ((curr == snap) && ((curr & 0x1) == 0))
+ return 0;
+
+ /*
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq handlers, then, as above, we can safely pretend
+ * that this CPU already acknowledged the counter.
+ */
+
+ if ((curr - snap) > 2 || (curr & 0x1) == 0)
+ return 0;
+
+ /* We need this CPU to explicitly acknowledge the counter flip. */
+
+ return 1;
+}
+
+static inline int
+rcu_try_flip_waitmb_needed(int cpu)
+{
+ long curr;
+ long snap;
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ curr = rdssp->dynticks;
+ snap = rdssp->dynticks_snap;
+ smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+ /*
+ * If the CPU remained in dynticks mode for the entire time
+ * and didn't take any interrupts, NMIs, SMIs, or whatever,
+ * then it cannot have executed an RCU read-side critical section
+ * during that time, so there is no need for it to execute a
+ * memory barrier.
+ */
+
+ if ((curr == snap) && ((curr & 0x1) == 0))
+ return 0;
+
+ /*
+ * If the CPU either entered or exited an outermost interrupt,
+ * SMI, NMI, or whatever handler, then we know that it executed
+ * a memory barrier when doing so. So we don't need another one.
+ */
+ if (curr != snap)
+ return 0;
+
+ /* We need the CPU to execute a memory barrier. */
+
+ return 1;
+}
+
+static void dyntick_save_progress_counter_sched(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->sched_dynticks_snap = rdssp->dynticks;
+}
+
+static int rcu_qsctr_inc_needed_dyntick(int cpu)
+{
+ long curr;
+ long snap;
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ curr = rdssp->dynticks;
+ snap = rdssp->sched_dynticks_snap;
+ smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+ /*
+ * If the CPU remained in dynticks mode for the entire time
+ * and didn't take any interrupts, NMIs, SMIs, or whatever,
+ * then it cannot be in the middle of an rcu_read_lock(), so
+ * the next rcu_read_lock() it executes must use the new value
+ * of the counter. Therefore, this CPU has been in a quiescent
+ * state the entire time, and we don't need to wait for it.
+ */
+
+ if ((curr == snap) && ((curr & 0x1) == 0))
+ return 0;
+
+ /*
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq handlers, then, as above, this CPU has already
+ * passed through a quiescent state.
+ */
+
+ if ((curr - snap) > 2 || (snap & 0x1) == 0)
+ return 0;
+
+ /* We need this CPU to go through a quiescent state. */
+
+ return 1;
+}
+
+#else /* !CONFIG_NO_HZ */
+
+# define dyntick_save_progress_counter(cpu) do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu) (1)
+# define rcu_try_flip_waitmb_needed(cpu) (1)
+
+# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
+# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
+
+#endif /* CONFIG_NO_HZ */
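+
+/*
+ * Without CONFIG_NO_HZ there is no dynticks state to consult, so the
+ * stubs above always report that a CPU's acknowledgement, memory
+ * barrier, or quiescent state is still needed.
+ */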
+
+static void save_qsctr_sched(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->sched_qs_snap = rdssp->sched_qs;
+}
+
+static inline int rcu_qsctr_inc_needed(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ /*
+ * If there has been a quiescent state, no more need to wait
+ * on this CPU.
+ */
+
+ if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+ smp_mb(); /* force ordering with cpu entering schedule(). */
+ return 0;
+ }
+
+ /* We need this CPU to go through a quiescent state. */
+
+ return 1;
+}
+
+/*
+ * Get here when RCU is idle. Decide whether we need to
+ * move out of idle state, and return non-zero if so.
+ * "Straightforward" approach for the moment, might later
+ * use callback-list lengths, grace-period duration, or
+ * some such to determine when to exit idle state.
+ * Might also need a pre-idle test that does not acquire
+ * the lock, but let's get the simple case working first...
+ */
+
+static int
+rcu_try_flip_idle(void)
+{
+ int cpu;
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
+ if (!rcu_pending(smp_processor_id())) {
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
+ return 0;
+ }
+
+ /*
+ * Do the flip.
+ */
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
+ rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
+
+ /*
+ * Need a memory barrier so that other CPUs see the new
+ * counter value before they see the subsequent change of all
+ * the rcu_flip_flag instances to rcu_flipped.
+ */
+
+ smp_mb(); /* see above block comment. */
+
+ /* Now ask each CPU for acknowledgement of the flip. */
+
+ for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
+ per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+ dyntick_save_progress_counter(cpu);
+ }
+
+ return 1;
+}
+
+/*
+ * Wait for CPUs to acknowledge the flip.
+ */
+
+static int
+rcu_try_flip_waitack(void)
+{
+ int cpu;
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
+ for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+ if (rcu_try_flip_waitack_needed(cpu) &&
+ per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
+ return 0;
+ }
+
+ /*
+ * Make sure our checks above don't bleed into subsequent
+ * waiting for the sum of the counters to reach zero.
+ */
+
+ smp_mb(); /* see above block comment. */
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
+ return 1;
+}
+
+/*
+ * Wait for collective ``last'' counter to reach zero,
+ * then tell all CPUs to do an end-of-grace-period memory barrier.
+ */
+
+static int
+rcu_try_flip_waitzero(void)
+{
+ int cpu;
+ int lastidx = !(rcu_ctrlblk.completed & 0x1);
+ int sum = 0;
+
+ /* Check to see if the sum of the "last" counters is zero. */
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
+ for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+ sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
+ if (sum != 0) {
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
+ return 0;
+ }
+
+ /*
+ * This ensures that the other CPUs see the call for
+ * memory barriers -after- the sum to zero has been
+ * detected here
+ */
+ smp_mb(); /* ^^^^^^^^^^^^ */
+
+ /* Call for a memory barrier from each CPU. */
+ for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
+ per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+ dyntick_save_progress_counter(cpu);
+ }
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
+ return 1;
+}
+
+/*
+ * Wait for all CPUs to do their end-of-grace-period memory barrier.
+ * Return 1 once all CPUs have done so, otherwise return 0.
+ */
+
+static int
+rcu_try_flip_waitmb(void)
+{
+ int cpu;
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
+ for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+ if (rcu_try_flip_waitmb_needed(cpu) &&
+ per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
+ return 0;
+ }
+
+ smp_mb(); /* Ensure that the above checks precede any following flip. */
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
+ return 1;
+}
+
+/*
+ * Attempt a single flip of the counters. Remember, a single flip does
+ * -not- constitute a grace period. Instead, the interval between
+ * at least GP_STAGES consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void rcu_try_flip(void)
+{
+ unsigned long flags;
+
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
+ if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
+ RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
+ return;
+ }
+
+ /*
+ * Take the next transition(s) through the RCU grace-period
+ * flip-counter state machine.
+ */
+
+ switch (rcu_ctrlblk.rcu_try_flip_state) {
+ case rcu_try_flip_idle_state:
+ if (rcu_try_flip_idle())
+ rcu_ctrlblk.rcu_try_flip_state =
+ rcu_try_flip_waitack_state;
+ break;
+ case rcu_try_flip_waitack_state:
+ if (rcu_try_flip_waitack())
+ rcu_ctrlblk.rcu_try_flip_state =
+ rcu_try_flip_waitzero_state;
+ break;
+ case rcu_try_flip_waitzero_state:
+ if (rcu_try_flip_waitzero())
+ rcu_ctrlblk.rcu_try_flip_state =
+ rcu_try_flip_waitmb_state;
+ break;
+ case rcu_try_flip_waitmb_state:
+ if (rcu_try_flip_waitmb())
+ rcu_ctrlblk.rcu_try_flip_state =
+ rcu_try_flip_idle_state;
+ }
+ spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
+
+/*
+ * Check to see if this CPU needs to do a memory barrier in order to
+ * ensure that any prior RCU read-side critical sections have committed
+ * their counter manipulations and critical-section memory references
+ * before declaring the grace period to be completed.
+ */
+static void rcu_check_mb(int cpu)
+{
+ if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
+ smp_mb(); /* Ensure RCU read-side accesses are visible. */
+ per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
+ }
+}
+
+void rcu_check_callbacks(int cpu, int user)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+ /*
+ * If this CPU took its interrupt from user mode or from the
+ * idle loop, and this is not a nested interrupt, then
+	 * this CPU has to have exited all prior preempt-disable
+ * sections of code. So increment the counter to note this.
+ *
+ * The memory barrier is needed to handle the case where
+ * writes from a preempt-disable section of code get reordered
+ * into schedule() by this CPU's write buffer. So the memory
+ * barrier makes sure that the rcu_qsctr_inc() is seen by other
+ * CPUs to happen after any such write.
+ */
+
+ if (user ||
+ (idle_cpu(cpu) && !in_softirq() &&
+ hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ smp_mb(); /* Guard against aggressive schedule(). */
+ rcu_qsctr_inc(cpu);
+ }
+
+ rcu_check_mb(cpu);
+ if (rcu_ctrlblk.completed == rdp->completed)
+ rcu_try_flip();
+ spin_lock_irqsave(&rdp->lock, flags);
+ RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
+ __rcu_advance_callbacks(rdp);
+ if (rdp->donelist == NULL) {
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ } else {
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ raise_softirq(RCU_SOFTIRQ);
+ }
+}
+
+/*
+ * Needed by dynticks, to make sure all RCU processing has finished
+ * when we go idle:
+ */
+void rcu_advance_callbacks(int cpu, int user)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+ if (rcu_ctrlblk.completed == rdp->completed) {
+ rcu_try_flip();
+ if (rcu_ctrlblk.completed == rdp->completed)
+ return;
+ }
+ spin_lock_irqsave(&rdp->lock, flags);
+ RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
+ __rcu_advance_callbacks(rdp);
+ spin_unlock_irqrestore(&rdp->lock, flags);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
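+
+/*
+ * Splice the source callback list onto the end of the destination list
+ * (dsttail must point to the destination's tail pointer), then reset
+ * the source list to empty.  Written as a macro so that it can update
+ * the caller's list-head and tail-pointer variables in place.
+ */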
+#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
+ *dsttail = srclist; \
+ if (srclist != NULL) { \
+ dsttail = srctail; \
+ srclist = NULL; \
+ srctail = &srclist;\
+ } \
+ } while (0)
+
+void rcu_offline_cpu(int cpu)
+{
+ int i;
+ struct rcu_head *list = NULL;
+ unsigned long flags;
+ struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+ struct rcu_head *schedlist = NULL;
+ struct rcu_head **schedtail = &schedlist;
+ struct rcu_head **tail = &list;
+
+ /*
+ * Remove all callbacks from the newly dead CPU, retaining order.
+	 * Otherwise rcu_barrier() will fail.
+ */
+
+ spin_lock_irqsave(&rdp->lock, flags);
+ rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
+ for (i = GP_STAGES - 1; i >= 0; i--)
+ rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
+ list, tail);
+ rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+ rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+ schedlist, schedtail);
+ rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+ schedlist, schedtail);
+ rdp->rcu_sched_sleeping = 0;
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ rdp->waitlistcount = 0;
+
+ /* Disengage the newly dead CPU from the grace-period computation. */
+
+ spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+ rcu_check_mb(cpu);
+ if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
+ smp_mb(); /* Subsequent counter accesses must see new value */
+ per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
+ smp_mb(); /* Subsequent RCU read-side critical sections */
+ /* seen -after- acknowledgement. */
+ }
+
+ RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
+ RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
+
+ RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
+ RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
+
+ cpu_clear(cpu, rcu_cpu_online_map);
+
+ spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+ /*
+ * Place the removed callbacks on the current CPU's queue.
+ * Make them all start a new grace period: simple approach,
+ * in theory could starve a given set of callbacks, but
+ * you would need to be doing some serious CPU hotplugging
+ * to make this happen. If this becomes a problem, adding
+ * a synchronize_rcu() to the hotplug path would be a simple
+ * fix.
+ */
+
+ local_irq_save(flags); /* disable preempt till we know what lock. */
+ rdp = RCU_DATA_ME();
+ spin_lock(&rdp->lock);
+ *rdp->nexttail = list;
+ if (list)
+ rdp->nexttail = tail;
+ *rdp->nextschedtail = schedlist;
+ if (schedlist)
+ rdp->nextschedtail = schedtail;
+ spin_unlock_irqrestore(&rdp->lock, flags);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+void __cpuinit rcu_online_cpu(int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+ cpu_set(cpu, rcu_cpu_online_map);
+ spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+ /*
+ * The rcu_sched grace-period processing might have bypassed
+ * this CPU, given that it was not in the rcu_cpu_online_map
+ * when the grace-period scan started. This means that the
+ * grace-period task might sleep. So make sure that if this
+ * should happen, the first callback posted to this CPU will
+ * wake up the grace-period task if need be.
+ */
+
+ rdp = RCU_DATA_CPU(cpu);
+ spin_lock_irqsave(&rdp->lock, flags);
+ rdp->rcu_sched_sleeping = 1;
+ spin_unlock_irqrestore(&rdp->lock, flags);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+ unsigned long flags;
+ struct rcu_head *next, *list;
+ struct rcu_data *rdp;
+
+ local_irq_save(flags);
+ rdp = RCU_DATA_ME();
+ spin_lock(&rdp->lock);
+ list = rdp->donelist;
+ if (list == NULL) {
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ return;
+ }
+ rdp->donelist = NULL;
+ rdp->donetail = &rdp->donelist;
+ RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ while (list) {
+ next = list->next;
+ list->func(list);
+ list = next;
+ RCU_TRACE_ME(rcupreempt_trace_invoke);
+ }
+}
+
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ head->func = func;
+ head->next = NULL;
+ local_irq_save(flags);
+ rdp = RCU_DATA_ME();
+ spin_lock(&rdp->lock);
+ __rcu_advance_callbacks(rdp);
+ *rdp->nexttail = head;
+ rdp->nexttail = &head->next;
+ RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
+ spin_unlock_irqrestore(&rdp->lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
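+
+/*
+ * Illustrative usage sketch (not part of the upstream file): embed a
+ * struct rcu_head in the protected structure and free it from the
+ * callback.  "struct foo" and foo_reclaim() are hypothetical names.
+ *
+ *	static void foo_reclaim(struct rcu_head *rcu)
+ *	{
+ *		struct foo *fp = container_of(rcu, struct foo, rcu);
+ *		kfree(fp);
+ *	}
+ *
+ *	list_del_rcu(&fp->list);
+ *	call_rcu(&fp->rcu, foo_reclaim);
+ */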
+
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ int wake_gp = 0;
+
+ head->func = func;
+ head->next = NULL;
+ local_irq_save(flags);
+ rdp = RCU_DATA_ME();
+ spin_lock(&rdp->lock);
+ *rdp->nextschedtail = head;
+ rdp->nextschedtail = &head->next;
+ if (rdp->rcu_sched_sleeping) {
+
+ /* Grace-period processing might be sleeping... */
+
+ rdp->rcu_sched_sleeping = 0;
+ wake_gp = 1;
+ }
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ if (wake_gp) {
+
+ /* Wake up grace-period processing, unless someone beat us. */
+
+ spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+ if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+ wake_gp = 0;
+ rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ if (wake_gp)
+ wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+ }
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
+/*
+ * Wait until all currently running preempt_disable() code segments
+ * (including hardware-irq-disable segments) complete. Note that
+ * in -rt this does -not- necessarily result in all currently executing
+ * interrupt -handlers- having completed.
+ */
+synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+
+/*
+ * kthread function that manages call_rcu_sched grace periods.
+ */
+static int rcu_sched_grace_period(void *arg)
+{
+ int couldsleep; /* might sleep after current pass. */
+ int couldsleepnext = 0; /* might sleep after next pass. */
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ int ret;
+
+ /*
+ * Each pass through the following loop handles one
+ * rcu_sched grace period cycle.
+ */
+ do {
+ /* Save each CPU's current state. */
+
+ for_each_online_cpu(cpu) {
+ dyntick_save_progress_counter_sched(cpu);
+ save_qsctr_sched(cpu);
+ }
+
+ /*
+ * Sleep for about an RCU grace-period's worth to
+ * allow better batching and to consume less CPU.
+ */
+ schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+
+ /*
+ * If there was nothing to do last time, prepare to
+ * sleep at the end of the current grace period cycle.
+ */
+ couldsleep = couldsleepnext;
+ couldsleepnext = 1;
+ if (couldsleep) {
+ spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+ rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ }
+
+ /*
+ * Wait on each CPU in turn to have either visited
+ * a quiescent state or been in dynticks-idle mode.
+ */
+ for_each_online_cpu(cpu) {
+ while (rcu_qsctr_inc_needed(cpu) &&
+ rcu_qsctr_inc_needed_dyntick(cpu)) {
+ /* resched_cpu(cpu); @@@ */
+ schedule_timeout_interruptible(1);
+ }
+ }
+
+ /* Advance callbacks for each CPU. */
+
+ for_each_online_cpu(cpu) {
+
+ rdp = RCU_DATA_CPU(cpu);
+ spin_lock_irqsave(&rdp->lock, flags);
+
+ /*
+ * We are running on this CPU irq-disabled, so no
+ * CPU can go offline until we re-enable irqs.
+ * The current CPU might have already gone
+			 * offline (between the for_each_online_cpu and
+ * the spin_lock_irqsave), but in that case all its
+ * callback lists will be empty, so no harm done.
+ *
+ * Advance the callbacks! We share normal RCU's
+ * donelist, since callbacks are invoked the
+ * same way in either case.
+ */
+ if (rdp->waitschedlist != NULL) {
+ *rdp->donetail = rdp->waitschedlist;
+ rdp->donetail = rdp->waitschedtail;
+
+ /*
+ * Next rcu_check_callbacks() will
+ * do the required raise_softirq().
+ */
+ }
+ if (rdp->nextschedlist != NULL) {
+ rdp->waitschedlist = rdp->nextschedlist;
+ rdp->waitschedtail = rdp->nextschedtail;
+ couldsleep = 0;
+ couldsleepnext = 0;
+ } else {
+ rdp->waitschedlist = NULL;
+ rdp->waitschedtail = &rdp->waitschedlist;
+ }
+ rdp->nextschedlist = NULL;
+ rdp->nextschedtail = &rdp->nextschedlist;
+
+ /* Mark sleep intention. */
+
+ rdp->rcu_sched_sleeping = couldsleep;
+
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ }
+
+ /* If we saw callbacks on the last scan, go deal with them. */
+
+ if (!couldsleep)
+ continue;
+
+ /* Attempt to block... */
+
+ spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+ if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+
+ /*
+ * Someone posted a callback after we scanned.
+ * Go take care of it.
+ */
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ couldsleepnext = 0;
+ continue;
+ }
+
+ /* Block until the next person posts a callback. */
+
+ rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ ret = 0;
+ __wait_event_interruptible(rcu_ctrlblk.sched_wq,
+ rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+ ret);
+
+ /*
+ * Signals would prevent us from sleeping, and we cannot
+ * do much with them in any case. So flush them.
+ */
+ if (ret)
+ flush_signals(current);
+ couldsleepnext = 0;
+
+ } while (!kthread_should_stop());
+
+ return (0);
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. Assumes that notifiers would take care of handling any
+ * outstanding requests from the RCU core.
+ *
+ * This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+ struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+ return (rdp->donelist != NULL ||
+ !!rdp->waitlistcount ||
+ rdp->nextlist != NULL ||
+ rdp->nextschedlist != NULL ||
+ rdp->waitschedlist != NULL);
+}
+
+int rcu_pending(int cpu)
+{
+ struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+ /* The CPU has at least one callback queued somewhere. */
+
+ if (rdp->donelist != NULL ||
+ !!rdp->waitlistcount ||
+ rdp->nextlist != NULL ||
+ rdp->nextschedlist != NULL ||
+ rdp->waitschedlist != NULL)
+ return 1;
+
+ /* The RCU core needs an acknowledgement from this CPU. */
+
+ if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
+ (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
+ return 1;
+
+ /* This CPU has fallen behind the global grace-period number. */
+
+ if (rdp->completed != rcu_ctrlblk.completed)
+ return 1;
+
+ /* Nothing needed from this CPU. */
+
+ return 0;
+}
+
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ rcu_online_cpu(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ rcu_offline_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+ .notifier_call = rcu_cpu_notify,
+};
+
+void __init __rcu_init(void)
+{
+ int cpu;
+ int i;
+ struct rcu_data *rdp;
+
+ printk(KERN_NOTICE "Preemptible RCU implementation.\n");
+ for_each_possible_cpu(cpu) {
+ rdp = RCU_DATA_CPU(cpu);
+ spin_lock_init(&rdp->lock);
+ rdp->completed = 0;
+ rdp->waitlistcount = 0;
+ rdp->nextlist = NULL;
+ rdp->nexttail = &rdp->nextlist;
+ for (i = 0; i < GP_STAGES; i++) {
+ rdp->waitlist[i] = NULL;
+ rdp->waittail[i] = &rdp->waitlist[i];
+ }
+ rdp->donelist = NULL;
+ rdp->donetail = &rdp->donelist;
+ rdp->rcu_flipctr[0] = 0;
+ rdp->rcu_flipctr[1] = 0;
+ rdp->nextschedlist = NULL;
+ rdp->nextschedtail = &rdp->nextschedlist;
+ rdp->waitschedlist = NULL;
+ rdp->waitschedtail = &rdp->waitschedlist;
+ rdp->rcu_sched_sleeping = 0;
+ }
+ register_cpu_notifier(&rcu_nb);
+
+ /*
+ * We don't need protection against CPU-Hotplug here
+ * since
+ * a) If a CPU comes online while we are iterating over the
+ * cpu_online_map below, we would only end up making a
+ * duplicate call to rcu_online_cpu() which sets the corresponding
+ * CPU's mask in the rcu_cpu_online_map.
+ *
+ * b) A CPU cannot go offline at this point in time since the user
+ * does not have access to the sysfs interface, nor do we
+ * suspend the system.
+ */
+ for_each_online_cpu(cpu)
+ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
+
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Late-boot-time RCU initialization that must wait until after scheduler
+ * has been initialized.
+ */
+void __init rcu_init_sched(void)
+{
+ rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+ NULL,
+ "rcu_sched_grace_period");
+ WARN_ON(IS_ERR(rcu_sched_grace_period_task));
+}
+
+#ifdef CONFIG_RCU_TRACE
+long *rcupreempt_flipctr(int cpu)
+{
+ return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
+}
+EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
+
+int rcupreempt_flip_flag(int cpu)
+{
+ return per_cpu(rcu_flip_flag, cpu);
+}
+EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
+
+int rcupreempt_mb_flag(int cpu)
+{
+ return per_cpu(rcu_mb_flag, cpu);
+}
+EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
+
+char *rcupreempt_try_flip_state_name(void)
+{
+ return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
+}
+EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
+
+struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
+{
+ struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+ return &rdp->trace;
+}
+EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
new file mode 100644
index 0000000..35c2d33
--- /dev/null
+++ b/kernel/rcupreempt_trace.c
@@ -0,0 +1,334 @@
+/*
+ * Read-Copy Update tracing for realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Papers: http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/rcupreempt_trace.h>
+#include <linux/debugfs.h>
+
+static struct mutex rcupreempt_trace_mutex;
+static char *rcupreempt_trace_buf;
+#define RCUPREEMPT_TRACE_BUF_SIZE 4096
+
+void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
+{
+ trace->done_length += trace->wait_length;
+ trace->done_add += trace->wait_length;
+ trace->wait_length = 0;
+}
+void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
+{
+ trace->wait_length += trace->next_length;
+ trace->wait_add += trace->next_length;
+ trace->next_length = 0;
+}
+void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
+{
+ atomic_inc(&trace->rcu_try_flip_1);
+}
+void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
+{
+ atomic_inc(&trace->rcu_try_flip_e1);
+}
+void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_i1++;
+}
+void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_ie1++;
+}
+void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_g1++;
+}
+void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_a1++;
+}
+void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_ae1++;
+}
+void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_a2++;
+}
+void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_z1++;
+}
+void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_ze1++;
+}
+void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_z2++;
+}
+void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_m1++;
+}
+void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_me1++;
+}
+void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
+{
+ trace->rcu_try_flip_m2++;
+}
+void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
+{
+ trace->rcu_check_callbacks++;
+}
+void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
+{
+ trace->done_remove += trace->done_length;
+ trace->done_length = 0;
+}
+void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
+{
+ atomic_inc(&trace->done_invoked);
+}
+void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
+{
+ trace->next_add++;
+ trace->next_length++;
+}
+
+static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
+{
+ struct rcupreempt_trace *cp;
+ int cpu;
+
+ memset(sp, 0, sizeof(*sp));
+ for_each_possible_cpu(cpu) {
+ cp = rcupreempt_trace_cpu(cpu);
+ sp->next_length += cp->next_length;
+ sp->next_add += cp->next_add;
+ sp->wait_length += cp->wait_length;
+ sp->wait_add += cp->wait_add;
+ sp->done_length += cp->done_length;
+ sp->done_add += cp->done_add;
+ sp->done_remove += cp->done_remove;
+ atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
+ sp->rcu_check_callbacks += cp->rcu_check_callbacks;
+ atomic_set(&sp->rcu_try_flip_1,
+ atomic_read(&cp->rcu_try_flip_1));
+ atomic_set(&sp->rcu_try_flip_e1,
+ atomic_read(&cp->rcu_try_flip_e1));
+ sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
+ sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
+ sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
+ sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
+ sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
+ sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
+ sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
+ sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
+ sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
+ sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
+ sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
+ sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
+ }
+}
+
+static ssize_t rcustats_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct rcupreempt_trace trace;
+ ssize_t bcount;
+ int cnt = 0;
+
+ rcupreempt_trace_sum(&trace);
+ mutex_lock(&rcupreempt_trace_mutex);
+	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+ "ggp=%ld rcc=%ld\n",
+ rcu_batches_completed(),
+ trace.rcu_check_callbacks);
+ snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+ "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
+ "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
+ "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
+
+ trace.next_add, trace.next_length,
+ trace.wait_add, trace.wait_length,
+ trace.done_add, trace.done_length,
+ trace.done_remove, atomic_read(&trace.done_invoked),
+ atomic_read(&trace.rcu_try_flip_1),
+ atomic_read(&trace.rcu_try_flip_e1),
+ trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
+ trace.rcu_try_flip_g1,
+ trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
+ trace.rcu_try_flip_a2,
+ trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
+ trace.rcu_try_flip_z2,
+ trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
+ trace.rcu_try_flip_m2);
+ bcount = simple_read_from_buffer(buffer, count, ppos,
+ rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+ mutex_unlock(&rcupreempt_trace_mutex);
+ return bcount;
+}
+
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ long oldgp = rcu_batches_completed();
+ ssize_t bcount;
+
+ mutex_lock(&rcupreempt_trace_mutex);
+ synchronize_rcu();
+ snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
+ "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
+ bcount = simple_read_from_buffer(buffer, count, ppos,
+ rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+ mutex_unlock(&rcupreempt_trace_mutex);
+ return bcount;
+}
+
+static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ int cnt = 0;
+ int cpu;
+ int f = rcu_batches_completed() & 0x1;
+ ssize_t bcount;
+
+ mutex_lock(&rcupreempt_trace_mutex);
+
+	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+ "CPU last cur F M\n");
+ for_each_online_cpu(cpu) {
+ long *flipctr = rcupreempt_flipctr(cpu);
+ cnt += snprintf(&rcupreempt_trace_buf[cnt],
+ RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+ "%3d %4ld %3ld %d %d\n",
+ cpu,
+ flipctr[!f],
+ flipctr[f],
+ rcupreempt_flip_flag(cpu),
+ rcupreempt_mb_flag(cpu));
+ }
+ cnt += snprintf(&rcupreempt_trace_buf[cnt],
+ RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+ "ggp = %ld, state = %s\n",
+ rcu_batches_completed(),
+ rcupreempt_try_flip_state_name());
+ cnt += snprintf(&rcupreempt_trace_buf[cnt],
+ RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+ "\n");
+ bcount = simple_read_from_buffer(buffer, count, ppos,
+ rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+ mutex_unlock(&rcupreempt_trace_mutex);
+ return bcount;
+}
+
+static struct file_operations rcustats_fops = {
+ .owner = THIS_MODULE,
+ .read = rcustats_read,
+};
+
+static struct file_operations rcugp_fops = {
+ .owner = THIS_MODULE,
+ .read = rcugp_read,
+};
+
+static struct file_operations rcuctrs_fops = {
+ .owner = THIS_MODULE,
+ .read = rcuctrs_read,
+};
+
+static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
+static int rcupreempt_debugfs_init(void)
+{
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto out;
+ statdir = debugfs_create_file("rcustats", 0444, rcudir,
+ NULL, &rcustats_fops);
+ if (!statdir)
+ goto free_out;
+
+ gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+ if (!gpdir)
+ goto free_out;
+
+ ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
+ NULL, &rcuctrs_fops);
+ if (!ctrsdir)
+ goto free_out;
+ return 0;
+free_out:
+ if (statdir)
+ debugfs_remove(statdir);
+ if (gpdir)
+ debugfs_remove(gpdir);
+ debugfs_remove(rcudir);
+out:
+ return 1;
+}
+
+static int __init rcupreempt_trace_init(void)
+{
+ int ret;
+
+ mutex_init(&rcupreempt_trace_mutex);
+ rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+ if (!rcupreempt_trace_buf)
+ return 1;
+ ret = rcupreempt_debugfs_init();
+ if (ret)
+ kfree(rcupreempt_trace_buf);
+ return ret;
+}
+
+static void __exit rcupreempt_trace_cleanup(void)
+{
+ debugfs_remove(statdir);
+ debugfs_remove(gpdir);
+ debugfs_remove(ctrsdir);
+ debugfs_remove(rcudir);
+ kfree(rcupreempt_trace_buf);
+}
+
+
+module_init(rcupreempt_trace_init);
+module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
new file mode 100644
index 0000000..85cb905
--- /dev/null
+++ b/kernel/rcutorture.c
@@ -0,0 +1,1157 @@
+/*
+ * Read-Copy Update module-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Josh Triplett <josh@freedesktop.org>
+ *
+ * See also: Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
+ "Josh Triplett <josh@freedesktop.org>");
+
+static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
+static int nfakewriters = 4; /* # fake writer threads */
+static int stat_interval; /* Interval between stats, in seconds. */
+ /* Defaults to "only at end of test". */
+static int verbose; /* Print more debug info. */
+static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
+static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
+static int stutter = 5; /* Start/stop testing interval (in sec) */
+static int irqreader = 1; /* RCU readers from irq (timers). */
+static char *torture_type = "rcu"; /* What RCU implementation to torture. */
+
+module_param(nreaders, int, 0444);
+MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+module_param(nfakewriters, int, 0444);
+MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
+module_param(stat_interval, int, 0444);
+MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
+module_param(verbose, bool, 0444);
+MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
+module_param(test_no_idle_hz, bool, 0444);
+MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
+module_param(shuffle_interval, int, 0444);
+MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
+module_param(stutter, int, 0444);
+MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
+module_param(irqreader, int, 0444);
+MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
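+
+/*
+ * Illustrative invocation (not part of the upstream file); the
+ * parameter values below are arbitrary examples:
+ *
+ *	modprobe rcutorture nreaders=4 stat_interval=30 torture_type=rcu_bh
+ */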
+
+#define TORTURE_FLAG "-torture:"
+#define PRINTK_STRING(s) \
+ do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
+#define VERBOSE_PRINTK_STRING(s) \
+ do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
+#define VERBOSE_PRINTK_ERRSTRING(s) \
+ do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
+
+static char printk_buf[4096];
+
+static int nrealreaders;
+static struct task_struct *writer_task;
+static struct task_struct **fakewriter_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+static struct task_struct *shuffler_task;
+static struct task_struct *stutter_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+ struct rcu_head rtort_rcu;
+ int rtort_pipe_count;
+ struct list_head rtort_free;
+ int rtort_mbtest;
+};
+
+static int fullstop = 0; /* stop generating callbacks at test end. */
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture *rcu_torture_current = NULL;
+static long rcu_torture_current_version = 0;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
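+/*
+ * rcu_torture_count[] tallies, per CPU, how many reads found the current
+ * element at each pipeline stage; rcu_torture_batch[] tallies how many
+ * grace periods completed during each read-side critical section.
+ */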
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
+ { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
+ { 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static atomic_t n_rcu_torture_alloc;
+static atomic_t n_rcu_torture_alloc_fail;
+static atomic_t n_rcu_torture_free;
+static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_timers = 0;
+static struct list_head rcu_torture_removed;
+
+static int stutter_pause_test = 0;
+
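+/*
+ * Start the test running immediately when built as a module or when
+ * CONFIG_RCU_TORTURE_TEST_RUNNABLE is set; otherwise stay idle until
+ * rcutorture_runnable is set nonzero at run time (e.g. via sysctl,
+ * where available).
+ */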
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+static struct rcu_torture *
+rcu_torture_alloc(void)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&rcu_torture_lock);
+ if (list_empty(&rcu_torture_freelist)) {
+ atomic_inc(&n_rcu_torture_alloc_fail);
+ spin_unlock_bh(&rcu_torture_lock);
+ return NULL;
+ }
+ atomic_inc(&n_rcu_torture_alloc);
+ p = rcu_torture_freelist.next;
+ list_del_init(p);
+ spin_unlock_bh(&rcu_torture_lock);
+ return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+ atomic_inc(&n_rcu_torture_free);
+ spin_lock_bh(&rcu_torture_lock);
+ list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+ spin_unlock_bh(&rcu_torture_lock);
+}
+
+struct rcu_random_state {
+ unsigned long rrs_state;
+ long rrs_count;
+};
+
+#define RCU_RANDOM_MULT 39916801 /* prime */
+#define RCU_RANDOM_ADD 479001701 /* prime */
+#define RCU_RANDOM_REFRESH 10000
+
+#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from cpu_clock().
+ */
+static unsigned long
+rcu_random(struct rcu_random_state *rrsp)
+{
+ if (--rrsp->rrs_count < 0) {
+ rrsp->rrs_state +=
+ (unsigned long)cpu_clock(raw_smp_processor_id());
+ rrsp->rrs_count = RCU_RANDOM_REFRESH;
+ }
+ rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
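+ /*
+ * swahw32() swaps the two 16-bit halfwords, so the low-order bits handed
+ * back to callers come from higher-order LCG state, which is typically
+ * better mixed than the raw low-order bits.
+ */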
+ return swahw32(rrsp->rrs_state);
+}
+
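+/*
+ * Block while the stutter logic has testing paused, or (with a longer
+ * sleep) while the test has been disabled via rcutorture_runnable.
+ */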
+static void
+rcu_stutter_wait(void)
+{
+ while (stutter_pause_test || !rcutorture_runnable)
+ if (rcutorture_runnable)
+ schedule_timeout_interruptible(1);
+ else
+ schedule_timeout_interruptible(round_jiffies_relative(HZ));
+}
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+ void (*init)(void);
+ void (*cleanup)(void);
+ int (*readlock)(void);
+ void (*readdelay)(struct rcu_random_state *rrsp);
+ void (*readunlock)(int idx);
+ int (*completed)(void);
+ void (*deferredfree)(struct rcu_torture *p);
+ void (*sync)(void);
+ void (*cb_barrier)(void);
+ int (*stats)(char *page);
+ int irqcapable;
+ char *name;
+};
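+/*
+ * Note: .init, .cleanup, .cb_barrier and .stats may be NULL for flavors
+ * that have no such operation; callers check for NULL before invoking.
+ */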
+static struct rcu_torture_ops *cur_ops = NULL;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_read_delay(struct rcu_random_state *rrsp)
+{
+ long delay;
+ const long longdelay = 200;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
+ if (!delay)
+ udelay(longdelay);
+}
+
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+ return rcu_batches_completed();
+}
+
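+/*
+ * RCU callback invoked after each grace period: advance the element one
+ * stage down the pipeline, freeing it once it has passed through
+ * RCU_TORTURE_PIPE_LEN grace periods.
+ */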
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ int i;
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (fullstop) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ rcu_torture_free(rp);
+ } else
+ cur_ops->deferredfree(rp);
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .readdelay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferredfree = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .cb_barrier = rcu_barrier,
+ .stats = NULL,
+ .irqcapable = 1,
+ .name = "rcu"
+};
+
+static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
+{
+ int i;
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ cur_ops->sync();
+ list_add(&p->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
+static void rcu_sync_torture_init(void)
+{
+ INIT_LIST_HEAD(&rcu_torture_removed);
+}
+
+static struct rcu_torture_ops rcu_sync_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .readdelay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .irqcapable = 1,
+ .name = "rcu_sync"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+ return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+struct rcu_bh_torture_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
+
+static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
+{
+ struct rcu_bh_torture_synchronize *rcu;
+
+ rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
+ complete(&rcu->completion);
+}
+
+static void rcu_bh_torture_synchronize(void)
+{
+ struct rcu_bh_torture_synchronize rcu;
+
+ init_completion(&rcu.completion);
+ call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
+ wait_for_completion(&rcu.completion);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferredfree = rcu_bh_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .cb_barrier = rcu_barrier_bh,
+ .stats = NULL,
+ .irqcapable = 1,
+ .name = "rcu_bh"
+};
+
+static struct rcu_torture_ops rcu_bh_sync_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .irqcapable = 1,
+ .name = "rcu_bh_sync"
+};
+
+/*
+ * Definitions for srcu torture testing.
+ */
+
+static struct srcu_struct srcu_ctl;
+
+static void srcu_torture_init(void)
+{
+ init_srcu_struct(&srcu_ctl);
+ rcu_sync_torture_init();
+}
+
+static void srcu_torture_cleanup(void)
+{
+ synchronize_srcu(&srcu_ctl);
+ cleanup_srcu_struct(&srcu_ctl);
+}
+
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+{
+ return srcu_read_lock(&srcu_ctl);
+}
+
+static void srcu_read_delay(struct rcu_random_state *rrsp)
+{
+ long delay;
+ const long uspertick = 1000000 / HZ;
+ const long longdelay = 10;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
+ if (!delay)
+ schedule_timeout_interruptible(longdelay);
+}
+
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+{
+ srcu_read_unlock(&srcu_ctl, idx);
+}
+
+static int srcu_torture_completed(void)
+{
+ return srcu_batches_completed(&srcu_ctl);
+}
+
+static void srcu_torture_synchronize(void)
+{
+ synchronize_srcu(&srcu_ctl);
+}
+
+static int srcu_torture_stats(char *page)
+{
+ int cnt = 0;
+ int cpu;
+ int idx = srcu_ctl.completed & 0x1;
+
+ cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
+ for_each_possible_cpu(cpu) {
+ cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+ per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
+ per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
+ }
+ cnt += sprintf(&page[cnt], "\n");
+ return cnt;
+}
+
+static struct rcu_torture_ops srcu_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .readdelay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched torture testing.
+ */
+
+static int sched_torture_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void sched_torture_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+static int sched_torture_completed(void)
+{
+ return 0;
+}
+
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static void sched_torture_synchronize(void)
+{
+ synchronize_sched();
+}
+
+static struct rcu_torture_ops sched_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferredfree = rcu_sched_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = rcu_barrier_sched,
+ .stats = NULL,
+ .irqcapable = 1,
+ .name = "sched"
+};
+
+static struct rcu_torture_ops sched_ops_sync = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .name = "sched_sync"
+};
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ int i;
+ long oldbatch = rcu_batches_completed();
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_RCU_RANDOM(rand);
+
+ VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
+ set_user_nice(current, 19);
+
+ do {
+ schedule_timeout_uninterruptible(1);
+ if ((rp = rcu_torture_alloc()) == NULL)
+ continue;
+ rp->rtort_pipe_count = 0;
+ udelay(rcu_random(&rand) & 0x3ff);
+ old_rp = rcu_torture_current;
+ rp->rtort_mbtest = 1;
+ rcu_assign_pointer(rcu_torture_current, rp);
+ smp_wmb();
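+ /*
+ * The replaced element now enters the grace-period pipeline:
+ * bump its stage count and hand it to the flavor-specific
+ * deferred-free routine.
+ */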
+ if (old_rp) {
+ i = old_rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ old_rp->rtort_pipe_count++;
+ cur_ops->deferredfree(old_rp);
+ }
+ rcu_torture_current_version++;
+ oldbatch = cur_ops->completed();
+ rcu_stutter_wait();
+ } while (!kthread_should_stop() && !fullstop);
+ VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(1);
+ return 0;
+}
+
+/*
+ * RCU torture fake writer kthread. Repeatedly calls sync, with a random
+ * delay between calls.
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+ DEFINE_RCU_RANDOM(rand);
+
+ VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, 19);
+
+ do {
+ schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
+ udelay(rcu_random(&rand) & 0x3ff);
+ cur_ops->sync();
+ rcu_stutter_wait();
+ } while (!kthread_should_stop() && !fullstop);
+
+ VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(1);
+ return 0;
+}
+
+/*
+ * RCU torture reader from timer handler. Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1; otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+ int idx;
+ int completed;
+ static DEFINE_RCU_RANDOM(rand);
+ static DEFINE_SPINLOCK(rand_lock);
+ struct rcu_torture *p;
+ int pipe_count;
+
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
+ p = rcu_dereference(rcu_torture_current);
+ if (p == NULL) {
+ /* Leave because rcu_torture_writer is not yet underway */
+ cur_ops->readunlock(idx);
+ return;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ spin_lock(&rand_lock);
+ cur_ops->readdelay(&rand);
+ n_rcu_torture_timers++;
+ spin_unlock(&rand_lock);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ ++__get_cpu_var(rcu_torture_count)[pipe_count];
+ completed = cur_ops->completed() - completed;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ ++__get_cpu_var(rcu_torture_batch)[completed];
+ preempt_enable();
+ cur_ops->readunlock(idx);
+}
+
+/*
+ * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1; otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+ int completed;
+ int idx;
+ DEFINE_RCU_RANDOM(rand);
+ struct rcu_torture *p;
+ int pipe_count;
+ struct timer_list t;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
+ set_user_nice(current, 19);
+ if (irqreader && cur_ops->irqcapable)
+ setup_timer_on_stack(&t, rcu_torture_timer, 0);
+
+ do {
+ if (irqreader && cur_ops->irqcapable) {
+ if (!timer_pending(&t))
+ mod_timer(&t, 1);
+ }
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
+ p = rcu_dereference(rcu_torture_current);
+ if (p == NULL) {
+ /* Wait for rcu_torture_writer to get underway */
+ cur_ops->readunlock(idx);
+ schedule_timeout_interruptible(HZ);
+ continue;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ cur_ops->readdelay(&rand);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ ++__get_cpu_var(rcu_torture_count)[pipe_count];
+ completed = cur_ops->completed() - completed;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ ++__get_cpu_var(rcu_torture_batch)[completed];
+ preempt_enable();
+ cur_ops->readunlock(idx);
+ schedule();
+ rcu_stutter_wait();
+ } while (!kthread_should_stop() && !fullstop);
+ VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+ if (irqreader && cur_ops->irqcapable)
+ del_timer_sync(&t);
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(1);
+ return 0;
+}
+
+/*
+ * Create an RCU-torture statistics message in the specified buffer.
+ */
+static int
+rcu_torture_printk(char *page)
+{
+ int cnt = 0;
+ int cpu;
+ int i;
+ long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+ batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ }
+ }
+ for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ if (pipesummary[i] != 0)
+ break;
+ }
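+ /*
+ * The loop above leaves i at the highest pipeline stage any reader
+ * observed. Readers should only ever see stages 0 or 1; counts beyond
+ * stage 1 indicate a too-short grace period and are flagged below.
+ */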
+ cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
+ cnt += sprintf(&page[cnt],
+ "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
+ "rtmbe: %d nt: %ld",
+ rcu_torture_current,
+ rcu_torture_current_version,
+ list_empty(&rcu_torture_freelist),
+ atomic_read(&n_rcu_torture_alloc),
+ atomic_read(&n_rcu_torture_alloc_fail),
+ atomic_read(&n_rcu_torture_free),
+ atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_timers);
+ if (atomic_read(&n_rcu_torture_mberror) != 0)
+ cnt += sprintf(&page[cnt], " !!!");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
+ if (i > 1) {
+ cnt += sprintf(&page[cnt], "!!! ");
+ atomic_inc(&n_rcu_torture_error);
+ WARN_ON_ONCE(1);
+ }
+ cnt += sprintf(&page[cnt], "Reader Pipe: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
+ cnt += sprintf(&page[cnt], "Reader Batch: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
+ cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ cnt += sprintf(&page[cnt], " %d",
+ atomic_read(&rcu_torture_wcount[i]));
+ }
+ cnt += sprintf(&page[cnt], "\n");
+ if (cur_ops->stats)
+ cnt += cur_ops->stats(&page[cnt]);
+ return cnt;
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+ int cnt;
+
+ cnt = rcu_torture_printk(printk_buf);
+ printk(KERN_ALERT "%s", printk_buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+ VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ rcu_torture_stats_print();
+ } while (!kthread_should_stop());
+ VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
+ return 0;
+}
+
+static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
+
+/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special
+ * case is @rcu_idle_cpu = -1, in which case the tasks may run on all CPUs.
+ */
+static void rcu_torture_shuffle_tasks(void)
+{
+ cpumask_t tmp_mask;
+ int i;
+
+ cpus_setall(tmp_mask);
+ get_online_cpus();
+
+ /* No point in shuffling if there is only one online CPU (ex: UP) */
+ if (num_online_cpus() == 1) {
+ put_online_cpus();
+ return;
+ }
+
+ if (rcu_idle_cpu != -1)
+ cpu_clear(rcu_idle_cpu, tmp_mask);
+
+ set_cpus_allowed_ptr(current, &tmp_mask);
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ if (reader_tasks[i])
+ set_cpus_allowed_ptr(reader_tasks[i],
+ &tmp_mask);
+ }
+
+ if (fakewriter_tasks) {
+ for (i = 0; i < nfakewriters; i++)
+ if (fakewriter_tasks[i])
+ set_cpus_allowed_ptr(fakewriter_tasks[i],
+ &tmp_mask);
+ }
+
+ if (writer_task)
+ set_cpus_allowed_ptr(writer_task, &tmp_mask);
+
+ if (stats_task)
+ set_cpus_allowed_ptr(stats_task, &tmp_mask);
+
+ if (rcu_idle_cpu == -1)
+ rcu_idle_cpu = num_online_cpus() - 1;
+ else
+ rcu_idle_cpu--;
+
+ put_online_cpus();
+}
+
+/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
+ * system to become idle in turn and cut off its timer ticks. This is meant
+ * to test RCU's support for tickless idle CPUs.
+ */
+static int
+rcu_torture_shuffle(void *arg)
+{
+ VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
+ do {
+ schedule_timeout_interruptible(shuffle_interval * HZ);
+ rcu_torture_shuffle_tasks();
+ } while (!kthread_should_stop());
+ VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
+ return 0;
+}
+
+/* Cause the rcutorture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int
+rcu_torture_stutter(void *arg)
+{
+ VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
+ do {
+ schedule_timeout_interruptible(stutter * HZ);
+ stutter_pause_test = 1;
+ if (!kthread_should_stop())
+ schedule_timeout_interruptible(stutter * HZ);
+ stutter_pause_test = 0;
+ } while (!kthread_should_stop());
+ VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
+ return 0;
+}
+
+static inline void
+rcu_torture_print_module_parms(char *tag)
+{
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "--- %s: nreaders=%d nfakewriters=%d "
+ "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+ "shuffle_interval=%d stutter=%d irqreader=%d\n",
+ torture_type, tag, nrealreaders, nfakewriters,
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+ stutter, irqreader);
+}
+
+static void
+rcu_torture_cleanup(void)
+{
+ int i;
+
+ fullstop = 1;
+ if (stutter_task) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
+ kthread_stop(stutter_task);
+ }
+ stutter_task = NULL;
+ if (shuffler_task) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
+ kthread_stop(shuffler_task);
+ }
+ shuffler_task = NULL;
+
+ if (writer_task) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+ kthread_stop(writer_task);
+ }
+ writer_task = NULL;
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++) {
+ if (reader_tasks[i]) {
+ VERBOSE_PRINTK_STRING(
+ "Stopping rcu_torture_reader task");
+ kthread_stop(reader_tasks[i]);
+ }
+ reader_tasks[i] = NULL;
+ }
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+ }
+ rcu_torture_current = NULL;
+
+ if (fakewriter_tasks) {
+ for (i = 0; i < nfakewriters; i++) {
+ if (fakewriter_tasks[i]) {
+ VERBOSE_PRINTK_STRING(
+ "Stopping rcu_torture_fakewriter task");
+ kthread_stop(fakewriter_tasks[i]);
+ }
+ fakewriter_tasks[i] = NULL;
+ }
+ kfree(fakewriter_tasks);
+ fakewriter_tasks = NULL;
+ }
+
+ if (stats_task) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+ kthread_stop(stats_task);
+ }
+ stats_task = NULL;
+
+ /* Wait for all RCU callbacks to fire. */
+
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+
+ rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (cur_ops->cleanup)
+ cur_ops->cleanup();
+ if (atomic_read(&n_rcu_torture_error))
+ rcu_torture_print_module_parms("End of test: FAILURE");
+ else
+ rcu_torture_print_module_parms("End of test: SUCCESS");
+}
+
+static int __init
+rcu_torture_init(void)
+{
+ int i;
+ int cpu;
+ int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] =
+ { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+ &srcu_ops, &sched_ops, &sched_ops_sync, };
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+ torture_type);
+ return (-EINVAL);
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nreaders >= 0)
+ nrealreaders = nreaders;
+ else
+ nrealreaders = 2 * num_online_cpus();
+ rcu_torture_print_module_parms("Start of test");
+ fullstop = 0;
+
+ /* Set up the freelist. */
+
+ INIT_LIST_HEAD(&rcu_torture_freelist);
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
+ rcu_tortures[i].rtort_mbtest = 0;
+ list_add_tail(&rcu_tortures[i].rtort_free,
+ &rcu_torture_freelist);
+ }
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ rcu_torture_current = NULL;
+ rcu_torture_current_version = 0;
+ atomic_set(&n_rcu_torture_alloc, 0);
+ atomic_set(&n_rcu_torture_alloc_fail, 0);
+ atomic_set(&n_rcu_torture_free, 0);
+ atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_error, 0);
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ atomic_set(&rcu_torture_wcount[i], 0);
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ per_cpu(rcu_torture_count, cpu)[i] = 0;
+ per_cpu(rcu_torture_batch, cpu)[i] = 0;
+ }
+ }
+
+ /* Start up the kthreads. */
+
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
+ writer_task = kthread_run(rcu_torture_writer, NULL,
+ "rcu_torture_writer");
+ if (IS_ERR(writer_task)) {
+ firsterr = PTR_ERR(writer_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
+ writer_task = NULL;
+ goto unwind;
+ }
+ fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_PRINTK_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nfakewriters; i++) {
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
+ fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
+ "rcu_torture_fakewriter");
+ if (IS_ERR(fakewriter_tasks[i])) {
+ firsterr = PTR_ERR(fakewriter_tasks[i]);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
+ fakewriter_tasks[i] = NULL;
+ goto unwind;
+ }
+ }
+ reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_PRINTK_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealreaders; i++) {
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
+ reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
+ "rcu_torture_reader");
+ if (IS_ERR(reader_tasks[i])) {
+ firsterr = PTR_ERR(reader_tasks[i]);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
+ reader_tasks[i] = NULL;
+ goto unwind;
+ }
+ }
+ if (stat_interval > 0) {
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
+ stats_task = kthread_run(rcu_torture_stats, NULL,
+ "rcu_torture_stats");
+ if (IS_ERR(stats_task)) {
+ firsterr = PTR_ERR(stats_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
+ stats_task = NULL;
+ goto unwind;
+ }
+ }
+ if (test_no_idle_hz) {
+ rcu_idle_cpu = num_online_cpus() - 1;
+ /* Create the shuffler thread */
+ shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
+ "rcu_torture_shuffle");
+ if (IS_ERR(shuffler_task)) {
+ firsterr = PTR_ERR(shuffler_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
+ shuffler_task = NULL;
+ goto unwind;
+ }
+ }
+ if (stutter < 0)
+ stutter = 0;
+ if (stutter) {
+ /* Create the stutter thread */
+ stutter_task = kthread_run(rcu_torture_stutter, NULL,
+ "rcu_torture_stutter");
+ if (IS_ERR(stutter_task)) {
+ firsterr = PTR_ERR(stutter_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
+ stutter_task = NULL;
+ goto unwind;
+ }
+ }
+ return 0;
+
+unwind:
+ rcu_torture_cleanup();
+ return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 0000000..9d79b78
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1360 @@
+/*
+ * Public API and common code for kernel->userspace relay file support.
+ *
+ * See Documentation/filesystems/relay.txt for an overview.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Moved to kernel/relay.c by Paul Mundt, 2006.
+ * November 2006 - CPU hotplug support by Mathieu Desnoyers
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relay.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/splice.h>
+
+/* list of open channels, for cpu hotplug */
+static DEFINE_MUTEX(relay_channels_mutex);
+static LIST_HEAD(relay_channels);
+
+/*
+ * close() vm_op implementation for relay file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+ struct rchan_buf *buf = vma->vm_private_data;
+ buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * fault() vm_op implementation for relay file mapping.
+ */
+static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page;
+ struct rchan_buf *buf = vma->vm_private_data;
+ pgoff_t pgoff = vmf->pgoff;
+
+ if (!buf)
+ return VM_FAULT_OOM;
+
+ page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+ .fault = relay_buf_fault,
+ .close = relay_file_mmap_close,
+};
+
+/*
+ * allocate an array of pointers to struct page
+ */
+static struct page **relay_alloc_page_array(unsigned int n_pages)
+{
+ struct page **array;
+ size_t pa_size = n_pages * sizeof(struct page *);
+
+ if (pa_size > PAGE_SIZE) {
+ array = vmalloc(pa_size);
+ if (array)
+ memset(array, 0, pa_size);
+ } else {
+ array = kzalloc(pa_size, GFP_KERNEL);
+ }
+ return array;
+}
+
+/*
+ * free an array of pointers to struct page
+ */
+static void relay_free_page_array(struct page **array)
+{
+ if (is_vmalloc_addr(array))
+ vfree(array);
+ else
+ kfree(array);
+}
+
+/**
+ * relay_mmap_buf: - mmap channel buffer to process address space
+ * @buf: relay channel buffer
+ * @vma: vm_area_struct describing memory to be mapped
+ *
+ * Returns 0 if ok, negative on error
+ *
+ * Caller should already have grabbed mmap_sem.
+ */
+static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+ unsigned long length = vma->vm_end - vma->vm_start;
+ struct file *filp = vma->vm_file;
+
+ if (!buf)
+ return -EBADF;
+
+ if (length != (unsigned long)buf->chan->alloc_size)
+ return -EINVAL;
+
+ vma->vm_ops = &relay_file_mmap_ops;
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_private_data = buf;
+ buf->chan->cb->buf_mapped(buf, filp);
+
+ return 0;
+}
+
+/**
+ * relay_alloc_buf - allocate a channel buffer
+ * @buf: the buffer struct
+ * @size: total size of the buffer
+ *
+ * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
+ * passed-in size will be page-aligned, if it isn't already.
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
+{
+ void *mem;
+ unsigned int i, j, n_pages;
+
+ *size = PAGE_ALIGN(*size);
+ n_pages = *size >> PAGE_SHIFT;
+
+ buf->page_array = relay_alloc_page_array(n_pages);
+ if (!buf->page_array)
+ return NULL;
+
+ for (i = 0; i < n_pages; i++) {
+ buf->page_array[i] = alloc_page(GFP_KERNEL);
+ if (unlikely(!buf->page_array[i]))
+ goto depopulate;
+ set_page_private(buf->page_array[i], (unsigned long)buf);
+ }
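+ /* Map the individually allocated pages into one contiguous kernel
+ * virtual range. */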
+ mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+ if (!mem)
+ goto depopulate;
+
+ memset(mem, 0, *size);
+ buf->page_count = n_pages;
+ return mem;
+
+depopulate:
+ for (j = 0; j < i; j++)
+ __free_page(buf->page_array[j]);
+ relay_free_page_array(buf->page_array);
+ return NULL;
+}
+
+/**
+ * relay_create_buf - allocate and initialize a channel buffer
+ * @chan: the relay channel
+ *
+ * Returns channel buffer if successful, %NULL otherwise.
+ */
+static struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+ struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+ if (!buf->padding)
+ goto free_buf;
+
+ buf->start = relay_alloc_buf(buf, &chan->alloc_size);
+ if (!buf->start)
+ goto free_buf;
+
+ buf->chan = chan;
+ kref_get(&buf->chan->kref);
+ return buf;
+
+free_buf:
+ kfree(buf->padding);
+ kfree(buf);
+ return NULL;
+}
+
+/**
+ * relay_destroy_channel - free the channel struct
+ * @kref: target kernel reference that contains the relay channel
+ *
+ * Should only be called from kref_put().
+ */
+static void relay_destroy_channel(struct kref *kref)
+{
+ struct rchan *chan = container_of(kref, struct rchan, kref);
+ kfree(chan);
+}
+
+/**
+ * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ * @buf: the buffer struct
+ */
+static void relay_destroy_buf(struct rchan_buf *buf)
+{
+ struct rchan *chan = buf->chan;
+ unsigned int i;
+
+ if (likely(buf->start)) {
+ vunmap(buf->start);
+ for (i = 0; i < buf->page_count; i++)
+ __free_page(buf->page_array[i]);
+ relay_free_page_array(buf->page_array);
+ }
+ chan->buf[buf->cpu] = NULL;
+ kfree(buf->padding);
+ kfree(buf);
+ kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ * relay_remove_buf - remove a channel buffer
+ * @kref: target kernel reference that contains the relay buffer
+ *
+ * Removes the file from the filesystem, which also frees the
+ * rchan_buf struct and the channel buffer. Should only be called from
+ * kref_put().
+ */
+static void relay_remove_buf(struct kref *kref)
+{
+ struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+ buf->chan->cb->remove_buf_file(buf->dentry);
+ relay_destroy_buf(buf);
+}
+
+/**
+ * relay_buf_empty - boolean, is the channel buffer empty?
+ * @buf: channel buffer
+ *
+ * Returns 1 if the buffer is empty, 0 otherwise.
+ */
+static int relay_buf_empty(struct rchan_buf *buf)
+{
+ return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+
+/**
+ * relay_buf_full - boolean, is the channel buffer full?
+ * @buf: channel buffer
+ *
+ * Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+ size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+ return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior. Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback. Allows the sub-buffer switch unless the
+ * buffer is full, i.e. the default is no-overwrite behaviour.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+ void *subbuf,
+ void *prev_subbuf,
+ size_t prev_padding)
+{
+ if (relay_buf_full(buf))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * buf_mapped() default callback. Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+ struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback. Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+ struct file *filp)
+{
+}
+
+/*
+ * create_buf_file() default callback. Does nothing.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+ struct dentry *parent,
+ int mode,
+ struct rchan_buf *buf,
+ int *is_global)
+{
+ return NULL;
+}
+
+/*
+ * remove_buf_file() default callback. Does nothing.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+ return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+ .subbuf_start = subbuf_start_default_callback,
+ .buf_mapped = buf_mapped_default_callback,
+ .buf_unmapped = buf_unmapped_default_callback,
+ .create_buf_file = create_buf_file_default_callback,
+ .remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ * wakeup_readers - wake up readers waiting on a channel
+ * @data: contains the channel buffer
+ *
+ * This is the timer function used to defer reader waking.
+ */
+static void wakeup_readers(unsigned long data)
+{
+ struct rchan_buf *buf = (struct rchan_buf *)data;
+ wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ * __relay_reset - reset a channel buffer
+ * @buf: the channel buffer
+ * @init: 1 if this is a first-time initialization
+ *
+ * See relay_reset() for description of effect.
+ */
+static void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+ size_t i;
+
+ if (init) {
+ init_waitqueue_head(&buf->read_wait);
+ kref_init(&buf->kref);
+ setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ } else
+ del_timer_sync(&buf->timer);
+
+ buf->subbufs_produced = 0;
+ buf->subbufs_consumed = 0;
+ buf->bytes_consumed = 0;
+ buf->finalized = 0;
+ buf->data = buf->start;
+ buf->offset = 0;
+
+ for (i = 0; i < buf->chan->n_subbufs; i++)
+ buf->padding[i] = 0;
+
+ buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ * relay_reset - reset the channel
+ * @chan: the channel
+ *
+ * This has the effect of erasing all data from all channel buffers
+ * and restarting the channel in its initial state. The buffers
+ * are not freed, so any mappings are still in effect.
+ *
+ * NOTE. Care should be taken that the channel isn't actually
+ * being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+ unsigned int i;
+
+ if (!chan)
+ return;
+
+ if (chan->is_global && chan->buf[0]) {
+ __relay_reset(chan->buf[0], 0);
+ return;
+ }
+
+ mutex_lock(&relay_channels_mutex);
+ for_each_possible_cpu(i)
+ if (chan->buf[i])
+ __relay_reset(chan->buf[i], 0);
+ mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_reset);
+
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+ struct dentry *dentry)
+{
+ buf->dentry = dentry;
+ buf->dentry->d_inode->i_size = buf->early_bytes;
+}
+
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+ struct rchan_buf *buf,
+ unsigned int cpu)
+{
+ struct dentry *dentry;
+ char *tmpname;
+
+ tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmpname)
+ return NULL;
+ snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
+
+ /* Create file in fs */
+ dentry = chan->cb->create_buf_file(tmpname, chan->parent,
+ S_IRUSR, buf,
+ &chan->is_global);
+
+ kfree(tmpname);
+
+ return dentry;
+}
+
+/*
+ * relay_open_buf - create a new relay channel buffer
+ *
+ * used by relay_open() and CPU hotplug.
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
+{
+ struct rchan_buf *buf = NULL;
+ struct dentry *dentry;
+
+ if (chan->is_global)
+ return chan->buf[0];
+
+ buf = relay_create_buf(chan);
+ if (!buf)
+ return NULL;
+
+ if (chan->has_base_filename) {
+ dentry = relay_create_buf_file(chan, buf, cpu);
+ if (!dentry)
+ goto free_buf;
+ relay_set_buf_dentry(buf, dentry);
+ }
+
+ buf->cpu = cpu;
+ __relay_reset(buf, 1);
+
+ if(chan->is_global) {
+ chan->buf[0] = buf;
+ buf->cpu = 0;
+ }
+
+ return buf;
+
+free_buf:
+ relay_destroy_buf(buf);
+ return NULL;
+}
+
+/**
+ * relay_close_buf - close a channel buffer
+ * @buf: channel buffer
+ *
+ * Marks the buffer finalized and stops its pending reader-wakeup timer.
+ * The channel buffer and channel buffer data structure are then freed
+ * automatically when the last reference is given up.
+ */
+static void relay_close_buf(struct rchan_buf *buf)
+{
+ buf->finalized = 1;
+ del_timer_sync(&buf->timer);
+ kref_put(&buf->kref, relay_remove_buf);
+}
+
+static void setup_callbacks(struct rchan *chan,
+ struct rchan_callbacks *cb)
+{
+ if (!cb) {
+ chan->cb = &default_channel_callbacks;
+ return;
+ }
+
+ if (!cb->subbuf_start)
+ cb->subbuf_start = subbuf_start_default_callback;
+ if (!cb->buf_mapped)
+ cb->buf_mapped = buf_mapped_default_callback;
+ if (!cb->buf_unmapped)
+ cb->buf_unmapped = buf_unmapped_default_callback;
+ if (!cb->create_buf_file)
+ cb->create_buf_file = create_buf_file_default_callback;
+ if (!cb->remove_buf_file)
+ cb->remove_buf_file = remove_buf_file_default_callback;
+ chan->cb = cb;
+}
+
+/**
+ * relay_hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
+ */
+static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ struct rchan *chan;
+
+ switch(action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ mutex_lock(&relay_channels_mutex);
+ list_for_each_entry(chan, &relay_channels, list) {
+ if (chan->buf[hotcpu])
+ continue;
+ chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
+ if(!chan->buf[hotcpu]) {
+ printk(KERN_ERR
+ "relay_hotcpu_callback: cpu %d buffer "
+ "creation failed\n", hotcpu);
+ mutex_unlock(&relay_channels_mutex);
+ return NOTIFY_BAD;
+ }
+ }
+ mutex_unlock(&relay_channels_mutex);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ /* No need to flush the cpu: it will be flushed upon
+ * the final relay_flush() call. */
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/**
+ * relay_open - create a new relay channel
+ * @base_filename: base name of files to create, %NULL for buffering only
+ * @parent: dentry of parent directory, %NULL for root directory or buffer
+ * @subbuf_size: size of sub-buffers
+ * @n_subbufs: number of sub-buffers
+ * @cb: client callback functions
+ * @private_data: user-defined data
+ *
+ * Returns channel pointer if successful, %NULL otherwise.
+ *
+ * Creates a channel buffer for each cpu using the sizes and
+ * attributes specified. The created channel buffer files
+ * will be named base_filename0...base_filenameN-1. File
+ * permissions will be %S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+ struct dentry *parent,
+ size_t subbuf_size,
+ size_t n_subbufs,
+ struct rchan_callbacks *cb,
+ void *private_data)
+{
+ unsigned int i;
+ struct rchan *chan;
+
+ if (!(subbuf_size && n_subbufs))
+ return NULL;
+
+ chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
+ if (!chan)
+ return NULL;
+
+ chan->version = RELAYFS_CHANNEL_VERSION;
+ chan->n_subbufs = n_subbufs;
+ chan->subbuf_size = subbuf_size;
+ chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+ chan->parent = parent;
+ chan->private_data = private_data;
+ if (base_filename) {
+ chan->has_base_filename = 1;
+ strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ }
+ setup_callbacks(chan, cb);
+ kref_init(&chan->kref);
+
+ mutex_lock(&relay_channels_mutex);
+ for_each_online_cpu(i) {
+ chan->buf[i] = relay_open_buf(chan, i);
+ if (!chan->buf[i])
+ goto free_bufs;
+ }
+ list_add(&chan->list, &relay_channels);
+ mutex_unlock(&relay_channels_mutex);
+
+ return chan;
+
+free_bufs:
+ for_each_possible_cpu(i) {
+ if (chan->buf[i])
+ relay_close_buf(chan->buf[i]);
+ }
+
+ kref_put(&chan->kref, relay_destroy_channel);
+ mutex_unlock(&relay_channels_mutex);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(relay_open);
+
+struct rchan_percpu_buf_dispatcher {
+ struct rchan_buf *buf;
+ struct dentry *dentry;
+};
+
+/* Called in atomic context. */
+static void __relay_set_buf_dentry(void *info)
+{
+ struct rchan_percpu_buf_dispatcher *p = info;
+
+ relay_set_buf_dentry(p->buf, p->dentry);
+}
+
+/**
+ * relay_late_setup_files - triggers file creation
+ * @chan: channel to operate on
+ * @base_filename: base name of files to create
+ * @parent: dentry of parent directory, %NULL for root directory
+ *
+ * Returns 0 if successful, non-zero otherwise.
+ *
+ * Use to set up files for a previously buffer-only channel.
+ * Useful for doing early tracing in the kernel, before the VFS is up,
+ * for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+ const char *base_filename,
+ struct dentry *parent)
+{
+ int err = 0;
+ unsigned int i, curr_cpu;
+ unsigned long flags;
+ struct dentry *dentry;
+ struct rchan_percpu_buf_dispatcher disp;
+
+ if (!chan || !base_filename)
+ return -EINVAL;
+
+ strlcpy(chan->base_filename, base_filename, NAME_MAX);
+
+ mutex_lock(&relay_channels_mutex);
+ /* Is chan already set up? */
+ if (unlikely(chan->has_base_filename)) {
+ mutex_unlock(&relay_channels_mutex);
+ return -EEXIST;
+ }
+ chan->has_base_filename = 1;
+ chan->parent = parent;
+ curr_cpu = get_cpu();
+ /*
+ * The CPU hotplug notifier ran before us and created buffers with
+ * no files associated. So it's safe to call relay_create_buf_file()
+ * on all currently online CPUs.
+ */
+ for_each_online_cpu(i) {
+ if (unlikely(!chan->buf[i])) {
+ printk(KERN_ERR "relay_late_setup_files: CPU %u "
+ "has no buffer, it must have!\n", i);
+ BUG();
+ err = -EINVAL;
+ break;
+ }
+
+ dentry = relay_create_buf_file(chan, chan->buf[i], i);
+ if (unlikely(!dentry)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (curr_cpu == i) {
+ local_irq_save(flags);
+ relay_set_buf_dentry(chan->buf[i], dentry);
+ local_irq_restore(flags);
+ } else {
+ disp.buf = chan->buf[i];
+ disp.dentry = dentry;
+ smp_mb();
+ /* relay_channels_mutex must be held, so wait. */
+ err = smp_call_function_single(i,
+ __relay_set_buf_dentry,
+ &disp, 1);
+ }
+ if (unlikely(err))
+ break;
+ }
+ put_cpu();
+ mutex_unlock(&relay_channels_mutex);
+
+ return err;
+}
+
+/**
+ * relay_switch_subbuf - switch to a new sub-buffer
+ * @buf: channel buffer
+ * @length: size of current event
+ *
+ * Returns either the length passed in or 0 if full.
+ *
+ * Performs sub-buffer-switch tasks such as invoking callbacks,
+ * updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+ void *old, *new;
+ size_t old_subbuf, new_subbuf;
+
+ if (unlikely(length > buf->chan->subbuf_size))
+ goto toobig;
+
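+ /*
+ * An offset of subbuf_size + 1 is the sentinel set below when the
+ * client's subbuf_start() callback refused a switch; in that case
+ * there is no completed sub-buffer to finalize here.
+ */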
+ if (buf->offset != buf->chan->subbuf_size + 1) {
+ buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+ old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+ buf->padding[old_subbuf] = buf->prev_padding;
+ buf->subbufs_produced++;
+ if (buf->dentry)
+ buf->dentry->d_inode->i_size +=
+ buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
+ else
+ buf->early_bytes += buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
+ smp_mb();
+ if (waitqueue_active(&buf->read_wait))
+ /*
+ * Calling wake_up_interruptible() from here
+ * will deadlock if we happen to be logging
+ * from the scheduler (trying to re-grab
+ * rq->lock), so defer it.
+ */
+ __mod_timer(&buf->timer, jiffies + 1);
+ }
+
+ old = buf->data;
+ new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+ new = buf->start + new_subbuf * buf->chan->subbuf_size;
+ buf->offset = 0;
+ if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+ buf->offset = buf->chan->subbuf_size + 1;
+ return 0;
+ }
+ buf->data = new;
+ buf->padding[new_subbuf] = 0;
+
+ if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+ goto toobig;
+
+ return length;
+
+toobig:
+ buf->chan->last_toobig = length;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+
+/**
+ * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ * @chan: the channel
+ * @cpu: the cpu associated with the channel buffer to update
+ * @subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ * Adds to the channel buffer's consumed sub-buffer count.
+ * subbufs_consumed should be the number of sub-buffers newly consumed,
+ * not the total consumed.
+ *
+ * NOTE. Kernel clients don't need to call this function if the channel
+ * mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+ unsigned int cpu,
+ size_t subbufs_consumed)
+{
+ struct rchan_buf *buf;
+
+ if (!chan)
+ return;
+
+ if (cpu >= NR_CPUS || !chan->buf[cpu])
+ return;
+
+ buf = chan->buf[cpu];
+ buf->subbufs_consumed += subbufs_consumed;
+ if (buf->subbufs_consumed > buf->subbufs_produced)
+ buf->subbufs_consumed = buf->subbufs_produced;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ * relay_close - close the channel
+ * @chan: the channel
+ *
+ * Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+ unsigned int i;
+
+ if (!chan)
+ return;
+
+ mutex_lock(&relay_channels_mutex);
+ if (chan->is_global && chan->buf[0])
+ relay_close_buf(chan->buf[0]);
+ else
+ for_each_possible_cpu(i)
+ if (chan->buf[i])
+ relay_close_buf(chan->buf[i]);
+
+ if (chan->last_toobig)
+ printk(KERN_WARNING "relay: one or more items not logged "
+ "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+ chan->last_toobig, chan->subbuf_size);
+
+ list_del(&chan->list);
+ kref_put(&chan->kref, relay_destroy_channel);
+ mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ * relay_flush - flush the channel
+ * @chan: the channel
+ *
+ * Flushes all channel buffers, i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+ unsigned int i;
+
+ if (!chan)
+ return;
+
+ if (chan->is_global && chan->buf[0]) {
+ relay_switch_subbuf(chan->buf[0], 0);
+ return;
+ }
+
+ mutex_lock(&relay_channels_mutex);
+ for_each_possible_cpu(i)
+ if (chan->buf[i])
+ relay_switch_subbuf(chan->buf[i], 0);
+ mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ * relay_file_open - open file op for relay files
+ * @inode: the inode
+ * @filp: the file
+ *
+ * Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+ struct rchan_buf *buf = inode->i_private;
+ kref_get(&buf->kref);
+ filp->private_data = buf;
+
+ return nonseekable_open(inode, filp);
+}
+
+/**
+ * relay_file_mmap - mmap file op for relay files
+ * @filp: the file
+ * @vma: the vma describing what to map
+ *
+ * Calls upon relay_mmap_buf() to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct rchan_buf *buf = filp->private_data;
+ return relay_mmap_buf(buf, vma);
+}
+
+/**
+ * relay_file_poll - poll file op for relay files
+ * @filp: the file
+ * @wait: poll table
+ *
+ * Poll implementation.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+ unsigned int mask = 0;
+ struct rchan_buf *buf = filp->private_data;
+
+ if (buf->finalized)
+ return POLLERR;
+
+ if (filp->f_mode & FMODE_READ) {
+ poll_wait(filp, &buf->read_wait, wait);
+ if (!relay_buf_empty(buf))
+ mask |= POLLIN | POLLRDNORM;
+ }
+
+ return mask;
+}
+
+/**
+ * relay_file_release - release file op for relay files
+ * @inode: the inode
+ * @filp: the file
+ *
+ * Decrements the channel refcount, as the filesystem is
+ * no longer using it.
+ */
+static int relay_file_release(struct inode *inode, struct file *filp)
+{
+ struct rchan_buf *buf = filp->private_data;
+ kref_put(&buf->kref, relay_remove_buf);
+
+ return 0;
+}
+
+/*
+ * relay_file_read_consume - update the consumed count for the buffer
+ */
+static void relay_file_read_consume(struct rchan_buf *buf,
+ size_t read_pos,
+ size_t bytes_consumed)
+{
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t read_subbuf;
+
+ if (buf->subbufs_produced == buf->subbufs_consumed &&
+ buf->offset == buf->bytes_consumed)
+ return;
+
+ if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+ relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+ buf->bytes_consumed = 0;
+ }
+
+ buf->bytes_consumed += bytes_consumed;
+ if (!read_pos)
+ read_subbuf = buf->subbufs_consumed % n_subbufs;
+ else
+ read_subbuf = read_pos / buf->chan->subbuf_size;
+ if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+ if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+ (buf->offset == subbuf_size))
+ return;
+ relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+ buf->bytes_consumed = 0;
+ }
+}
+
+/*
+ * relay_file_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t produced = buf->subbufs_produced;
+ size_t consumed = buf->subbufs_consumed;
+
+ relay_file_read_consume(buf, read_pos, 0);
+
+ consumed = buf->subbufs_consumed;
+
+ if (unlikely(buf->offset > subbuf_size)) {
+ if (produced == consumed)
+ return 0;
+ return 1;
+ }
+
+ if (unlikely(produced - consumed >= n_subbufs)) {
+ consumed = produced - n_subbufs + 1;
+ buf->subbufs_consumed = consumed;
+ buf->bytes_consumed = 0;
+ }
+
+ produced = (produced % n_subbufs) * subbuf_size + buf->offset;
+ consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
+
+ if (consumed > produced)
+ produced += n_subbufs * subbuf_size;
+
+ if (consumed == produced) {
+ if (buf->offset == subbuf_size &&
+ buf->subbufs_produced > buf->subbufs_consumed)
+ return 1;
+ return 0;
+ }
+
+ return 1;
+}
+
+/**
+ * relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ */
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+ struct rchan_buf *buf)
+{
+ size_t padding, avail = 0;
+ size_t read_subbuf, read_offset, write_subbuf, write_offset;
+ size_t subbuf_size = buf->chan->subbuf_size;
+
+ write_subbuf = (buf->data - buf->start) / subbuf_size;
+ write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+ read_subbuf = read_pos / subbuf_size;
+ read_offset = read_pos % subbuf_size;
+ padding = buf->padding[read_subbuf];
+
+ if (read_subbuf == write_subbuf) {
+ if (read_offset + padding < write_offset)
+ avail = write_offset - (read_offset + padding);
+ } else
+ avail = (subbuf_size - padding) - read_offset;
+
+ return avail;
+}
+
+/**
+ * relay_file_read_start_pos - find the first available byte to read
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ *
+ * If the @read_pos is in the middle of padding, return the
+ * position of the first actually available byte, otherwise
+ * return the original value.
+ */
+static size_t relay_file_read_start_pos(size_t read_pos,
+ struct rchan_buf *buf)
+{
+ size_t read_subbuf, padding, padding_start, padding_end;
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+ if (!read_pos)
+ read_pos = consumed * subbuf_size + buf->bytes_consumed;
+ read_subbuf = read_pos / subbuf_size;
+ padding = buf->padding[read_subbuf];
+ padding_start = (read_subbuf + 1) * subbuf_size - padding;
+ padding_end = (read_subbuf + 1) * subbuf_size;
+ if (read_pos >= padding_start && read_pos < padding_end) {
+ read_subbuf = (read_subbuf + 1) % n_subbufs;
+ read_pos = read_subbuf * subbuf_size;
+ }
+
+ return read_pos;
+}
+
+/**
+ * relay_file_read_end_pos - return the new read position
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ * @count: number of bytes to be read
+ */
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+ size_t read_pos,
+ size_t count)
+{
+ size_t read_subbuf, padding, end_pos;
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+
+ read_subbuf = read_pos / subbuf_size;
+ padding = buf->padding[read_subbuf];
+ if (read_pos % subbuf_size + count + padding == subbuf_size)
+ end_pos = (read_subbuf + 1) * subbuf_size;
+ else
+ end_pos = read_pos + count;
+ if (end_pos >= subbuf_size * n_subbufs)
+ end_pos = 0;
+
+ return end_pos;
+}
+
+/*
+ * subbuf_read_actor - read up to one subbuf's worth of data
+ */
+static int subbuf_read_actor(size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc,
+ read_actor_t actor)
+{
+ void *from;
+ int ret = 0;
+
+ from = buf->start + read_start;
+ ret = avail;
+ if (copy_to_user(desc->arg.buf, from, avail)) {
+ desc->error = -EFAULT;
+ ret = 0;
+ }
+ desc->arg.data += ret;
+ desc->written += ret;
+ desc->count -= ret;
+
+ return ret;
+}
+
+typedef int (*subbuf_actor_t) (size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc,
+ read_actor_t actor);
+
+/*
+ * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
+ */
+static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
+ subbuf_actor_t subbuf_actor,
+ read_actor_t actor,
+ read_descriptor_t *desc)
+{
+ struct rchan_buf *buf = filp->private_data;
+ size_t read_start, avail;
+ int ret;
+
+ if (!desc->count)
+ return 0;
+
+ mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ do {
+ if (!relay_file_read_avail(buf, *ppos))
+ break;
+
+ read_start = relay_file_read_start_pos(*ppos, buf);
+ avail = relay_file_read_subbuf_avail(read_start, buf);
+ if (!avail)
+ break;
+
+ avail = min(desc->count, avail);
+ ret = subbuf_actor(read_start, buf, avail, desc, actor);
+ if (desc->error < 0)
+ break;
+
+ if (ret) {
+ relay_file_read_consume(buf, read_start, ret);
+ *ppos = relay_file_read_end_pos(buf, read_start, ret);
+ }
+ } while (desc->count && ret);
+ mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+
+ return desc->written;
+}
+
+static ssize_t relay_file_read(struct file *filp,
+ char __user *buffer,
+ size_t count,
+ loff_t *ppos)
+{
+ read_descriptor_t desc;
+ desc.written = 0;
+ desc.count = count;
+ desc.arg.buf = buffer;
+ desc.error = 0;
+ return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
+ NULL, &desc);
+}
+
+static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
+{
+ rbuf->bytes_consumed += bytes_consumed;
+
+ if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
+ relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
+ rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
+ }
+}
+
+static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct rchan_buf *rbuf;
+
+ rbuf = (struct rchan_buf *)page_private(buf->page);
+ relay_consume_bytes(rbuf, buf->private);
+}
+
+static struct pipe_buf_operations relay_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .confirm = generic_pipe_buf_confirm,
+ .release = relay_pipe_buf_release,
+ .steal = generic_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
+/*
+ * subbuf_splice_actor - splice up to one subbuf's worth of data
+ */
+static int subbuf_splice_actor(struct file *in,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags,
+ int *nonpad_ret)
+{
+ unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
+ struct rchan_buf *rbuf = in->private_data;
+ unsigned int subbuf_size = rbuf->chan->subbuf_size;
+ uint64_t pos = (uint64_t) *ppos;
+ uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
+ size_t read_start = (size_t) do_div(pos, alloc_size);
+ size_t read_subbuf = read_start / subbuf_size;
+ size_t padding = rbuf->padding[read_subbuf];
+ size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
+ struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial[PIPE_BUFFERS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .nr_pages = 0,
+ .partial = partial,
+ .flags = flags,
+ .ops = &relay_pipe_buf_ops,
+ .spd_release = relay_page_release,
+ };
+
+ if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
+ return 0;
+
+ /*
+ * Adjust read len, if longer than what is available
+ */
+ if (len > (subbuf_size - read_start % subbuf_size))
+ len = subbuf_size - read_start % subbuf_size;
+
+ subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
+ pidx = (read_start / PAGE_SIZE) % subbuf_pages;
+ poff = read_start & ~PAGE_MASK;
+ nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
+
+ for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
+ unsigned int this_len, this_end, private;
+ unsigned int cur_pos = read_start + total_len;
+
+ if (!len)
+ break;
+
+ this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
+ private = this_len;
+
+ spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
+ spd.partial[spd.nr_pages].offset = poff;
+
+ this_end = cur_pos + this_len;
+ if (this_end >= nonpad_end) {
+ this_len = nonpad_end - cur_pos;
+ private = this_len + padding;
+ }
+ spd.partial[spd.nr_pages].len = this_len;
+ spd.partial[spd.nr_pages].private = private;
+
+ len -= this_len;
+ total_len += this_len;
+ poff = 0;
+ pidx = (pidx + 1) % subbuf_pages;
+
+ if (this_end >= nonpad_end) {
+ spd.nr_pages++;
+ break;
+ }
+ }
+
+ if (!spd.nr_pages)
+ return 0;
+
+ ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
+ if (ret < 0 || ret < total_len)
+ return ret;
+
+ if (read_start + ret == nonpad_end)
+ ret += padding;
+
+ return ret;
+}
+
+static ssize_t relay_file_splice_read(struct file *in,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags)
+{
+ ssize_t spliced;
+ int ret;
+ int nonpad_ret = 0;
+
+ ret = 0;
+ spliced = 0;
+
+ while (len && !spliced) {
+ ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
+ if (ret < 0)
+ break;
+ else if (!ret) {
+ if (flags & SPLICE_F_NONBLOCK)
+ ret = -EAGAIN;
+ break;
+ }
+
+ *ppos += ret;
+ if (ret > len)
+ len = 0;
+ else
+ len -= ret;
+ spliced += nonpad_ret;
+ nonpad_ret = 0;
+ }
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
+const struct file_operations relay_file_operations = {
+ .open = relay_file_open,
+ .poll = relay_file_poll,
+ .mmap = relay_file_mmap,
+ .read = relay_file_read,
+ .llseek = no_llseek,
+ .release = relay_file_release,
+ .splice_read = relay_file_splice_read,
+};
+EXPORT_SYMBOL_GPL(relay_file_operations);
+
+static __init int relay_init(void)
+{
+
+ hotcpu_notifier(relay_hotcpu_callback, 0);
+ return 0;
+}
+
+early_initcall(relay_init);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
new file mode 100644
index 0000000..f275c8e
--- /dev/null
+++ b/kernel/res_counter.c
@@ -0,0 +1,139 @@
+/*
+ * resource cgroups
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/parser.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/res_counter.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+
+void res_counter_init(struct res_counter *counter)
+{
+ spin_lock_init(&counter->lock);
+ counter->limit = (unsigned long long)LLONG_MAX;
+}
+
+int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
+{
+ if (counter->usage + val > counter->limit) {
+ counter->failcnt++;
+ return -ENOMEM;
+ }
+
+ counter->usage += val;
+ if (counter->usage > counter->max_usage)
+ counter->max_usage = counter->usage;
+ return 0;
+}
+
+int res_counter_charge(struct res_counter *counter, unsigned long val)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ ret = res_counter_charge_locked(counter, val);
+ spin_unlock_irqrestore(&counter->lock, flags);
+ return ret;
+}
+
+void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
+{
+ if (WARN_ON(counter->usage < val))
+ val = counter->usage;
+
+ counter->usage -= val;
+}
+
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ res_counter_uncharge_locked(counter, val);
+ spin_unlock_irqrestore(&counter->lock, flags);
+}
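+
+/*
+ * Illustrative sketch (not part of this file; my_counter is a
+ * hypothetical caller-owned object): a controller charging one page
+ * against its counter and backing the charge out again later.
+ *
+ *	if (res_counter_charge(&my_counter, PAGE_SIZE))
+ *		return -ENOMEM;
+ *	...
+ *	res_counter_uncharge(&my_counter, PAGE_SIZE);
+ */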
+
+
+static inline unsigned long long *
+res_counter_member(struct res_counter *counter, int member)
+{
+ switch (member) {
+ case RES_USAGE:
+ return &counter->usage;
+ case RES_MAX_USAGE:
+ return &counter->max_usage;
+ case RES_LIMIT:
+ return &counter->limit;
+ case RES_FAILCNT:
+ return &counter->failcnt;
+ };
+
+ BUG();
+ return NULL;
+}
+
+ssize_t res_counter_read(struct res_counter *counter, int member,
+ const char __user *userbuf, size_t nbytes, loff_t *pos,
+ int (*read_strategy)(unsigned long long val, char *st_buf))
+{
+ unsigned long long *val;
+ char buf[64], *s;
+
+ s = buf;
+ val = res_counter_member(counter, member);
+ if (read_strategy)
+ s += read_strategy(*val, s);
+ else
+ s += sprintf(s, "%llu\n", *val);
+ return simple_read_from_buffer((void __user *)userbuf, nbytes,
+ pos, buf, s - buf);
+}
+
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+ return *res_counter_member(counter, member);
+}
+
+int res_counter_memparse_write_strategy(const char *buf,
+ unsigned long long *res)
+{
+ char *end;
+ /* FIXME - make memparse() take const char* args */
+ *res = memparse((char *)buf, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ *res = PAGE_ALIGN(*res);
+ return 0;
+}
+
+int res_counter_write(struct res_counter *counter, int member,
+ const char *buf, write_strategy_fn write_strategy)
+{
+ char *end;
+ unsigned long flags;
+ unsigned long long tmp, *val;
+
+ if (write_strategy) {
+ if (write_strategy(buf, &tmp))
+ return -EINVAL;
+ } else {
+ tmp = simple_strtoull(buf, &end, 10);
+ if (*end != '\0')
+ return -EINVAL;
+ }
+ spin_lock_irqsave(&counter->lock, flags);
+ val = res_counter_member(counter, member);
+ *val = tmp;
+ spin_unlock_irqrestore(&counter->lock, flags);
+ return 0;
+}
diff --git a/kernel/resource.c b/kernel/resource.c
new file mode 100644
index 0000000..e633106
--- /dev/null
+++ b/kernel/resource.c
@@ -0,0 +1,878 @@
+/*
+ * linux/kernel/resource.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 1999 Martin Mares <mj@ucw.cz>
+ *
+ * Arbitrary resource management.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/pfn.h>
+#include <asm/io.h>
+
+
+struct resource ioport_resource = {
+ .name = "PCI IO",
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .flags = IORESOURCE_IO,
+};
+EXPORT_SYMBOL(ioport_resource);
+
+struct resource iomem_resource = {
+ .name = "PCI mem",
+ .start = 0,
+ .end = -1,
+ .flags = IORESOURCE_MEM,
+};
+EXPORT_SYMBOL(iomem_resource);
+
+static DEFINE_RWLOCK(resource_lock);
+
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+ (*pos)++;
+ if (p->child)
+ return p->child;
+ while (!p->sibling && p->parent)
+ p = p->parent;
+ return p->sibling;
+}
+
+#ifdef CONFIG_PROC_FS
+
+enum { MAX_IORES_LEVEL = 5 };
+
+static void *r_start(struct seq_file *m, loff_t *pos)
+ __acquires(resource_lock)
+{
+ struct resource *p = m->private;
+ loff_t l = 0;
+ read_lock(&resource_lock);
+ for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
+ ;
+ return p;
+}
+
+static void r_stop(struct seq_file *m, void *v)
+ __releases(resource_lock)
+{
+ read_unlock(&resource_lock);
+}
+
+static int r_show(struct seq_file *m, void *v)
+{
+ struct resource *root = m->private;
+ struct resource *r = v, *p;
+ int width = root->end < 0x10000 ? 4 : 8;
+ int depth;
+
+ for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
+ if (p->parent == root)
+ break;
+ seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
+ depth * 2, "",
+ width, (unsigned long long) r->start,
+ width, (unsigned long long) r->end,
+ r->name ? r->name : "<BAD>");
+ return 0;
+}
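+
+/*
+ * Example of the resulting output (illustrative /proc/ioports lines;
+ * children are indented two spaces per nesting level, up to
+ * MAX_IORES_LEVEL deep):
+ *
+ *	0060-0060 : keyboard
+ *	d000-dfff : PCI Bus #04
+ *	  d000-d0ff : 0000:04:00.0
+ */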
+
+static const struct seq_operations resource_op = {
+ .start = r_start,
+ .next = r_next,
+ .stop = r_stop,
+ .show = r_show,
+};
+
+static int ioports_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &resource_op);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+ m->private = &ioport_resource;
+ }
+ return res;
+}
+
+static int iomem_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &resource_op);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+ m->private = &iomem_resource;
+ }
+ return res;
+}
+
+static const struct file_operations proc_ioports_operations = {
+ .open = ioports_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations proc_iomem_operations = {
+ .open = iomem_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init ioresources_init(void)
+{
+ proc_create("ioports", 0, NULL, &proc_ioports_operations);
+ proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ return 0;
+}
+__initcall(ioresources_init);
+
+#endif /* CONFIG_PROC_FS */
+
+/* Return the conflict entry if you can't request it */
+static struct resource * __request_resource(struct resource *root, struct resource *new)
+{
+ resource_size_t start = new->start;
+ resource_size_t end = new->end;
+ struct resource *tmp, **p;
+
+ if (end < start)
+ return root;
+ if (start < root->start)
+ return root;
+ if (end > root->end)
+ return root;
+ p = &root->child;
+ for (;;) {
+ tmp = *p;
+ if (!tmp || tmp->start > end) {
+ new->sibling = tmp;
+ *p = new;
+ new->parent = root;
+ return NULL;
+ }
+ p = &tmp->sibling;
+ if (tmp->end < start)
+ continue;
+ return tmp;
+ }
+}
+
+static int __release_resource(struct resource *old)
+{
+ struct resource *tmp, **p;
+
+ p = &old->parent->child;
+ for (;;) {
+ tmp = *p;
+ if (!tmp)
+ break;
+ if (tmp == old) {
+ *p = tmp->sibling;
+ old->parent = NULL;
+ return 0;
+ }
+ p = &tmp->sibling;
+ }
+ return -EINVAL;
+}
+
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
+int request_resource(struct resource *root, struct resource *new)
+{
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+ conflict = __request_resource(root, new);
+ write_unlock(&resource_lock);
+ return conflict ? -EBUSY : 0;
+}
+
+EXPORT_SYMBOL(request_resource);
+
+/**
+ * release_resource - release a previously reserved resource
+ * @old: resource pointer
+ */
+int release_resource(struct resource *old)
+{
+ int retval;
+
+ write_lock(&resource_lock);
+ retval = __release_resource(old);
+ write_unlock(&resource_lock);
+ return retval;
+}
+
+EXPORT_SYMBOL(release_resource);
+
+#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+/*
+ * Finds the lowest memory resource that exists within [res->start, res->end).
+ * The caller must specify res->start, res->end and res->flags.
+ * If found, returns 0 and res is overwritten; if not found, returns -1.
+ */
+static int find_next_system_ram(struct resource *res)
+{
+ resource_size_t start, end;
+ struct resource *p;
+
+ BUG_ON(!res);
+
+ start = res->start;
+ end = res->end;
+ BUG_ON(start >= end);
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ /* system ram is just marked as IORESOURCE_MEM */
+ if (p->flags != res->flags)
+ continue;
+ if (p->start > end) {
+ p = NULL;
+ break;
+ }
+ if ((p->end >= start) && (p->start < end))
+ break;
+ }
+ read_unlock(&resource_lock);
+ if (!p)
+ return -1;
+ /* copy data */
+ if (res->start < p->start)
+ res->start = p->start;
+ if (res->end > p->end)
+ res->end = p->end;
+ return 0;
+}
+int
+walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
+ int (*func)(unsigned long, unsigned long, void *))
+{
+ struct resource res;
+ unsigned long pfn, len;
+ u64 orig_end;
+ int ret = -1;
+ res.start = (u64) start_pfn << PAGE_SHIFT;
+ res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
+ pfn = (unsigned long)(res.start >> PAGE_SHIFT);
+ len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
+ ret = (*func)(pfn, len, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
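+
+/*
+ * Illustrative sketch (hypothetical callback, not part of this file):
+ * counting the System RAM pages in a pfn range via walk_memory_resource().
+ *
+ *	static int count_pages(unsigned long pfn, unsigned long nr, void *arg)
+ *	{
+ *		*(unsigned long *)arg += nr;
+ *		return 0;
+ *	}
+ *	...
+ *	unsigned long total = 0;
+ *	walk_memory_resource(start_pfn, nr_pages, &total, count_pages);
+ */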
+
+#endif
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+ resource_size_t size, resource_size_t min,
+ resource_size_t max, resource_size_t align,
+ void (*alignf)(void *, struct resource *,
+ resource_size_t, resource_size_t),
+ void *alignf_data)
+{
+ struct resource *this = root->child;
+
+ new->start = root->start;
+ /*
+ * Skip past an allocated resource that starts at 0, since the assignment
+ * of this->start - 1 to new->end below would cause an underflow.
+ */
+ if (this && this->start == 0) {
+ new->start = this->end + 1;
+ this = this->sibling;
+ }
+ for(;;) {
+ if (this)
+ new->end = this->start - 1;
+ else
+ new->end = root->end;
+ if (new->start < min)
+ new->start = min;
+ if (new->end > max)
+ new->end = max;
+ new->start = ALIGN(new->start, align);
+ if (alignf)
+ alignf(alignf_data, new, size, align);
+ if (new->start < new->end && new->end - new->start >= size - 1) {
+ new->end = new->start + size - 1;
+ return 0;
+ }
+ if (!this)
+ break;
+ new->start = this->end + 1;
+ this = this->sibling;
+ }
+ return -EBUSY;
+}
+
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: minimum boundary to allocate
+ * @max: maximum boundary to allocate
+ * @align: alignment requested, in bytes
+ * @alignf: alignment function, optional, called if not NULL
+ * @alignf_data: arbitrary data to pass to the @alignf function
+ */
+int allocate_resource(struct resource *root, struct resource *new,
+ resource_size_t size, resource_size_t min,
+ resource_size_t max, resource_size_t align,
+ void (*alignf)(void *, struct resource *,
+ resource_size_t, resource_size_t),
+ void *alignf_data)
+{
+ int err;
+
+ write_lock(&resource_lock);
+ err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+ if (err >= 0 && __request_resource(root, new))
+ err = -EBUSY;
+ write_unlock(&resource_lock);
+ return err;
+}
+
+EXPORT_SYMBOL(allocate_resource);
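+
+/*
+ * Illustrative sketch (hypothetical values, not part of this file):
+ * asking for a 4K, 4K-aligned slot anywhere inside a parent resource.
+ *
+ *	struct resource *r = kzalloc(sizeof(*r), GFP_KERNEL);
+ *
+ *	r->name = "example";
+ *	if (allocate_resource(&iomem_resource, r, 0x1000, 0,
+ *			      (resource_size_t)-1, 0x1000, NULL, NULL))
+ *		...  the request could not be satisfied ...
+ */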
+
+/*
+ * Insert a resource into the resource tree. If successful, return NULL,
+ * otherwise return the conflicting resource (compare to __request_resource())
+ */
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
+{
+ struct resource *first, *next;
+
+ for (;; parent = first) {
+ first = __request_resource(parent, new);
+ if (!first)
+ return first;
+
+ if (first == parent)
+ return first;
+
+ if ((first->start > new->start) || (first->end < new->end))
+ break;
+ if ((first->start == new->start) && (first->end == new->end))
+ break;
+ }
+
+ for (next = first; ; next = next->sibling) {
+ /* Partial overlap? Bad, and unfixable */
+ if (next->start < new->start || next->end > new->end)
+ return next;
+ if (!next->sibling)
+ break;
+ if (next->sibling->start > new->end)
+ break;
+ }
+
+ new->parent = parent;
+ new->sibling = next->sibling;
+ new->child = first;
+
+ next->sibling = NULL;
+ for (next = first; next; next = next->sibling)
+ next->parent = new;
+
+ if (parent->child == first) {
+ parent->child = new;
+ } else {
+ next = parent->child;
+ while (next->sibling != first)
+ next = next->sibling;
+ next->sibling = new;
+ }
+ return NULL;
+}
+
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+ conflict = __insert_resource(parent, new);
+ write_unlock(&resource_lock);
+ return conflict ? -EBUSY : 0;
+}
+
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @root: root resource descriptor
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+ if (new->parent)
+ return;
+
+ write_lock(&resource_lock);
+ for (;;) {
+ struct resource *conflict;
+
+ conflict = __insert_resource(root, new);
+ if (!conflict)
+ break;
+ if (conflict == root)
+ break;
+
+ /* Ok, expand resource to cover the conflict, then try again .. */
+ if (conflict->start < new->start)
+ new->start = conflict->start;
+ if (conflict->end > new->end)
+ new->end = conflict->end;
+
+ printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ }
+ write_unlock(&resource_lock);
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
+{
+ struct resource *tmp, *parent = res->parent;
+ resource_size_t end = start + size - 1;
+ int result = -EBUSY;
+
+ write_lock(&resource_lock);
+
+ if ((start < parent->start) || (end > parent->end))
+ goto out;
+
+ for (tmp = res->child; tmp; tmp = tmp->sibling) {
+ if ((tmp->start < start) || (tmp->end > end))
+ goto out;
+ }
+
+ if (res->sibling && (res->sibling->start <= end))
+ goto out;
+
+ tmp = parent->child;
+ if (tmp != res) {
+ while (tmp->sibling != res)
+ tmp = tmp->sibling;
+ if (start <= tmp->end)
+ goto out;
+ }
+
+ res->start = start;
+ res->end = end;
+ result = 0;
+
+ out:
+ write_unlock(&resource_lock);
+ return result;
+}
+
+static void __init __reserve_region_with_split(struct resource *root,
+ resource_size_t start, resource_size_t end,
+ const char *name)
+{
+ struct resource *parent = root;
+ struct resource *conflict;
+ struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+
+ if (!res)
+ return;
+
+ res->name = name;
+ res->start = start;
+ res->end = end;
+ res->flags = IORESOURCE_BUSY;
+
+ for (;;) {
+ conflict = __request_resource(parent, res);
+ if (!conflict)
+ break;
+ if (conflict != parent) {
+ parent = conflict;
+ if (!(conflict->flags & IORESOURCE_BUSY))
+ continue;
+ }
+
+ /* Uhhuh, that didn't work out.. */
+ kfree(res);
+ res = NULL;
+ break;
+ }
+
+ if (!res) {
+ /* failed, split and try again */
+
+ /* conflict covered whole area */
+ if (conflict->start <= start && conflict->end >= end)
+ return;
+
+ if (conflict->start > start)
+ __reserve_region_with_split(root, start, conflict->start-1, name);
+ if (!(conflict->flags & IORESOURCE_BUSY)) {
+ resource_size_t common_start, common_end;
+
+ common_start = max(conflict->start, start);
+ common_end = min(conflict->end, end);
+ if (common_start < common_end)
+ __reserve_region_with_split(root, common_start, common_end, name);
+ }
+ if (conflict->end < end)
+ __reserve_region_with_split(root, conflict->end+1, end, name);
+ }
+
+}
+
+void __init reserve_region_with_split(struct resource *root,
+ resource_size_t start, resource_size_t end,
+ const char *name)
+{
+ write_lock(&resource_lock);
+ __reserve_region_with_split(root, start, end, name);
+ write_unlock(&resource_lock);
+}
+
+EXPORT_SYMBOL(adjust_resource);
+
+/**
+ * resource_alignment - calculate resource's alignment
+ * @res: resource pointer
+ *
+ * Returns alignment on success, 0 (invalid alignment) on failure.
+ */
+resource_size_t resource_alignment(struct resource *res)
+{
+ switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
+ case IORESOURCE_SIZEALIGN:
+ return resource_size(res);
+ case IORESOURCE_STARTALIGN:
+ return res->start;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * This is compatibility stuff for IO resources.
+ *
+ * Note how this, unlike the above, knows about
+ * the IO flag meanings (busy etc).
+ *
+ * request_region creates a new busy region.
+ *
+ * check_region returns non-zero if the area is already busy.
+ *
+ * release_region releases a matching busy region.
+ */
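+
+/*
+ * Illustrative driver-side sketch (hypothetical port range and name):
+ * the request_region()/release_region() wrappers around
+ * __request_region()/__release_region() are the usual way to claim a
+ * busy region.
+ *
+ *	if (!request_region(0x3f8, 8, "example_uart"))
+ *		return -EBUSY;
+ *	...
+ *	release_region(0x3f8, 8);
+ */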
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ */
+struct resource * __request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name)
+{
+ struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+
+ if (!res)
+ return NULL;
+
+ res->name = name;
+ res->start = start;
+ res->end = start + n - 1;
+ res->flags = IORESOURCE_BUSY;
+
+ write_lock(&resource_lock);
+
+ for (;;) {
+ struct resource *conflict;
+
+ conflict = __request_resource(parent, res);
+ if (!conflict)
+ break;
+ if (conflict != parent) {
+ parent = conflict;
+ if (!(conflict->flags & IORESOURCE_BUSY))
+ continue;
+ }
+
+ /* Uhhuh, that didn't work out.. */
+ kfree(res);
+ res = NULL;
+ break;
+ }
+ write_unlock(&resource_lock);
+ return res;
+}
+EXPORT_SYMBOL(__request_region);
+
+/**
+ * __check_region - check if a resource region is busy or free
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * Returns 0 if the region is free at the moment it is checked,
+ * returns %-EBUSY if the region is busy.
+ *
+ * NOTE:
+ * This function is deprecated because its use is racy.
+ * Even if it returns 0, a subsequent call to request_region()
+ * may fail because another driver etc. just allocated the region.
+ * Do NOT use it. It will be removed from the kernel.
+ */
+int __check_region(struct resource *parent, resource_size_t start,
+ resource_size_t n)
+{
+ struct resource * res;
+
+ res = __request_region(parent, start, n, "check-region");
+ if (!res)
+ return -EBUSY;
+
+ release_resource(res);
+ kfree(res);
+ return 0;
+}
+EXPORT_SYMBOL(__check_region);
+
+/**
+ * __release_region - release a previously reserved resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * The described resource region must match a currently busy region.
+ */
+void __release_region(struct resource *parent, resource_size_t start,
+ resource_size_t n)
+{
+ struct resource **p;
+ resource_size_t end;
+
+ p = &parent->child;
+ end = start + n - 1;
+
+ write_lock(&resource_lock);
+
+ for (;;) {
+ struct resource *res = *p;
+
+ if (!res)
+ break;
+ if (res->start <= start && res->end >= end) {
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+ if (res->start != start || res->end != end)
+ break;
+ *p = res->sibling;
+ write_unlock(&resource_lock);
+ kfree(res);
+ return;
+ }
+ p = &res->sibling;
+ }
+
+ write_unlock(&resource_lock);
+
+ printk(KERN_WARNING "Trying to free nonexistent resource "
+ "<%016llx-%016llx>\n", (unsigned long long)start,
+ (unsigned long long)end);
+}
+EXPORT_SYMBOL(__release_region);
+
+/*
+ * Managed region resource
+ */
+struct region_devres {
+ struct resource *parent;
+ resource_size_t start;
+ resource_size_t n;
+};
+
+static void devm_region_release(struct device *dev, void *res)
+{
+ struct region_devres *this = res;
+
+ __release_region(this->parent, this->start, this->n);
+}
+
+static int devm_region_match(struct device *dev, void *res, void *match_data)
+{
+ struct region_devres *this = res, *match = match_data;
+
+ return this->parent == match->parent &&
+ this->start == match->start && this->n == match->n;
+}
+
+struct resource * __devm_request_region(struct device *dev,
+ struct resource *parent, resource_size_t start,
+ resource_size_t n, const char *name)
+{
+ struct region_devres *dr = NULL;
+ struct resource *res;
+
+ dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
+ GFP_KERNEL);
+ if (!dr)
+ return NULL;
+
+ dr->parent = parent;
+ dr->start = start;
+ dr->n = n;
+
+ res = __request_region(parent, start, n, name);
+ if (res)
+ devres_add(dev, dr);
+ else
+ devres_free(dr);
+
+ return res;
+}
+EXPORT_SYMBOL(__devm_request_region);
+
+void __devm_release_region(struct device *dev, struct resource *parent,
+ resource_size_t start, resource_size_t n)
+{
+ struct region_devres match_data = { parent, start, n };
+
+ __release_region(parent, start, n);
+ WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
+ &match_data));
+}
+EXPORT_SYMBOL(__devm_release_region);
+
+/*
+ * Called from init/main.c to reserve IO ports.
+ */
+#define MAXRESERVE 4
+static int __init reserve_setup(char *str)
+{
+ static int reserved;
+ static struct resource reserve[MAXRESERVE];
+
+ for (;;) {
+ int io_start, io_num;
+ int x = reserved;
+
+ if (get_option (&str, &io_start) != 2)
+ break;
+ if (get_option (&str, &io_num) == 0)
+ break;
+ if (x < MAXRESERVE) {
+ struct resource *res = reserve + x;
+ res->name = "reserved";
+ res->start = io_start;
+ res->end = io_start + io_num - 1;
+ res->flags = IORESOURCE_BUSY;
+ res->child = NULL;
+ if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+ reserved = x+1;
+ }
+ }
+ return 1;
+}
+
+__setup("reserve=", reserve_setup);
+
+/*
+ * Check if the requested addr and size span more than any single slot in the
+ * iomem resource tree.
+ */
+int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
+{
+ struct resource *p = &iomem_resource;
+ int err = 0;
+ loff_t l;
+
+ read_lock(&resource_lock);
+ for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ /*
+ * We can probably skip the resources without
+ * IORESOURCE_IO attribute?
+ */
+ if (p->start >= addr + size)
+ continue;
+ if (p->end < addr)
+ continue;
+ if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
+ PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ continue;
+ /*
+ * if a resource is "BUSY", it's not a hardware resource
+ * but a driver mapping of such a resource; we don't want
+ * to warn for those; some drivers legitimately map only
+ * partial hardware resources. (example: vesafb)
+ */
+ if (p->flags & IORESOURCE_BUSY)
+ continue;
+
+ printk(KERN_WARNING "resource map sanity check conflict: "
+ "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
+ (unsigned long long)addr,
+ (unsigned long long)(addr + size - 1),
+ (unsigned long long)p->start,
+ (unsigned long long)p->end,
+ p->name);
+ err = -1;
+ break;
+ }
+ read_unlock(&resource_lock);
+
+ return err;
+}
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 0000000..5fcb4fe
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,239 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Copyright (C) 2006 Esben Nielsen
+ * Copyright (C) 2006 Kihon Technologies Inc.,
+ * Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/debug_locks.h>
+
+#include "rtmutex_common.h"
+
+# define TRACE_WARN_ON(x) WARN_ON(x)
+# define TRACE_BUG_ON(x) BUG_ON(x)
+
+# define TRACE_OFF() \
+do { \
+ if (rt_trace_on) { \
+ rt_trace_on = 0; \
+ console_verbose(); \
+ if (spin_is_locked(&current->pi_lock)) \
+ spin_unlock(&current->pi_lock); \
+ } \
+} while (0)
+
+# define TRACE_OFF_NOLOCK() \
+do { \
+ if (rt_trace_on) { \
+ rt_trace_on = 0; \
+ console_verbose(); \
+ } \
+} while (0)
+
+# define TRACE_BUG_LOCKED() \
+do { \
+ TRACE_OFF(); \
+ BUG(); \
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c) \
+do { \
+ if (unlikely(c)) { \
+ TRACE_OFF(); \
+ WARN_ON(1); \
+ } \
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c) \
+do { \
+ if (unlikely(c)) \
+ TRACE_BUG_LOCKED(); \
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+static int rt_trace_on = 1;
+
+static void printk_task(struct task_struct *p)
+{
+ if (p)
+ printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
+ else
+ printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+ if (lock->name)
+ printk(" [%p] {%s}\n",
+ lock, lock->name);
+ else
+ printk(" [%p] {%s:%d}\n",
+ lock, lock->file, lock->line);
+
+ if (print_owner && rt_mutex_owner(lock)) {
+ printk(".. ->owner: %p\n", lock->owner);
+ printk(".. held by: ");
+ printk_task(rt_mutex_owner(lock));
+ printk("\n");
+ }
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ WARN_ON(!plist_head_empty(&task->pi_waiters));
+ WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+ struct rt_mutex *lock)
+{
+ struct task_struct *task;
+
+ if (!rt_trace_on || detect || !act_waiter)
+ return;
+
+ task = rt_mutex_owner(act_waiter->lock);
+ if (task && task != current) {
+ act_waiter->deadlock_task_pid = get_pid(task_pid(task));
+ act_waiter->deadlock_lock = lock;
+ }
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+ struct task_struct *task;
+
+ if (!waiter->deadlock_lock || !rt_trace_on)
+ return;
+
+ rcu_read_lock();
+ task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
+ return;
+ }
+
+ TRACE_OFF_NOLOCK();
+
+ printk("\n============================================\n");
+ printk( "[ BUG: circular locking deadlock detected! ]\n");
+ printk( "--------------------------------------------\n");
+ printk("%s/%d is deadlocking current task %s/%d\n\n",
+ task->comm, task_pid_nr(task),
+ current->comm, task_pid_nr(current));
+
+ printk("\n1) %s/%d is trying to acquire this lock:\n",
+ current->comm, task_pid_nr(current));
+ printk_lock(waiter->lock, 1);
+
+ printk("\n2) %s/%d is blocked on this lock:\n",
+ task->comm, task_pid_nr(task));
+ printk_lock(waiter->deadlock_lock, 1);
+
+ debug_show_held_locks(current);
+ debug_show_held_locks(task);
+
+ printk("\n%s/%d's [blocked] stackdump:\n\n",
+ task->comm, task_pid_nr(task));
+ show_stack(task, NULL);
+ printk("\n%s/%d's [current] stackdump:\n\n",
+ current->comm, task_pid_nr(current));
+ dump_stack();
+ debug_show_all_locks();
+ rcu_read_unlock();
+
+ printk("[ turning off deadlock detection."
+ "Please report this trace. ]\n\n");
+ local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock)
+{
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+ TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+}
+
+void
+debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
+{
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ memset(waiter, 0x11, sizeof(*waiter));
+ plist_node_init(&waiter->list_entry, MAX_PRIO);
+ plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+ waiter->deadlock_task_pid = NULL;
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ put_pid(waiter->deadlock_task_pid);
+ TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+ TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+ TRACE_WARN_ON(waiter->task);
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lock->name = name;
+}
+
+void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 0000000..14193d5
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,33 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+ struct task_struct *powner);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w) \
+ do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ int detect)
+{
+ return (waiter != NULL);
+}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 0000000..a56f629
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,443 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/sysdev.h>
+#include <linux/timer.h>
+#include <linux/freezer.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS 8
+#define MAX_RT_TEST_MUTEXES 8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+ int opcode;
+ int opdata;
+ int mutexes[MAX_RT_TEST_MUTEXES];
+ int bkl;
+ int event;
+ struct sys_device sysdev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static struct task_struct *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+ RTTEST_NOP = 0,
+ RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
+ RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
+ RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
+ RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
+ RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
+ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
+ RTTEST_LOCKBKL, /* 9 Lock BKL */
+ RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
+ RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
+ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
+ RTTEST_RESET = 99, /* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+ int i, id, ret = -EINVAL;
+
+ switch(td->opcode) {
+
+ case RTTEST_NOP:
+ return 0;
+
+ case RTTEST_LOCKCONT:
+ td->mutexes[td->opdata] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return 0;
+
+ case RTTEST_RESET:
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+ if (td->mutexes[i] == 4) {
+ rt_mutex_unlock(&mutexes[i]);
+ td->mutexes[i] = 0;
+ }
+ }
+
+ if (!lockwakeup && td->bkl == 4) {
+ unlock_kernel();
+ td->bkl = 0;
+ }
+ return 0;
+
+ case RTTEST_RESETEVENT:
+ atomic_set(&rttest_event, 0);
+ return 0;
+
+ default:
+ if (lockwakeup)
+ return ret;
+ }
+
+ switch(td->opcode) {
+
+ case RTTEST_LOCK:
+ case RTTEST_LOCKNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_lock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 4;
+ return 0;
+
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKINTNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = ret ? 0 : 4;
+ return ret ? -EINTR : 0;
+
+ case RTTEST_UNLOCK:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+ return ret;
+
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_unlock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 0;
+ return 0;
+
+ case RTTEST_LOCKBKL:
+ if (td->bkl)
+ return 0;
+ td->bkl = 1;
+ lock_kernel();
+ td->bkl = 4;
+ return 0;
+
+ case RTTEST_UNLOCKBKL:
+ if (td->bkl != 4)
+ break;
+ unlock_kernel();
+ td->bkl = 0;
+ return 0;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have fine-grained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+ int tid, op, dat;
+ struct test_thread_data *td;
+
+ /* We have to look up the task */
+ for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+ if (threads[tid] == current)
+ break;
+ }
+
+ BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+ td = &thread_data[tid];
+
+ op = td->opcode;
+ dat = td->opdata;
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ break;
+
+ if (td->mutexes[dat] != 1)
+ break;
+
+ td->mutexes[dat] = 2;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKBKL:
+ default:
+ break;
+ }
+
+ schedule();
+
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 3;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return;
+
+ case RTTEST_LOCKBKL:
+ return;
+ default:
+ return;
+ }
+
+ td->opcode = 0;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ int ret;
+
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 1);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (td->opcode == RTTEST_LOCKCONT)
+ break;
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ }
+
+ /* Restore previous command and data */
+ td->opcode = op;
+ td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+ struct test_thread_data *td = data;
+ int ret;
+
+ current->flags |= PF_MUTEX_TESTER;
+ set_freezable();
+ allow_signal(SIGHUP);
+
+ for(;;) {
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 0);
+ set_current_state(TASK_INTERRUPTIBLE);
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ try_to_freeze();
+
+ if (signal_pending(current))
+ flush_signals(current);
+
+ if(kthread_should_stop())
+ break;
+ }
+ return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev: thread reference
+ * @buf: command for actual step
+ * @count: length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct sched_param schedpar;
+ struct test_thread_data *td;
+ char cmdbuf[32];
+ int op, dat, tid, ret;
+
+ td = container_of(dev, struct test_thread_data, sysdev);
+ tid = td->sysdev.id;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (count >= sizeof(cmdbuf))
+ return -EINVAL;
+
+ /* strip off the trailing \n: */
+ if (buf[count-1] == '\n')
+ count--;
+ if (count < 1)
+ return -EINVAL;
+
+ memcpy(cmdbuf, buf, count);
+ cmdbuf[count] = 0;
+
+ if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+ return -EINVAL;
+
+ switch (op) {
+ case RTTEST_SCHEDOT:
+ schedpar.sched_priority = 0;
+ ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
+ if (ret)
+ return ret;
+ set_user_nice(current, 0);
+ break;
+
+ case RTTEST_SCHEDRT:
+ schedpar.sched_priority = dat;
+ ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
+ if (ret)
+ return ret;
+ break;
+
+ case RTTEST_SIGNAL:
+ send_sig(SIGHUP, threads[tid], 0);
+ break;
+
+ default:
+ if (td->opcode > 0)
+ return -EBUSY;
+ td->opdata = dat;
+ td->opcode = op;
+ wake_up_process(threads[tid]);
+ }
+
+ return count;
+}
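+
+/*
+ * Illustrative usage (the sysfs path follows the usual sysdev layout and
+ * may differ): make thread 0 SCHED_FIFO priority 80, take mutex 0, then
+ * inspect its state.
+ *
+ *	echo "2:80" > /sys/devices/system/rttest/rttest0/command
+ *	echo "3:0"  > /sys/devices/system/rttest/rttest0/command
+ *	cat /sys/devices/system/rttest/rttest0/status
+ */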
+
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev: thread to query
+ * @buf: char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+ char *buf)
+{
+ struct test_thread_data *td;
+ struct task_struct *tsk;
+ char *curr = buf;
+ int i;
+
+ td = container_of(dev, struct test_thread_data, sysdev);
+ tsk = threads[td->sysdev.id];
+
+ spin_lock(&rttest_lock);
+
+ curr += sprintf(curr,
+ "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+ td->opcode, td->event, tsk->state,
+ (MAX_RT_PRIO - 1) - tsk->prio,
+ (MAX_RT_PRIO - 1) - tsk->normal_prio,
+ tsk->pi_blocked_on, td->bkl);
+
+ for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+ curr += sprintf(curr, "%d", td->mutexes[i]);
+
+ spin_unlock(&rttest_lock);
+
+ curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+ mutexes[td->sysdev.id].owner);
+
+ return curr - buf;
+}
+
+static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
+static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+
+static struct sysdev_class rttest_sysclass = {
+ .name = "rttest",
+};
+
+static int init_test_thread(int id)
+{
+ thread_data[id].sysdev.cls = &rttest_sysclass;
+ thread_data[id].sysdev.id = id;
+
+ threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+ if (IS_ERR(threads[id]))
+ return PTR_ERR(threads[id]);
+
+ return sysdev_register(&thread_data[id].sysdev);
+}
+
+static int init_rttest(void)
+{
+ int ret, i;
+
+ spin_lock_init(&rttest_lock);
+
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+ rt_mutex_init(&mutexes[i]);
+
+ ret = sysdev_class_register(&rttest_sysclass);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+ ret = init_test_thread(i);
+ if (ret)
+ break;
+ ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+ if (ret)
+ break;
+ ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+ if (ret)
+ break;
+ }
+
+ printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+ return ret;
+}
+
+device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 0000000..69d9cb9
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,1006 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ * Copyright (C) 2006 Esben Nielsen
+ *
+ * See Documentation/rt-mutex-design.txt for details.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner bit1 bit0
+ * NULL 0 0 lock is free (fast acquire possible)
+ * NULL 0 1 invalid state
+ * NULL 1 0 Transitional State*
+ * NULL 1 1 invalid state
+ * taskpointer 0 0 lock is held (fast release possible)
+ * taskpointer 0 1 task is pending owner
+ * taskpointer 1 0 lock is held and has waiters
+ * taskpointer 1 1 task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock, when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ *
+ * (*) There is a brief window where the owner can be NULL and the
+ * "lock has waiters" bit is set. This can happen when grabbing the lock.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to set this
+ * bit before looking at the lock, hence the reason this is a transitional
+ * state.
+ */
+
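+/*
+ * Illustrative decode (a sketch, not used by this file) of the encoding
+ * described above: the owner task_struct pointer lives in the upper bits
+ * of lock->owner, bit 0 marks a pending owner and bit 1 is
+ * RT_MUTEX_HAS_WAITERS.
+ *
+ *	unsigned long val = (unsigned long)lock->owner;
+ *	struct task_struct *owner = (struct task_struct *)(val & ~3UL);
+ *	int pending = val & 1;
+ *	int has_waiters = val & RT_MUTEX_HAS_WAITERS;
+ */
+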
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask)
+{
+ unsigned long val = (unsigned long)owner | mask;
+
+ if (rt_mutex_has_waiters(lock))
+ val |= RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return task->normal_prio;
+
+ return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+ task->normal_prio);
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+ int prio = rt_mutex_getprio(task);
+
+ if (task->prio != prio)
+ rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ * Returns 0 or -EDEADLK.
+ */
+static int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
+{
+ struct rt_mutex *lock;
+ struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+ int detect_deadlock, ret = 0, depth = 0;
+ unsigned long flags;
+
+ detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
+ deadlock_detect);
+
+ /*
+ * The (de)boosting is a step by step approach with a lot of
+ * pitfalls. We want this to be preemptible and we want to hold a
+ * maximum of two locks per step. So we have to check
+ * carefully whether things change under us.
+ */
+ again:
+ if (++depth > max_lock_depth) {
+ static int prev_max;
+
+ /*
+ * Print this only once. If the admin changes the limit,
+ * print a new message when reaching the limit again.
+ */
+ if (prev_max != max_lock_depth) {
+ prev_max = max_lock_depth;
+ printk(KERN_WARNING "Maximum lock depth %d reached "
+ "task: %s (%d)\n", max_lock_depth,
+ top_task->comm, task_pid_nr(top_task));
+ }
+ put_task_struct(task);
+
+ return deadlock_detect ? -EDEADLK : 0;
+ }
+ retry:
+ /*
+ * The task cannot go away as we did a get_task_struct() before!
+ */
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ /*
+ * Check whether the end of the boosting chain has been
+ * reached or the state of the chain has changed while we
+ * dropped the locks.
+ */
+ if (!waiter || !waiter->task)
+ goto out_unlock_pi;
+
+ /*
+ * Check the orig_waiter state. After we dropped the locks,
+ * the previous owner of the lock might have released the lock
+ * and made us the pending owner:
+ */
+ if (orig_waiter && !orig_waiter->task)
+ goto out_unlock_pi;
+
+ /*
+ * Drop out, when the task has no waiters. Note,
+ * top_waiter can be NULL, when we are in the deboosting
+ * mode!
+ */
+ if (top_waiter && (!task_has_pi_waiters(task) ||
+ top_waiter != task_top_pi_waiter(task)))
+ goto out_unlock_pi;
+
+ /*
+ * When deadlock detection is off then we check, if further
+ * priority adjustment is necessary.
+ */
+ if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+ goto out_unlock_pi;
+
+ lock = waiter->lock;
+ if (!spin_trylock(&lock->wait_lock)) {
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ cpu_relax();
+ goto retry;
+ }
+
+ /* Deadlock detection */
+ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+ debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+ spin_unlock(&lock->wait_lock);
+ ret = deadlock_detect ? -EDEADLK : 0;
+ goto out_unlock_pi;
+ }
+
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* Requeue the waiter */
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->list_entry.prio = task->prio;
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ /* Release the task */
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /* Grab the next task */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ /* Boost the owner */
+ plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
+ waiter->pi_list_entry.prio = waiter->list_entry.prio;
+ plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+
+ } else if (top_waiter == waiter) {
+ /* Deboost the owner */
+ plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+ waiter = rt_mutex_top_waiter(lock);
+ waiter->pi_list_entry.prio = waiter->list_entry.prio;
+ plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+ }
+
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ top_waiter = rt_mutex_top_waiter(lock);
+ spin_unlock(&lock->wait_lock);
+
+ if (!detect_deadlock && waiter != top_waiter)
+ goto out_put_task;
+
+ goto again;
+
+ out_unlock_pi:
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+ put_task_struct(task);
+
+ return ret;
+}
+
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+ struct task_struct *pendowner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *next;
+ unsigned long flags;
+
+ if (!rt_mutex_owner_pending(lock))
+ return 0;
+
+ if (pendowner == current)
+ return 1;
+
+ spin_lock_irqsave(&pendowner->pi_lock, flags);
+ if (current->prio >= pendowner->prio) {
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ return 0;
+ }
+
+ /*
+ * Check if a waiter is enqueued on the pending owner's
+ * pi_waiters list. Remove it and readjust the pending
+ * owner's priority.
+ */
+ if (likely(!rt_mutex_has_waiters(lock))) {
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ return 1;
+ }
+
+ /* No chain handling, pending owner is not blocked on anything: */
+ next = rt_mutex_top_waiter(lock);
+ plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+ __rt_mutex_adjust_prio(pendowner);
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+ /*
+	 * We are going to steal the lock and a waiter was
+	 * enqueued on the pending owner's pi_waiters queue. So
+	 * we have to enqueue this waiter into the
+	 * current->pi_waiters list. This covers the case
+	 * where current is boosted because it holds another
+	 * lock and gets unboosted because the booster is
+	 * interrupted, so we would delay a waiter with higher
+	 * priority than current->normal_prio.
+ *
+ * Note: in the rare case of a SCHED_OTHER task changing
+ * its priority and thus stealing the lock, next->task
+ * might be current:
+ */
+ if (likely(next->task != current)) {
+ spin_lock_irqsave(&current->pi_lock, flags);
+ plist_add(&next->pi_list_entry, &current->pi_waiters);
+ __rt_mutex_adjust_prio(current);
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+ }
+ return 1;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock)
+{
+ /*
+ * We have to be careful here if the atomic speedups are
+ * enabled, such that, when
+ * - no other waiter is on the lock
+ * - the lock has been released since we did the cmpxchg
+ * the lock can be released or taken while we are doing the
+ * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ *
+ * The atomic acquire/release aware variant of
+ * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+ * the WAITERS bit, the atomic release / acquire can not
+ * happen anymore and lock->wait_lock protects us from the
+ * non-atomic case.
+ *
+	 * Note that this might set lock->owner =
+ * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
+ * any more. This is fixed up when we take the ownership.
+ * This is the transitional state explained at the top of this file.
+ */
+ mark_rt_mutex_waiters(lock);
+
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+ return 0;
+
+ /* We got the lock. */
+ debug_rt_mutex_lock(lock);
+
+ rt_mutex_set_owner(lock, current, 0);
+
+ rt_mutex_deadlock_account_lock(lock, current);
+
+ return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *top_waiter = waiter;
+ unsigned long flags;
+ int chain_walk = 0, res;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+ __rt_mutex_adjust_prio(current);
+ waiter->task = current;
+ waiter->lock = lock;
+ plist_node_init(&waiter->list_entry, current->prio);
+ plist_node_init(&waiter->pi_list_entry, current->prio);
+
+ /* Get the top priority waiter on the lock */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ current->pi_blocked_on = waiter;
+
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ spin_lock_irqsave(&owner->pi_lock, flags);
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+ else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ chain_walk = 1;
+
+ if (!chain_walk)
+ return 0;
+
+ /*
+ * The owner can't disappear while holding a lock,
+ * so the owner struct is protected by wait_lock.
+ * Gets dropped in rt_mutex_adjust_prio_chain()!
+ */
+ get_task_struct(owner);
+
+ spin_unlock(&lock->wait_lock);
+
+ res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+ current);
+
+ spin_lock(&lock->wait_lock);
+
+ return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current task's waiter list and from
+ * the lock waiter list. Set it as pending owner. Then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *waiter;
+ struct task_struct *pendowner;
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+
+ waiter = rt_mutex_top_waiter(lock);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+
+ /*
+ * Remove it from current->pi_waiters. We do not adjust a
+ * possible priority boost right now. We execute wakeup in the
+ * boosted mode and go back to normal after releasing
+ * lock->wait_lock.
+ */
+ plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+ pendowner = waiter->task;
+ waiter->task = NULL;
+
+ rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ /*
+	 * Clear the pi_blocked_on variable and enqueue a possible
+	 * waiter into the pi_waiters list of the pending owner. This
+	 * prevents a situation where, if the pending owner gets unboosted,
+	 * a waiter with higher priority than pending-owner->normal_prio
+	 * stays blocked on the unboosted (pending) owner.
+ */
+ spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+ WARN_ON(!pendowner->pi_blocked_on);
+ WARN_ON(pendowner->pi_blocked_on != waiter);
+ WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+ pendowner->pi_blocked_on = NULL;
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+ }
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+ wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held
+ */
+static void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ int first = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ unsigned long flags;
+ int chain_walk = 0;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->task = NULL;
+ current->pi_blocked_on = NULL;
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ if (first && owner != current) {
+
+ spin_lock_irqsave(&owner->pi_lock, flags);
+
+ plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &owner->pi_waiters);
+ }
+ __rt_mutex_adjust_prio(owner);
+
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+
+ WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+ if (!chain_walk)
+ return;
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
+ spin_unlock(&lock->wait_lock);
+
+ rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+
+ spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || waiter->list_entry.prio == task->prio) {
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+ rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock)) {
+ spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout)) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock))
+ break;
+
+ /*
+ * TASK_INTERRUPTIBLE checks for signals and
+ * timeout. Ignored otherwise.
+ */
+ if (unlikely(state == TASK_INTERRUPTIBLE)) {
+ /* Signal pending? */
+ if (signal_pending(current))
+ ret = -EINTR;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ if (ret)
+ break;
+ }
+
+ /*
+ * waiter.task is NULL the first time we come here and
+ * when we have been woken up by the previous owner
+ * but the lock got stolen by a higher prio task.
+ */
+ if (!waiter.task) {
+ ret = task_blocks_on_rt_mutex(lock, &waiter,
+ detect_deadlock);
+ /*
+ * If we got woken up by the owner then start loop
+ * all over without going into schedule to try
+ * to get the lock now:
+ */
+ if (unlikely(!waiter.task)) {
+ /*
+ * Reset the return value. We might
+ * have returned with -EDEADLK and the
+ * owner released the lock while we
+ * were walking the pi chain.
+ */
+ ret = 0;
+ continue;
+ }
+ if (unlikely(ret))
+ break;
+ }
+
+ spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(&waiter);
+
+ if (waiter.task)
+ schedule_rt_mutex(lock);
+
+ spin_lock(&lock->wait_lock);
+ set_current_state(state);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (unlikely(waiter.task))
+ remove_waiter(lock, &waiter);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /* Remove pending timer: */
+ if (unlikely(timeout))
+ hrtimer_cancel(&timeout->timer);
+
+ /*
+ * Readjust priority, when we did not get the lock. We might
+ * have been the pending owner and boosted. Since we did not
+ * take the lock, the PI boost has to go.
+ */
+ if (unlikely(ret))
+ rt_mutex_adjust_prio(current);
+
+ debug_rt_mutex_free_waiter(&waiter);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = 0;
+
+ spin_lock(&lock->wait_lock);
+
+ if (likely(rt_mutex_owner(lock) != current)) {
+
+ ret = try_to_take_rt_mutex(lock);
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters
+ * bit unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ }
+
+ spin_unlock(&lock->wait_lock);
+
+ return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+ spin_lock(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ spin_unlock(&lock->wait_lock);
+ return;
+ }
+
+ wakeup_next_waiter(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /* Undo pi boosting if necessary: */
+ rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ int detect_deadlock,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, NULL, detect_deadlock);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout, int detect_deadlock,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, timeout, detect_deadlock);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 1;
+ }
+ return slowfn(lock);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
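The fast paths above hinge on rt_mutex_cmpxchg(), which is defined earlier in rtmutex.c (not shown in this excerpt). As a hedged sketch of the idea rather than the exact upstream macro: on architectures with a native cmpxchg it atomically swings lock->owner between NULL and current, and where that is unavailable (or rt-mutex debugging is enabled) it evaluates to 0 so every call drops into the slow path:

/* Illustrative sketch only -- the real definitions live earlier in this file. */
#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
# define rt_mutex_cmpxchg(l, c, n)	(cmpxchg(&(l)->owner, c, n) == c)
#else
# define rt_mutex_cmpxchg(l, c, n)	(0)	/* always take the slow path */
#endif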
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ might_sleep();
+
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
+ *
+ * @lock: the rt_mutex to be locked
+ * @timeout: timeout structure or NULL (no timeout)
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -ETIMEDOUT when the timeout expired
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/***
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+ WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to the unlocked state.
+ *
+ * Initializing a locked rt lock is not allowed.
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ lock->owner = NULL;
+ spin_lock_init(&lock->wait_lock);
+ plist_head_init(&lock->wait_list, &lock->wait_lock);
+
+ debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
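For orientation, here is a minimal usage sketch of the API exported above. The lock and helper names are hypothetical and assume #include <linux/rtmutex.h>; only __rt_mutex_init(), rt_mutex_lock(), rt_mutex_trylock() and rt_mutex_unlock() come from this file:

static struct rt_mutex example_lock;		/* hypothetical lock for illustration */

static void example_setup(void)
{
	/* Initialize to the unlocked state; never call this on a held lock. */
	__rt_mutex_init(&example_lock, "example_lock");
}

static void example_critical_section(void)
{
	rt_mutex_lock(&example_lock);		/* may sleep, PI-boosts the owner */
	/* ... access data protected by example_lock ... */
	rt_mutex_unlock(&example_lock);		/* undoes any PI boost on release */
}

static int example_try(void)
{
	if (!rt_mutex_trylock(&example_lock))	/* 1 on success, 0 on contention */
		return -EBUSY;
	/* ... */
	rt_mutex_unlock(&example_lock);
	return 0;
}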
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner: the task to set as owner
+ *
+ * No locking. The caller has to do the serializing itself.
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ __rt_mutex_init(lock, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
+ rt_mutex_set_owner(lock, proxy_owner, 0);
+ rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be released
+ *
+ * No locking. The caller has to do the serializing itself.
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL, 0);
+ rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock to query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ return NULL;
+
+ return rt_mutex_top_waiter(lock)->task;
+}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 0000000..a1a1dd0
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,26 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define rt_mutex_deadlock_check(l) (0)
+#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+#define debug_rt_mutex_init_waiter(w) do { } while (0)
+#define debug_rt_mutex_free_waiter(w) do { } while (0)
+#define debug_rt_mutex_lock(l) do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
+#define debug_rt_mutex_unlock(l) do { } while (0)
+#define debug_rt_mutex_init(m, n) do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
+#define debug_rt_mutex_print_deadlock(w) do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d) (d)
+#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 0000000..e124bf5
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,130 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * The in-kernel rtmutex tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock) \
+ do { \
+ if (!(current->flags & PF_MUTEX_TESTER)) \
+ schedule(); \
+ else \
+ schedule_rt_mutex_test(_lock); \
+ } while (0)
+
+#else
+# define schedule_rt_mutex(_lock) schedule()
+#endif
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack of the blocked task.
+ *
+ * @list_entry: pi node to enqueue into the mutex waiters list
+ * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
+ * @task: task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+ struct plist_node list_entry;
+ struct plist_node pi_list_entry;
+ struct task_struct *task;
+ struct rt_mutex *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ unsigned long ip;
+ struct pid *deadlock_task_pid;
+ struct rt_mutex *deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *w;
+
+ w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+ list_entry);
+ BUG_ON(w->lock != lock);
+
+ return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+ pi_list_entry);
+}
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_OWNER_PENDING 1UL
+#define RT_MUTEX_HAS_WAITERS 2UL
+#define RT_MUTEX_OWNER_MASKALL 3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
+{
+ return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
+}
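Since task_struct pointers are word-aligned, the two low bits of lock->owner are free to carry the PENDING and HAS_WAITERS flags. A hedged illustration of how the helpers above decode the owner word (the demo function below is hypothetical, not part of the patch):

static inline void rt_mutex_owner_bits_demo(struct rt_mutex *lock,
					    struct task_struct *task)
{
	/* Hypothetical: mark @task as pending owner of a contended @lock. */
	lock->owner = (struct task_struct *)
		((unsigned long)task | RT_MUTEX_HAS_WAITERS | RT_MUTEX_OWNER_PENDING);

	BUG_ON(rt_mutex_owner(lock) != task);	/* both flag bits masked off */
	BUG_ON(!rt_mutex_owner_pending(lock));	/* woken as next owner, not taken yet */
}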
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
new file mode 100644
index 0000000..cae050b
--- /dev/null
+++ b/kernel/rwsem.c
@@ -0,0 +1,148 @@
+/* kernel/rwsem.c: R/W semaphores, public implementation
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from asm-i386/semaphore.h
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/rwsem.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+
+/*
+ * lock for reading
+ */
+void __sched down_read(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+}
+
+EXPORT_SYMBOL(down_read);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_read_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_read_trylock);
+
+/*
+ * lock for writing
+ */
+void __sched down_write(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(down_write);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_write_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * release a read lock
+ */
+void up_read(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read);
+
+/*
+ * release a write lock
+ */
+void up_write(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_write(sem);
+}
+
+EXPORT_SYMBOL(up_write);
+
+/*
+ * downgrade write lock to read lock
+ */
+void downgrade_write(struct rw_semaphore *sem)
+{
+ /*
+ * lockdep: a downgraded write will live on as a write
+ * dependency.
+ */
+ __downgrade_write(sem);
+}
+
+EXPORT_SYMBOL(downgrade_write);
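A brief usage sketch of the reader/writer API above. The semaphore, data and function names are hypothetical; DECLARE_RWSEM() comes from <linux/rwsem.h>:

static DECLARE_RWSEM(example_sem);	/* hypothetical semaphore for illustration */
static int example_value;

static int example_read_value(void)
{
	int v;

	down_read(&example_sem);	/* many concurrent readers are allowed */
	v = example_value;
	up_read(&example_sem);
	return v;
}

static void example_write_value(int v)
{
	down_write(&example_sem);	/* exclusive against readers and writers */
	example_value = v;
	downgrade_write(&example_sem);	/* keep reading without a writer gap */
	/* ... read-only use of example_value ... */
	up_read(&example_sem);		/* a downgraded write ends with up_read() */
}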
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void down_read_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+}
+
+EXPORT_SYMBOL(down_read_nested);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
+void down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(down_write_nested);
+
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
+#endif
+
+
diff --git a/kernel/sched.c b/kernel/sched.c
new file mode 100644
index 0000000..db66874
--- /dev/null
+++ b/kernel/sched.c
@@ -0,0 +1,9390 @@
+/*
+ * kernel/sched.c
+ *
+ * Kernel scheduler and related syscalls
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ *
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
+ * hybrid priority-list and round-robin design with
+ * an array-switch method of distributing timeslices
+ * and per-CPU runqueues. Cleanups and useful suggestions
+ * by Davide Libenzi, preemptible kernel bits by Robert Love.
+ * 2003-09-03 Interactivity tuning by Con Kolivas.
+ * 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ * Thomas Gleixner, Mike Kravetz
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/pid_namespace.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/reciprocal_div.h>
+#include <linux/unistd.h>
+#include <linux/pagemap.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/ctype.h>
+#include <linux/ftrace.h>
+#include <trace/sched.h>
+
+#include <asm/tlb.h>
+#include <asm/irq_regs.h>
+
+#include "sched_cpupri.h"
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters;
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
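A worked example of these conversions, assuming the usual constants from sched.h (MAX_RT_PRIO == 100, MAX_PRIO == 140); the demo function below is hypothetical and only documents the arithmetic via compile-time checks:

static inline void nice_prio_mapping_demo(void)
{
	BUILD_BUG_ON(NICE_TO_PRIO(-20) != 100);	/* strongest nice level      */
	BUILD_BUG_ON(NICE_TO_PRIO(0)   != 120);	/* nice 0 -> static prio 120 */
	BUILD_BUG_ON(NICE_TO_PRIO(19)  != 139);	/* weakest nice level        */
	BUILD_BUG_ON(PRIO_TO_NICE(120) != 0);
	BUILD_BUG_ON(USER_PRIO(120)    != 20);	/* user-prio range is 0..39  */
	BUILD_BUG_ON(MAX_USER_PRIO     != 40);
}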
+
+/*
+ * Helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define DEF_TIMESLICE (100 * HZ / 1000)
+
+/*
+ * single value that denotes runtime == period, i.e. unlimited time.
+ */
+#define RUNTIME_INF ((u64)~0ULL)
+
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
+static inline int rt_policy(int policy)
+{
+ if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+ return 1;
+ return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+ return rt_policy(p->policy);
+}
+
+/*
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ struct list_head queue[MAX_RT_PRIO];
+};
+
+struct rt_bandwidth {
+ /* nests inside the rq lock: */
+ spinlock_t rt_runtime_lock;
+ ktime_t rt_period;
+ u64 rt_runtime;
+ struct hrtimer rt_period_timer;
+};
+
+static struct rt_bandwidth def_rt_bandwidth;
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+ struct rt_bandwidth *rt_b =
+ container_of(timer, struct rt_bandwidth, rt_period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_rt_period_timer(rt_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
+{
+ rt_b->rt_period = ns_to_ktime(period);
+ rt_b->rt_runtime = runtime;
+
+ spin_lock_init(&rt_b->rt_runtime_lock);
+
+ hrtimer_init(&rt_b->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.function = sched_rt_period_timer;
+ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ ktime_t now;
+
+ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ if (hrtimer_active(&rt_b->rt_period_timer))
+ return;
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ for (;;) {
+ if (hrtimer_active(&rt_b->rt_period_timer))
+ break;
+
+ now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
+ hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
+ }
+ spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ hrtimer_cancel(&rt_b->rt_period_timer);
+}
+#endif
+
+/*
+ * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
+#ifdef CONFIG_GROUP_SCHED
+
+#include <linux/cgroup.h>
+
+struct cfs_rq;
+
+static LIST_HEAD(task_groups);
+
+/* task group related information */
+struct task_group {
+#ifdef CONFIG_CGROUP_SCHED
+ struct cgroup_subsys_state css;
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
+
+ struct rt_bandwidth rt_bandwidth;
+#endif
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+};
+
+#ifdef CONFIG_USER_SCHED
+
+/*
+ * Root task group.
+ * Every UID task group (including init_task_group aka UID-0) will
+ * be a child of this group.
+ */
+struct task_group root_task_group;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+#endif /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
+#define root_task_group init_task_group
+#endif /* CONFIG_USER_SCHED */
+
+/* task_group_lock serializes add/remove of task groups and also changes to
+ * a task group's cpu shares.
+ */
+static DEFINE_SPINLOCK(task_group_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_USER_SCHED
+# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
+#else /* !CONFIG_USER_SCHED */
+# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+#endif /* CONFIG_USER_SCHED */
+
+/*
+ * A weight of 0 or 1 can cause arithmetic problems.
+ * The weight of a cfs_rq is the sum of the weights of the entities
+ * queued on it, so the weight of an entity should not be too large,
+ * and neither should the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+#define MIN_SHARES 2
+#define MAX_SHARES (1UL << 18)
+
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+#endif
+
+/* Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
+struct task_group init_task_group;
+
+/* return group to which a task belongs */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct task_group *tg;
+
+#ifdef CONFIG_USER_SCHED
+ tg = p->user->tg;
+#elif defined(CONFIG_CGROUP_SCHED)
+ tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
+ struct task_group, css);
+#else
+ tg = &init_task_group;
+#endif
+ return tg;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+ p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ p->rt.rt_rq = task_group(p)->rt_rq[cpu];
+ p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_GROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+ struct load_weight load;
+ unsigned long nr_running;
+
+ u64 exec_clock;
+ u64 min_vruntime;
+
+ struct rb_root tasks_timeline;
+ struct rb_node *rb_leftmost;
+
+ struct list_head tasks;
+ struct list_head *balance_iterator;
+
+ /*
+	 * 'curr' points to the currently running entity on this cfs_rq.
+	 * It is set to NULL otherwise (i.e. when none is currently running).
+ */
+ struct sched_entity *curr, *next, *last;
+
+ unsigned int nr_spread_over;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+
+ /*
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
+ *
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
+ */
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+ /*
+ * the part of load.weight contributed by tasks
+ */
+ unsigned long task_weight;
+
+ /*
+ * h_load = weight * f(tg)
+ *
+ * Where f(tg) is the recursive weight fraction assigned to
+ * this group.
+ */
+ unsigned long h_load;
+
+ /*
+ * this cpu's part of tg->shares
+ */
+ unsigned long shares;
+
+ /*
+ * load.weight at the time we set shares
+ */
+ unsigned long rq_weight;
+#endif
+#endif
+};
+
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+ struct rt_prio_array active;
+ unsigned long rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ int highest_prio; /* highest queued rt task prio */
+#endif
+#ifdef CONFIG_SMP
+ unsigned long rt_nr_migratory;
+ int overloaded;
+#endif
+ int rt_throttled;
+ u64 rt_time;
+ u64 rt_runtime;
+ /* Nests inside the rq lock: */
+ spinlock_t rt_runtime_lock;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
+ struct rq *rq;
+ struct list_head leaf_rt_rq_list;
+ struct task_group *tg;
+ struct sched_rt_entity *rt_se;
+#endif
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ cpumask_t span;
+ cpumask_t online;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_t rto_mask;
+ atomic_t rto_count;
+#ifdef CONFIG_SMP
+ struct cpupri cpupri;
+#endif
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: in places that lock multiple runqueues (such as
+ * the load balancing or the thread migration code), the lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct rq {
+ /* runqueue lock: */
+ spinlock_t lock;
+
+ /*
+ * nr_running and cpu_load should be in the same cacheline because
+ * remote CPUs use both these fields when doing load calculation.
+ */
+ unsigned long nr_running;
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+ unsigned long last_tick_seen;
+ unsigned char in_nohz_recently;
+#endif
+ /* capture load from *all* tasks on this cpu: */
+ struct load_weight load;
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* list of leaf cfs_rq on this cpu: */
+ struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct list_head leaf_rt_rq_list;
+#endif
+
+ /*
+ * This is part of a global counter where only the total sum
+ * over all CPUs matters. A task can increase this counter on
+ * one CPU and if it got migrated afterwards it may decrease
+ * it on another CPU. Always updated under the runqueue lock:
+ */
+ unsigned long nr_uninterruptible;
+
+ struct task_struct *curr, *idle;
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
+
+ u64 clock;
+
+ atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+ struct root_domain *rd;
+ struct sched_domain *sd;
+
+ /* For active balancing */
+ int active_balance;
+ int push_cpu;
+ /* cpu of this runqueue: */
+ int cpu;
+ int online;
+
+ unsigned long avg_load_per_task;
+
+ struct task_struct *migration_thread;
+ struct list_head migration_queue;
+#endif
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+ int hrtick_csd_pending;
+ struct call_single_data hrtick_csd;
+#endif
+ struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+ /* latency stats */
+ struct sched_info rq_sched_info;
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_exp_empty;
+ unsigned int yld_act_empty;
+ unsigned int yld_both_empty;
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_switch;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+
+ /* BKL stats */
+ unsigned int bkl_count;
+#endif
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
+{
+ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
+}
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq->cpu;
+#else
+ return 0;
+#endif
+}
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() (&__get_cpu_var(runqueues))
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+
+static inline void update_rq_clock(struct rq *rq)
+{
+ rq->clock = sched_clock_cpu(cpu_of(rq));
+}
+
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# define const_debug __read_mostly
+#else
+# define const_debug static const
+#endif
+
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+ int cpu = get_cpu();
+ struct rq *rq = cpu_rq(cpu);
+ int ret;
+
+ ret = spin_is_locked(&rq->lock);
+ put_cpu();
+ return ret;
+}
+
+/*
+ * Debugging: various feature bits
+ */
+
+#define SCHED_FEAT(name, enabled) \
+ __SCHED_FEAT_##name ,
+
+enum {
+#include "sched_features.h"
+};
+
+#undef SCHED_FEAT
+
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+
+const_debug unsigned int sysctl_sched_features =
+#include "sched_features.h"
+ 0;
+
+#undef SCHED_FEAT
+
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_FEAT(name, enabled) \
+ #name ,
+
+static __read_mostly char *sched_feat_names[] = {
+#include "sched_features.h"
+ NULL
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+static ssize_t
+sched_feat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *buf;
+ int r = 0;
+ int len = 0;
+ int i;
+
+ for (i = 0; sched_feat_names[i]; i++) {
+ len += strlen(sched_feat_names[i]);
+ len += 4;
+ }
+
+ buf = kmalloc(len + 2, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ for (i = 0; sched_feat_names[i]; i++) {
+ if (sysctl_sched_features & (1UL << i))
+ r += sprintf(buf + r, "%s ", sched_feat_names[i]);
+ else
+ r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+ }
+
+ r += sprintf(buf + r, "\n");
+ WARN_ON(r >= len + 2);
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+ kfree(buf);
+
+ return r;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp = buf;
+ int neg = 0;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (strncmp(buf, "NO_", 3) == 0) {
+ neg = 1;
+ cmp += 3;
+ }
+
+ for (i = 0; sched_feat_names[i]; i++) {
+ int len = strlen(sched_feat_names[i]);
+
+ if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+ if (neg)
+ sysctl_sched_features &= ~(1UL << i);
+ else
+ sysctl_sched_features |= (1UL << i);
+ break;
+ }
+ }
+
+ if (!sched_feat_names[i])
+ return -EINVAL;
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static struct file_operations sched_feat_fops = {
+ .open = sched_feat_open,
+ .read = sched_feat_read,
+ .write = sched_feat_write,
+};
+
+static __init int sched_init_debug(void)
+{
+ debugfs_create_file("sched_features", 0644, NULL, NULL,
+ &sched_feat_fops);
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+
+#endif
+
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
+ * ratelimit for updating the group shares.
+ * default: 0.25ms
+ */
+unsigned int sysctl_sched_shares_ratelimit = 250000;
+
+/*
+ * Inject some fuzziness into changing the per-cpu group shares;
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
+ * period over which we measure -rt task cpu usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+static __read_mostly int scheduler_running;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+static inline u64 global_rt_period(void)
+{
+ return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_rt_runtime(void)
+{
+ if (sysctl_sched_rt_runtime < 0)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+}
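Putting the defaults above together, a short worked example (numbers only, an illustration rather than code from the patch):

/*
 * global_rt_period()  == 1000000 us * NSEC_PER_USEC == 1,000,000,000 ns (1 s)
 * global_rt_runtime() ==  950000 us * NSEC_PER_USEC ==   950,000,000 ns
 *
 * so realtime tasks may consume at most 950 ms of every 1 s period (95%),
 * leaving ~5% for SCHED_NORMAL tasks; writing -1 to sched_rt_runtime_us
 * turns the runtime into RUNTIME_INF and disables the throttling.
 */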
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+ return task_current(rq, p);
+}
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+ spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->oncpu;
+#else
+ return task_current(rq, p);
+#endif
+}
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ spin_unlock_irq(&rq->lock);
+#else
+ spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called with interrupts disabled.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ for (;;) {
+ struct rq *rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
+ spin_unlock(&rq->lock);
+ }
+}
+
+/*
+ * task_rq_lock - lock the runqueue a given task resides on and disable
+ * interrupts. Note the ordering: we can safely lookup the task_rq without
+ * explicitly disabling preemption.
+ */
+static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ local_irq_save(*flags);
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
+ spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+}
+
+void task_rq_unlock_wait(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ spin_unlock_wait(&rq->lock);
+}
+
+static void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ spin_unlock(&rq->lock);
+}
+
+static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+ __releases(rq->lock)
+{
+ spin_unlock_irqrestore(&rq->lock, *flags);
+}
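A hedged sketch of the intended calling pattern for the helpers above (the function below is hypothetical; real callers in this file follow the same shape):

static void example_inspect_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	/*
	 * task_rq_lock() re-checks task_rq(p) after taking rq->lock, so the
	 * runqueue held here is guaranteed to still be p's runqueue.
	 */
	rq = task_rq_lock(p, &flags);

	/* ... read or modify per-runqueue state of p under rq->lock ... */

	task_rq_unlock(rq, &flags);
}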
+
+/*
+ * this_rq_lock - lock this runqueue and disable interrupts.
+ */
+static struct rq *this_rq_lock(void)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ spin_lock(&rq->lock);
+
+ return rq;
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * It's all a bit involved since we cannot program an hrtimer while holding the
+ * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+
+/*
+ * Use hrtick when:
+ * - enabled by features
+ * - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ if (!cpu_active(cpu_of(rq)))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+static void hrtick_clear(struct rq *rq)
+{
+ if (hrtimer_active(&rq->hrtick_timer))
+ hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+ spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
+{
+ struct rq *rq = arg;
+
+ spin_lock(&rq->lock);
+ hrtimer_restart(&rq->hrtick_timer);
+ rq->hrtick_csd_pending = 0;
+ spin_unlock(&rq->lock);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+
+ hrtimer_set_expires(timer, time);
+
+ if (rq == this_rq()) {
+ hrtimer_restart(timer);
+ } else if (!rq->hrtick_csd_pending) {
+ __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+ rq->hrtick_csd_pending = 1;
+ }
+}
+
+static int
+hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (int)(long)hcpu;
+
+ switch (action) {
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ hrtick_clear(cpu_rq(cpu));
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static __init void init_hrtick(void)
+{
+ hotcpu_notifier(hotplug_hrtick, 0);
+}
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
+
+static inline void init_hrtick(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void init_rq_hrtick(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ rq->hrtick_csd_pending = 0;
+
+ rq->hrtick_csd.flags = 0;
+ rq->hrtick_csd.func = __hrtick_start;
+ rq->hrtick_csd.info = rq;
+#endif
+
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrtick_timer.function = hrtick;
+ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
+}
+#else /* CONFIG_SCHED_HRTICK */
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+
+static inline void init_hrtick(void)
+{
+}
+#endif /* CONFIG_SCHED_HRTICK */
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
+{
+ int cpu;
+
+ assert_spin_locked(&task_rq(p)->lock);
+
+ if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ return;
+
+ set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(p))
+ smp_send_reschedule(cpu);
+}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (cpu == smp_processor_id())
+ return;
+
+ /*
+ * This is safe, as this function is called with the timer
+ * wheel base lock of (cpu) held. When the CPU is on the way
+ * to idle and has not yet set rq->curr to idle then it will
+ * be serialized on the timer wheel base lock and take the new
+ * timer into account automatically.
+ */
+ if (rq->curr != rq->idle)
+ return;
+
+ /*
+	 * We can set TIF_NEED_RESCHED on the idle task of the other CPU
+ * lockless. The worst case is that the other CPU runs the
+ * idle task through an additional NOOP schedule()
+ */
+ set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(rq->idle))
+ smp_send_reschedule(cpu);
+}
+#endif /* CONFIG_NO_HZ */
+
+#else /* !CONFIG_SMP */
+static void resched_task(struct task_struct *p)
+{
+ assert_spin_locked(&task_rq(p)->lock);
+ set_tsk_need_resched(p);
+}
+#endif /* CONFIG_SMP */
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST (~0UL)
+#else
+# define WMULT_CONST (1UL << 32)
+#endif
+
+#define WMULT_SHIFT 32
+
+/*
+ * Shift right and round:
+ */
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
+/*
+ * delta *= weight / lw
+ */
+static unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+ struct load_weight *lw)
+{
+ u64 tmp;
+
+ if (!lw->inv_weight) {
+ if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else
+ lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+ / (lw->weight+1);
+ }
+
+ tmp = (u64)delta_exec * weight;
+ /*
+ * Check whether we'd overflow the 64-bit multiplication:
+ */
+ if (unlikely(tmp > WMULT_CONST))
+ tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+ WMULT_SHIFT/2);
+ else
+ tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+}
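A worked example of the scaling above, assuming a nice-0 task (weight 1024, see prio_to_weight[] further down) running on a queue whose total load weight is 2048:

/*
 * delta_exec = 1,000,000 ns, weight = 1024, lw->weight = 2048:
 *
 *   lw->inv_weight ~= 2^32 / 2048, so
 *   result ~= 1,000,000 * 1024 * (2^32 / 2048) / 2^32
 *          == 1,000,000 * 1024 / 2048 == 500,000 ns
 *
 * i.e. the task is credited with roughly half of the elapsed time; the
 * cached inverse trades a division for a multiply at the cost of a tiny
 * (sub-0.1%) rounding error.
 */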
+
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs, the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+#define WEIGHT_IDLEPRIO 2
+#define WMULT_IDLEPRIO (1 << 31)
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage; if you go down 1 level
+ * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+static const int prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
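+
+/*
+ * For example, prio_to_weight[20] / prio_to_weight[21] == 1024 / 820,
+ * i.e. ~1.25 per nice level: a nice-0 and a nice-1 task sharing one
+ * CPU get ~55.5% and ~44.5% of it respectively.
+ */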
+
+/*
+ * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetic by turning divisions
+ * into multiplications:
+ */
+static const u32 prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
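+
+/*
+ * For example, prio_to_wmult[20] == 4194304 == 2^32 / 1024, so dividing
+ * by the nice-0 weight becomes a multiply by 4194304 followed by a
+ * 32-bit shift in calc_delta_mine().
+ */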
+
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+
+/*
+ * runqueue iterator, to support SMP load-balancing between different
+ * scheduling classes, without having to expose their internal data
+ * structures to the load-balancing proper:
+ */
+struct rq_iterator {
+ void *arg;
+ struct task_struct *(*start)(void *);
+ struct task_struct *(*next)(void *);
+};
+
+#ifdef CONFIG_SMP
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator);
+
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator);
+#endif
+
+#ifdef CONFIG_CGROUP_CPUACCT
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+#endif
+
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+ update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+ update_load_sub(&rq->load, load);
+}
+
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+ struct task_group *parent, *child;
+ int ret;
+
+ rcu_read_lock();
+ parent = &root_task_group;
+down:
+ ret = (*down)(parent, data);
+ if (ret)
+ goto out_unlock;
+ list_for_each_entry_rcu(child, &parent->children, siblings) {
+ parent = child;
+ goto down;
+
+up:
+ continue;
+ }
+ ret = (*up)(parent, data);
+ if (ret)
+ goto out_unlock;
+
+ child = parent;
+ parent = parent->parent;
+ if (parent)
+ goto up;
+out_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+
+ if (nr_running)
+ rq->avg_load_per_task = rq->load.weight / nr_running;
+ else
+ rq->avg_load_per_task = 0;
+
+ return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares, unsigned long sd_rq_weight)
+{
+ int boost = 0;
+ unsigned long shares;
+ unsigned long rq_weight;
+
+ if (!tg->se[cpu])
+ return;
+
+ rq_weight = tg->cfs_rq[cpu]->load.weight;
+
+ /*
+ * If there are currently no tasks on the cpu, pretend there is one of
+ * average load so that when a new task gets to run here it will not
+ * get delayed by group starvation.
+ */
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
+
+ if (unlikely(rq_weight > sd_rq_weight))
+ rq_weight = sd_rq_weight;
+
+ /*
+ * \Sum shares * rq_weight
+ * shares = -----------------------
+ * \Sum rq_weight
+ *
+ */
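+ /*
+ * For example, with sd_shares == 1024, rq_weight == 512 and
+ * sd_rq_weight == 2048, this cpu carries 1/4 of the group's load
+ * and gets shares == 1024 * 512 / 2049 == 255, i.e. roughly 1/4
+ * of the group's shares.
+ */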
+ shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
+
+ if (abs(shares - tg->se[cpu]->load.weight) >
+ sysctl_sched_shares_thresh) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ /*
+ * Record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ tg->cfs_rq[cpu]->rq_weight = rq_weight;
+
+ __set_se_shares(tg->se[cpu], shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+/*
+ * Re-compute each task group's per-cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
+ */
+static int tg_shares_up(struct task_group *tg, void *data)
+{
+ unsigned long rq_weight = 0;
+ unsigned long shares = 0;
+ struct sched_domain *sd = data;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ rq_weight += tg->cfs_rq[i]->load.weight;
+ shares += tg->cfs_rq[i]->shares;
+ }
+
+ if ((!shares && rq_weight) || shares > tg->shares)
+ shares = tg->shares;
+
+ if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+ shares = tg->shares;
+
+ if (!rq_weight)
+ rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+
+ for_each_cpu_mask(i, sd->span)
+ update_group_shares_cpu(tg, i, shares, rq_weight);
+
+ return 0;
+}
+
+/*
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parent's load.
+ */
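+/*
+ * For example, if the parent's h_load is 2048, a group holds
+ * shares == 512 on the cpu and the parent's cfs_rq weight is 1024,
+ * the group's h_load becomes 2048 * 512 / 1025 == 1023, i.e. roughly
+ * half of the parent's hierarchical load.
+ */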
+static int tg_load_down(struct task_group *tg, void *data)
+{
+ unsigned long load;
+ long cpu = (long)data;
+
+ if (!tg->parent) {
+ load = cpu_rq(cpu)->load.weight;
+ } else {
+ load = tg->parent->cfs_rq[cpu]->h_load;
+ load *= tg->cfs_rq[cpu]->shares;
+ load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+ }
+
+ tg->cfs_rq[cpu]->h_load = load;
+
+ return 0;
+}
+
+static void update_shares(struct sched_domain *sd)
+{
+ u64 now = cpu_clock(raw_smp_processor_id());
+ s64 elapsed = now - sd->last_update;
+
+ if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+ sd->last_update = now;
+ walk_tg_tree(tg_nop, tg_shares_up, sd);
+ }
+}
+
+static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+ spin_unlock(&rq->lock);
+ update_shares(sd);
+ spin_lock(&rq->lock);
+}
+
+static void update_h_load(long cpu)
+{
+ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+}
+
+#else
+
+static inline void update_shares(struct sched_domain *sd)
+{
+}
+
+static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+}
+
+#endif
+
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+#ifdef CONFIG_SMP
+ cfs_rq->shares = shares;
+#endif
+}
+#endif
+
+#include "sched_stats.h"
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
+
+#define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
+
+static void inc_nr_running(struct rq *rq)
+{
+ rq->nr_running++;
+}
+
+static void dec_nr_running(struct rq *rq)
+{
+ rq->nr_running--;
+}
+
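+/*
+ * For example, a nice-0 SCHED_NORMAL task gets load.weight == 1024,
+ * a SCHED_IDLE task gets WEIGHT_IDLEPRIO == 2, and RT tasks are given
+ * twice the highest SCHED_NORMAL weight (2 * 88761).
+ */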
+static void set_load_weight(struct task_struct *p)
+{
+ if (task_has_rt_policy(p)) {
+ p->se.load.weight = prio_to_weight[0] * 2;
+ p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ return;
+ }
+
+ /*
+ * SCHED_IDLE tasks get minimal weight:
+ */
+ if (p->policy == SCHED_IDLE) {
+ p->se.load.weight = WEIGHT_IDLEPRIO;
+ p->se.load.inv_weight = WMULT_IDLEPRIO;
+ return;
+ }
+
+ p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
+ p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+}
+
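+/*
+ * update_avg() moves *avg 1/8th of the way towards each new sample,
+ * i.e. a simple exponentially weighted average. For example, starting
+ * from avg == 0, two samples of 800 give avg == 100 and then avg == 187.
+ */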
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ sched_info_queued(p);
+ p->sched_class->enqueue_task(rq, p, wakeup);
+ p->se.on_rq = 1;
+}
+
+static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+ if (sleep && p->se.last_wakeup) {
+ update_avg(&p->se.avg_overlap,
+ p->se.sum_exec_runtime - p->se.last_wakeup);
+ p->se.last_wakeup = 0;
+ }
+
+ sched_info_dequeued(p);
+ p->sched_class->dequeue_task(rq, p, sleep);
+ p->se.on_rq = 0;
+}
+
+/*
+ * __normal_prio - return the priority that is based on the static prio
+ */
+static inline int __normal_prio(struct task_struct *p)
+{
+ return p->static_prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ int prio;
+
+ if (task_has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible--;
+
+ enqueue_task(rq, p, wakeup);
+ inc_nr_running(rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible++;
+
+ dequeue_task(rq, p, sleep);
+ dec_nr_running(rq);
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+ /*
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfully executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+ task_thread_info(p)->cpu = cpu;
+#endif
+}
+
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio, int running)
+{
+ if (prev_class != p->sched_class) {
+ if (prev_class->switched_from)
+ prev_class->switched_from(rq, p, running);
+ p->sched_class->switched_to(rq, p, running);
+ } else
+ p->sched_class->prio_changed(rq, p, oldprio, running);
+}
+
+#ifdef CONFIG_SMP
+
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Is this task likely cache-hot:
+ */
+static int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+ s64 delta;
+
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
+ return 1;
+
+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
+
+ delta = now - p->se.exec_start;
+
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+ int old_cpu = task_cpu(p);
+ struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
+ struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+ *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
+ u64 clock_offset;
+
+ clock_offset = old_rq->clock - new_rq->clock;
+
+#ifdef CONFIG_SCHEDSTATS
+ if (p->se.wait_start)
+ p->se.wait_start -= clock_offset;
+ if (p->se.sleep_start)
+ p->se.sleep_start -= clock_offset;
+ if (p->se.block_start)
+ p->se.block_start -= clock_offset;
+ if (old_cpu != new_cpu) {
+ schedstat_inc(p, se.nr_migrations);
+ if (task_hot(p, old_rq->clock, NULL))
+ schedstat_inc(p, se.nr_forced2_migrations);
+ }
+#endif
+ p->se.vruntime -= old_cfsrq->min_vruntime -
+ new_cfsrq->min_vruntime;
+
+ __set_task_cpu(p, new_cpu);
+}
+
+struct migration_req {
+ struct list_head list;
+
+ struct task_struct *task;
+ int dest_cpu;
+
+ struct completion done;
+};
+
+/*
+ * The task's runqueue lock must be held.
+ * Returns true if you have to wait for migration thread.
+ */
+static int
+migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
+{
+ struct rq *rq = task_rq(p);
+
+ /*
+ * If the task is not on a runqueue (and not running), then
+ * it is sufficient to simply update the task's cpu field.
+ */
+ if (!p->se.on_rq && !task_running(rq, p)) {
+ set_task_cpu(p, dest_cpu);
+ return 0;
+ }
+
+ init_completion(&req->done);
+ req->task = p;
+ req->dest_cpu = dest_cpu;
+ list_add(&req->list, &rq->migration_queue);
+
+ return 1;
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
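+/*
+ * A hypothetical caller-side sketch (e.g. a tracer waiting for a
+ * TASK_TRACED child):
+ *
+ *	ncsw = wait_task_inactive(child, TASK_TRACED);
+ *	if (!ncsw)
+ *		... child changed state, retry or bail out ...
+ *	else if (wait_task_inactive(child, TASK_TRACED) == ncsw)
+ *		... child stayed off its CPU the whole time ...
+ */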
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+ unsigned long flags;
+ int running, on_rq;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, we can't even be
+ * sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_running()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_running(rq, p)) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &flags);
+ trace_sched_wait_task(rq, p);
+ running = task_running(rq, p);
+ on_rq = p->se.on_rq;
+ ncsw = 0;
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ task_rq_unlock(rq, &flags);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(on_rq)) {
+ schedule_timeout_uninterruptible(1);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
+ smp_send_reschedule(cpu);
+ preempt_enable();
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return max(rq->cpu_load[type-1], total);
+}
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
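+/*
+ * A group is only picked if it is sufficiently less loaded than the
+ * local group: with a typical imbalance_pct of 125 the threshold below
+ * works out to 112, i.e. the local group must carry at least ~12% more
+ * load than the candidate before we migrate towards it.
+ */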
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+ struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
+ int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+ do {
+ unsigned long load, avg_load;
+ int local_group;
+ int i;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ continue;
+
+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu_mask_nr(i, group->cpumask) {
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = source_load(i, load_idx);
+ else
+ load = target_load(i, load_idx);
+
+ avg_load += load;
+ }
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ } else if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+ } while (group = group->next, group != sd->groups);
+
+ if (!idlest || 100*this_load < imbalance*min_load)
+ return NULL;
+ return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
+ cpumask_t *tmp)
+{
+ unsigned long load, min_load = ULONG_MAX;
+ int idlest = -1;
+ int i;
+
+ /* Traverse only the allowed CPUs */
+ cpus_and(*tmp, group->cpumask, p->cpus_allowed);
+
+ for_each_cpu_mask_nr(i, *tmp) {
+ load = weighted_cpuload(i);
+
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ idlest = i;
+ }
+ }
+
+ return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, i.e. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+ struct task_struct *t = current;
+ struct sched_domain *tmp, *sd = NULL;
+
+ for_each_domain(cpu, tmp) {
+ /*
+ * If power savings logic is enabled for a domain, stop there.
+ */
+ if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+ break;
+ if (tmp->flags & flag)
+ sd = tmp;
+ }
+
+ if (sd)
+ update_shares(sd);
+
+ while (sd) {
+ cpumask_t span, tmpmask;
+ struct sched_group *group;
+ int new_cpu, weight;
+
+ if (!(sd->flags & flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ span = sd->span;
+ group = find_idlest_group(sd, t, cpu);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ sd = NULL;
+ weight = cpus_weight(span);
+ for_each_domain(cpu, tmp) {
+ if (weight <= cpus_weight(tmp->span))
+ break;
+ if (tmp->flags & flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return cpu;
+}
+
+#endif /* CONFIG_SMP */
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the to-be-woken-up thread
+ * @state: the mask of task states that can be woken
+ * @sync: do a synchronous wakeup?
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Returns failure only if the task is already active.
+ */
+static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+{
+ int cpu, orig_cpu, this_cpu, success = 0;
+ unsigned long flags;
+ long old_state;
+ struct rq *rq;
+
+ if (!sched_feat(SYNC_WAKEUPS))
+ sync = 0;
+
+#ifdef CONFIG_SMP
+ if (sched_feat(LB_WAKEUP_UPDATE)) {
+ struct sched_domain *sd;
+
+ this_cpu = raw_smp_processor_id();
+ cpu = task_cpu(p);
+
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ update_shares(sd);
+ break;
+ }
+ }
+ }
+#endif
+
+ smp_wmb();
+ rq = task_rq_lock(p, &flags);
+ old_state = p->state;
+ if (!(old_state & state))
+ goto out;
+
+ if (p->se.on_rq)
+ goto out_running;
+
+ cpu = task_cpu(p);
+ orig_cpu = cpu;
+ this_cpu = smp_processor_id();
+
+#ifdef CONFIG_SMP
+ if (unlikely(task_running(rq, p)))
+ goto out_activate;
+
+ cpu = p->sched_class->select_task_rq(p, sync);
+ if (cpu != orig_cpu) {
+ set_task_cpu(p, cpu);
+ task_rq_unlock(rq, &flags);
+ /* might preempt at this point */
+ rq = task_rq_lock(p, &flags);
+ old_state = p->state;
+ if (!(old_state & state))
+ goto out;
+ if (p->se.on_rq)
+ goto out_running;
+
+ this_cpu = smp_processor_id();
+ cpu = task_cpu(p);
+ }
+
+#ifdef CONFIG_SCHEDSTATS
+ schedstat_inc(rq, ttwu_count);
+ if (cpu == this_cpu)
+ schedstat_inc(rq, ttwu_local);
+ else {
+ struct sched_domain *sd;
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif /* CONFIG_SCHEDSTATS */
+
+out_activate:
+#endif /* CONFIG_SMP */
+ schedstat_inc(p, se.nr_wakeups);
+ if (sync)
+ schedstat_inc(p, se.nr_wakeups_sync);
+ if (orig_cpu != cpu)
+ schedstat_inc(p, se.nr_wakeups_migrate);
+ if (cpu == this_cpu)
+ schedstat_inc(p, se.nr_wakeups_local);
+ else
+ schedstat_inc(p, se.nr_wakeups_remote);
+ update_rq_clock(rq);
+ activate_task(rq, p, 1);
+ success = 1;
+
+out_running:
+ trace_sched_wakeup(rq, p);
+ check_preempt_curr(rq, p, sync);
+
+ p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_wake_up)
+ p->sched_class->task_wake_up(rq, p);
+#endif
+out:
+ current->se.last_wakeup = current->se.sum_exec_runtime;
+
+ task_rq_unlock(rq, &flags);
+
+ return success;
+}
+
+int wake_up_process(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_ALL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ *
+ * __sched_fork() is basic setup used by init_idle() too:
+ */
+static void __sched_fork(struct task_struct *p)
+{
+ p->se.exec_start = 0;
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
+ p->se.last_wakeup = 0;
+ p->se.avg_overlap = 0;
+
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_start = 0;
+ p->se.sum_sleep_runtime = 0;
+ p->se.sleep_start = 0;
+ p->se.block_start = 0;
+ p->se.sleep_max = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+ p->se.wait_max = 0;
+#endif
+
+ INIT_LIST_HEAD(&p->rt.run_list);
+ p->se.on_rq = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
+ /*
+ * We mark the process as running here, but have not actually
+ * inserted it onto the runqueue yet. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_RUNNING;
+}
+
+/*
+ * fork()/clone()-time setup:
+ */
+void sched_fork(struct task_struct *p, int clone_flags)
+{
+ int cpu = get_cpu();
+
+ __sched_fork(p);
+
+#ifdef CONFIG_SMP
+ cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+ set_task_cpu(p, cpu);
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child:
+ */
+ p->prio = current->normal_prio;
+ if (!rt_prio(p->prio))
+ p->sched_class = &fair_sched_class;
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ if (likely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ p->oncpu = 0;
+#endif
+#ifdef CONFIG_PREEMPT
+ /* Want to start with kernel preemption disabled. */
+ task_thread_info(p)->preempt_count = 1;
+#endif
+ put_cpu();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+{
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ BUG_ON(p->state != TASK_RUNNING);
+ update_rq_clock(rq);
+
+ p->prio = effective_prio(p);
+
+ if (!p->sched_class->task_new || !current->se.on_rq) {
+ activate_task(rq, p, 0);
+ } else {
+ /*
+ * Let the scheduling class do new task startup
+ * management (if any):
+ */
+ p->sched_class->task_new(rq, p);
+ inc_nr_running(rq);
+ }
+ trace_sched_wakeup_new(rq, p);
+ check_preempt_curr(rq, p, 0);
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_wake_up)
+ p->sched_class->task_wake_up(rq, p);
+#endif
+ task_rq_unlock(rq, &flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ fire_sched_out_preempt_notifiers(prev, next);
+ prepare_lock_switch(rq, next);
+ prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct mm_struct *mm = rq->prev_mm;
+ long prev_state;
+
+ rq->prev_mm = NULL;
+
+ /*
+ * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ * The test for TASK_DEAD must occur while the runqueue locks are
+ * still held, otherwise prev could be scheduled on another cpu, die
+ * there before we look at prev->state, and then the reference would
+ * be dropped twice.
+ * Manfred Spraul <manfred@colorfullife.com>
+ */
+ prev_state = prev->state;
+ finish_arch_switch(prev);
+ finish_lock_switch(rq, prev);
+#ifdef CONFIG_SMP
+ if (current->sched_class->post_schedule)
+ current->sched_class->post_schedule(rq);
+#endif
+
+ fire_sched_in_preempt_notifiers(current);
+ if (mm)
+ mmdrop(mm);
+ if (unlikely(prev_state == TASK_DEAD)) {
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+ put_task_struct(prev);
+ }
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage void schedule_tail(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq = this_rq();
+
+ finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+ /* In this case, finish_task_switch does not reenable preemption */
+ preempt_enable();
+#endif
+ if (current->set_child_tid)
+ put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct mm_struct *mm, *oldmm;
+
+ prepare_task_switch(rq, prev, next);
+ trace_sched_switch(rq, prev, next);
+ mm = next->mm;
+ oldmm = prev->active_mm;
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_enter_lazy_cpu_mode();
+
+ if (unlikely(!mm)) {
+ next->active_mm = oldmm;
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next);
+ } else
+ switch_mm(oldmm, mm, next);
+
+ if (unlikely(!prev->mm)) {
+ prev->active_mm = NULL;
+ rq->prev_mm = oldmm;
+ }
+ /*
+ * The runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
+
+ /* Here we just switch the register state and the stack. */
+ switch_to(prev, next, prev);
+
+ barrier();
+ /*
+ * this_rq must be evaluated again because prev may have moved
+ * CPUs since it called schedule(), thus the 'rq' on its stack
+ * frame will be invalid.
+ */
+ finish_task_switch(this_rq(), prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_running;
+
+ return sum;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += cpu_rq(i)->nr_uninterruptible;
+
+ /*
+ * Since we read the counters lockless, it might be slightly
+ * inaccurate. Do not allow it to go below zero though:
+ */
+ if (unlikely((long)sum < 0))
+ sum = 0;
+
+ return sum;
+}
+
+unsigned long long nr_context_switches(void)
+{
+ int i;
+ unsigned long long sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += cpu_rq(i)->nr_switches;
+
+ return sum;
+}
+
+unsigned long nr_iowait(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+ return sum;
+}
+
+unsigned long nr_active(void)
+{
+ unsigned long i, running = 0, uninterruptible = 0;
+
+ for_each_online_cpu(i) {
+ running += cpu_rq(i)->nr_running;
+ uninterruptible += cpu_rq(i)->nr_uninterruptible;
+ }
+
+ if (unlikely((long)uninterruptible < 0))
+ uninterruptible = 0;
+
+ return running + uninterruptible;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC).
+ */
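+/*
+ * Each cpu_load[i] is a progressively slower moving average of the rq
+ * load: cpu_load[i] = (old * (2^i - 1) + new) / 2^i. For i == 2, for
+ * example, that is (3 * old + new) / 4, so index 0 tracks the
+ * instantaneous load and higher indices react more slowly.
+ */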
+static void update_cpu_load(struct rq *this_rq)
+{
+ unsigned long this_load = this_rq->load.weight;
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ }
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else {
+ if (rq1 < rq2) {
+ spin_lock(&rq1->lock);
+ spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&rq2->lock);
+ spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+ }
+ update_rq_clock(rq1);
+ update_rq_clock(rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ int ret = 0;
+
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work well under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+ if (unlikely(!spin_trylock(&busiest->lock))) {
+ if (busiest < this_rq) {
+ spin_unlock(&this_rq->lock);
+ spin_lock(&busiest->lock);
+ spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+ ret = 1;
+ } else
+ spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+ }
+ return ret;
+}
+
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpu_allowed mask to only
+ * allow dest_cpu, which will force the cpu onto dest_cpu. Then
+ * the cpu_allowed mask is restored.
+ */
+static void sched_migrate_task(struct task_struct *p, int dest_cpu)
+{
+ struct migration_req req;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ if (!cpu_isset(dest_cpu, p->cpus_allowed)
+ || unlikely(!cpu_active(dest_cpu)))
+ goto out;
+
+ trace_sched_migrate_task(rq, p, dest_cpu);
+ /* force the process onto the specified CPU */
+ if (migrate_task(p, dest_cpu, &req)) {
+ /* Need to wait for migration thread (might exit: take ref). */
+ struct task_struct *mt = rq->migration_thread;
+
+ get_task_struct(mt);
+ task_rq_unlock(rq, &flags);
+ wake_up_process(mt);
+ put_task_struct(mt);
+ wait_for_completion(&req.done);
+
+ return;
+ }
+out:
+ task_rq_unlock(rq, &flags);
+}
+
+/*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
+ */
+void sched_exec(void)
+{
+ int new_cpu, this_cpu = get_cpu();
+ new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+ put_cpu();
+ if (new_cpu != this_cpu)
+ sched_migrate_task(current, new_cpu);
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+ struct rq *this_rq, int this_cpu)
+{
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ /*
+ * Note that idle threads have a prio of MAX_PRIO, so this test
+ * is always true for them.
+ */
+ check_preempt_curr(this_rq, p, 0);
+}
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned)
+{
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 3) are cache-hot on their current CPU.
+ */
+ if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+ schedstat_inc(p, se.nr_failed_migrations_affine);
+ return 0;
+ }
+ *all_pinned = 0;
+
+ if (task_running(rq, p)) {
+ schedstat_inc(p, se.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ if (!task_hot(p, rq->clock, sd) ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(p, rq->clock, sd)) {
+ schedstat_inc(sd, lb_hot_gained[idle]);
+ schedstat_inc(p, se.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ if (task_hot(p, rq->clock, sd)) {
+ schedstat_inc(p, se.nr_failed_migrations_hot);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned,
+ int *this_best_prio, struct rq_iterator *iterator)
+{
+ int loops = 0, pulled = 0, pinned = 0;
+ struct task_struct *p;
+ long rem_load_move = max_load_move;
+
+ if (max_load_move == 0)
+ goto out;
+
+ pinned = 1;
+
+ /*
+ * Start the load-balancing iterator:
+ */
+ p = iterator->start(iterator->arg);
+next:
+ if (!p || loops++ > sysctl_sched_nr_migrate)
+ goto out;
+
+ if ((p->se.load.weight >> 1) > rem_load_move ||
+ !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+ p = iterator->next(iterator->arg);
+ goto next;
+ }
+
+ pull_task(busiest, p, this_rq, this_cpu);
+ pulled++;
+ rem_load_move -= p->se.load.weight;
+
+ /*
+ * We only want to steal up to the prescribed amount of weighted load.
+ */
+ if (rem_load_move > 0) {
+ if (p->prio < *this_best_prio)
+ *this_best_prio = p->prio;
+ p = iterator->next(iterator->arg);
+ goto next;
+ }
+out:
+ /*
+ * Right now, this is one of only two places pull_task() is called,
+ * so we can safely collect pull_task() stats here rather than
+ * inside pull_task().
+ */
+ schedstat_add(sd, lb_gained[idle], pulled);
+
+ if (all_pinned)
+ *all_pinned = pinned;
+
+ return max_load_move - rem_load_move;
+}
+
+/*
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned)
+{
+ const struct sched_class *class = sched_class_highest;
+ unsigned long total_load_moved = 0;
+ int this_best_prio = this_rq->curr->prio;
+
+ do {
+ total_load_moved +=
+ class->load_balance(this_rq, this_cpu, busiest,
+ max_load_move - total_load_moved,
+ sd, idle, all_pinned, &this_best_prio);
+ class = class->next;
+
+ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+ break;
+
+ } while (class && max_load_move > total_load_moved);
+
+ return total_load_moved > 0;
+}
+
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ struct rq_iterator *iterator)
+{
+ struct task_struct *p = iterator->start(iterator->arg);
+ int pinned = 0;
+
+ while (p) {
+ if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+ pull_task(busiest, p, this_rq, this_cpu);
+ /*
+ * Right now, this is only the second place pull_task()
+ * is called, so we can safely collect pull_task()
+ * stats here rather than inside pull_task().
+ */
+ schedstat_inc(sd, lb_gained[idle]);
+
+ return 1;
+ }
+ p = iterator->next(iterator->arg);
+ }
+
+ return 0;
+}
+
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ const struct sched_class *class;
+
+ for (class = sched_class_highest; class; class = class->next)
+ if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the amount of weighted load which
+ * should be moved to restore balance via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+ unsigned long *imbalance, enum cpu_idle_type idle,
+ int *sd_idle, const cpumask_t *cpus, int *balance)
+{
+ struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ unsigned long max_pull;
+ unsigned long busiest_load_per_task, busiest_nr_running;
+ unsigned long this_load_per_task, this_nr_running;
+ int load_idx, group_imb = 0;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance = 1;
+ unsigned long leader_nr_running = 0, min_load_per_task = 0;
+ unsigned long min_nr_running = ULONG_MAX;
+ struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
+
+ max_load = this_load = total_load = total_pwr = 0;
+ busiest_load_per_task = busiest_nr_running = 0;
+ this_load_per_task = this_nr_running = 0;
+
+ if (idle == CPU_NOT_IDLE)
+ load_idx = sd->busy_idx;
+ else if (idle == CPU_NEWLY_IDLE)
+ load_idx = sd->newidle_idx;
+ else
+ load_idx = sd->idle_idx;
+
+ do {
+ unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
+ int local_group;
+ int i;
+ int __group_imb = 0;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ unsigned long sum_nr_running, sum_weighted_load;
+ unsigned long sum_avg_load_per_task;
+ unsigned long avg_load_per_task;
+
+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ if (local_group)
+ balance_cpu = first_cpu(group->cpumask);
+
+ /* Tally up the load of all CPUs in the group */
+ sum_weighted_load = sum_nr_running = avg_load = 0;
+ sum_avg_load_per_task = avg_load_per_task = 0;
+
+ max_cpu_load = 0;
+ min_cpu_load = ~0UL;
+
+ for_each_cpu_mask_nr(i, group->cpumask) {
+ struct rq *rq;
+
+ if (!cpu_isset(i, *cpus))
+ continue;
+
+ rq = cpu_rq(i);
+
+ if (*sd_idle && rq->nr_running)
+ *sd_idle = 0;
+
+ /* Bias balancing toward cpus of our domain */
+ if (local_group) {
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
+ }
+
+ load = target_load(i, load_idx);
+ } else {
+ load = source_load(i, load_idx);
+ if (load > max_cpu_load)
+ max_cpu_load = load;
+ if (min_cpu_load > load)
+ min_cpu_load = load;
+ }
+
+ avg_load += load;
+ sum_nr_running += rq->nr_running;
+ sum_weighted_load += weighted_cpuload(i);
+
+ sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ }
+
+ /*
+ * The first idle cpu, or the first cpu (busiest) in this sched group,
+ * is eligible for doing load balancing at this and above
+ * domains. In the newly idle case, we will allow all the cpus
+ * to do the newly idle load balance.
+ */
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ goto ret;
+ }
+
+ total_load += avg_load;
+ total_pwr += group->__cpu_power;
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
+
+
+ /*
+ * Consider the group unbalanced when the imbalance is larger
+ * than the average weight of two tasks.
+ *
+ * APZ: with cgroup the avg task weight can vary wildly and
+ * might not be a suitable number - should we keep a
+ * normalized nr_running number somewhere that negates
+ * the hierarchy?
+ */
+ avg_load_per_task = sg_div_cpu_power(group,
+ sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ __group_imb = 1;
+
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ this_nr_running = sum_nr_running;
+ this_load_per_task = sum_weighted_load;
+ } else if (avg_load > max_load &&
+ (sum_nr_running > group_capacity || __group_imb)) {
+ max_load = avg_load;
+ busiest = group;
+ busiest_nr_running = sum_nr_running;
+ busiest_load_per_task = sum_weighted_load;
+ group_imb = __group_imb;
+ }
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == CPU_NOT_IDLE ||
+ !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto group_next;
+
+ /*
+ * If the local group is idle or completely loaded
+ * no need to do power savings balance at this domain
+ */
+ if (local_group && (this_nr_running >= group_capacity ||
+ !this_nr_running))
+ power_savings_balance = 0;
+
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!power_savings_balance || sum_nr_running >= group_capacity
+ || !sum_nr_running)
+ goto group_next;
+
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sum_nr_running < min_nr_running) ||
+ (sum_nr_running == min_nr_running &&
+ first_cpu(group->cpumask) <
+ first_cpu(group_min->cpumask))) {
+ group_min = group;
+ min_nr_running = sum_nr_running;
+ min_load_per_task = sum_weighted_load /
+ sum_nr_running;
+ }
+
+ /*
+ * Calculate the group which is near its capacity
+ * but still has some space to pick up some load
+ * from other groups and save more power
+ */
+ if (sum_nr_running <= group_capacity - 1) {
+ if (sum_nr_running > leader_nr_running ||
+ (sum_nr_running == leader_nr_running &&
+ first_cpu(group->cpumask) >
+ first_cpu(group_leader->cpumask))) {
+ group_leader = group;
+ leader_nr_running = sum_nr_running;
+ }
+ }
+group_next:
+#endif
+ group = group->next;
+ } while (group != sd->groups);
+
+ if (!busiest || this_load >= max_load || busiest_nr_running == 0)
+ goto out_balanced;
+
+ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+ if (this_load >= avg_load ||
+ 100*max_load <= sd->imbalance_pct*this_load)
+ goto out_balanced;
+
+ busiest_load_per_task /= busiest_nr_running;
+ if (group_imb)
+ busiest_load_per_task = min(busiest_load_per_task, avg_load);
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load, as either of these
+ * actions would just result in more rebalancing later, and ping-pong
+ * tasks around. Thus we look for the minimum possible imbalance.
+ * Negative imbalances (*we* are more loaded than anyone else) will
+ * be counted as no imbalance for these purposes -- we can't fix that
+ * by pulling tasks to us. Be careful of negative numbers as they'll
+ * appear as very large values with unsigned longs.
+ */
+ if (max_load <= busiest_load_per_task)
+ goto out_balanced;
+
+ /*
+ * In the presence of smp nice balancing, certain scenarios can have
+ * max load less than avg load (as we skip the groups at or below
+ * their cpu_power while calculating max_load).
+ */
+ if (max_load < avg_load) {
+ *imbalance = 0;
+ goto small_imbalance;
+ }
+
+ /* Don't want to pull so many tasks that a group would go idle */
+ max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+
+ /* How much load to actually move to equalise the imbalance */
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
+ / SCHED_LOAD_SCALE;
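+
+ /*
+ * For example, with both cpu_powers at SCHED_LOAD_SCALE, max_load
+ * == 2048, this_load == 1024, avg_load == 1536 and
+ * busiest_load_per_task == 512: max_pull == 512 and *imbalance
+ * == 512, i.e. enough to pull one 512-weight task across.
+ */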
+
+ /*
+ * If *imbalance is less than the average load per runnable task,
+ * there is no guarantee that any tasks will be moved, so we may
+ * have to bump its value to force at least one task to be
+ * moved.
+ */
+ if (*imbalance < busiest_load_per_task) {
+ unsigned long tmp, pwr_now, pwr_move;
+ unsigned int imbn;
+
+small_imbalance:
+ pwr_move = pwr_now = 0;
+ imbn = 2;
+ if (this_nr_running) {
+ this_load_per_task /= this_nr_running;
+ if (busiest_load_per_task > this_load_per_task)
+ imbn = 1;
+ } else
+ this_load_per_task = cpu_avg_load_per_task(this_cpu);
+
+ if (max_load - this_load + busiest_load_per_task >=
+ busiest_load_per_task * imbn) {
+ *imbalance = busiest_load_per_task;
+ return busiest;
+ }
+
+ /*
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU power used by
+ * moving them.
+ */
+
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
+ pwr_now /= SCHED_LOAD_SCALE;
+
+ /* Amount of load we'd subtract */
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ if (max_load > tmp)
+ pwr_move += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load - tmp);
+
+ /* Amount of load we'd add */
+ if (max_load * busiest->__cpu_power <
+ busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
+ else
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
+ pwr_move /= SCHED_LOAD_SCALE;
+
+ /* Move if we gain throughput */
+ if (pwr_move > pwr_now)
+ *imbalance = busiest_load_per_task;
+ }
+
+ return busiest;
+
+out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto ret;
+
+ if (this == group_leader && group_leader != group_min) {
+ *imbalance = min_load_per_task;
+ return group_min;
+ }
+#endif
+ret:
+ *imbalance = 0;
+ return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static struct rq *
+find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
+ unsigned long imbalance, const cpumask_t *cpus)
+{
+ struct rq *busiest = NULL, *rq;
+ unsigned long max_load = 0;
+ int i;
+
+ for_each_cpu_mask_nr(i, group->cpumask) {
+ unsigned long wl;
+
+ if (!cpu_isset(i, *cpus))
+ continue;
+
+ rq = cpu_rq(i);
+ wl = weighted_cpuload(i);
+
+ if (rq->nr_running == 1 && wl > imbalance)
+ continue;
+
+ if (wl > max_load) {
+ max_load = wl;
+ busiest = rq;
+ }
+ }
+
+ return busiest;
+}
+
+/*
+ * Max backoff if we encounter pinned tasks. The value is fairly
+ * arbitrary, so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL 512
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ */
+static int load_balance(int this_cpu, struct rq *this_rq,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *balance, cpumask_t *cpus)
+{
+ int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+ struct sched_group *group;
+ unsigned long imbalance;
+ struct rq *busiest;
+ unsigned long flags;
+
+ cpus_setall(*cpus);
+
+ /*
+ * When power savings policy is enabled for the parent domain, idle
+ * sibling can pick up load irrespective of busy siblings. In this case,
+ * let the state of idle sibling percolate up as CPU_IDLE, instead of
+ * portraying it as CPU_NOT_IDLE.
+ */
+ if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ sd_idle = 1;
+
+ schedstat_inc(sd, lb_count[idle]);
+
+redo:
+ update_shares(sd);
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ cpus, balance);
+
+ if (*balance == 0)
+ goto out_balanced;
+
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[idle]);
+ goto out_balanced;
+ }
+
+ busiest = find_busiest_queue(group, idle, imbalance, cpus);
+ if (!busiest) {
+ schedstat_inc(sd, lb_nobusyq[idle]);
+ goto out_balanced;
+ }
+
+ BUG_ON(busiest == this_rq);
+
+ schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+ ld_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. ld_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ local_irq_save(flags);
+ double_rq_lock(this_rq, busiest);
+ ld_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, idle, &all_pinned);
+ double_rq_unlock(this_rq, busiest);
+ local_irq_restore(flags);
+
+ /*
+ * some other cpu did the load balance for us.
+ */
+ if (ld_moved && this_cpu != smp_processor_id())
+ resched_cpu(this_cpu);
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(all_pinned)) {
+ cpu_clear(cpu_of(busiest), *cpus);
+ if (!cpus_empty(*cpus))
+ goto redo;
+ goto out_balanced;
+ }
+ }
+
+ if (!ld_moved) {
+ schedstat_inc(sd, lb_failed[idle]);
+ sd->nr_balance_failed++;
+
+ if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+
+ spin_lock_irqsave(&busiest->lock, flags);
+
+ /*
+ * Don't kick the migration_thread if the current
+ * task on the busiest cpu can't be moved to this_cpu.
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ spin_unlock_irqrestore(&busiest->lock, flags);
+ all_pinned = 1;
+ goto out_one_pinned;
+ }
+
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ active_balance = 1;
+ }
+ spin_unlock_irqrestore(&busiest->lock, flags);
+ if (active_balance)
+ wake_up_process(busiest->migration_thread);
+
+ /*
+ * We've kicked active balancing, so reset the failure
+ * counter.
+ */
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
+ }
+ } else
+ sd->nr_balance_failed = 0;
+
+ if (likely(!active_balance)) {
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * move_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+ }
+
+ if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ ld_moved = -1;
+
+ goto out;
+
+out_balanced:
+ schedstat_inc(sd, lb_balanced[idle]);
+
+ sd->nr_balance_failed = 0;
+
+out_one_pinned:
+ /* tune up the balancing interval */
+ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ (sd->balance_interval < sd->max_interval))
+ sd->balance_interval *= 2;
+
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ ld_moved = -1;
+ else
+ ld_moved = 0;
+out:
+ if (ld_moved)
+ update_shares(sd);
+ return ld_moved;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
+ * this_rq is locked.
+ */
+static int
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
+ cpumask_t *cpus)
+{
+ struct sched_group *group;
+ struct rq *busiest = NULL;
+ unsigned long imbalance;
+ int ld_moved = 0;
+ int sd_idle = 0;
+ int all_pinned = 0;
+
+ cpus_setall(*cpus);
+
+ /*
+ * When the power-savings policy is enabled for the parent domain, an idle
+ * sibling can pick up load irrespective of busy siblings. In this case,
+ * let the state of the idle sibling percolate up as idle, instead of
+ * portraying it as CPU_NOT_IDLE.
+ */
+ if (sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ sd_idle = 1;
+
+ schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
+redo:
+ update_shares_locked(this_rq, sd);
+ group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
+ &sd_idle, cpus, NULL);
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
+ goto out_balanced;
+ }
+
+ busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
+ if (!busiest) {
+ schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
+ goto out_balanced;
+ }
+
+ BUG_ON(busiest == this_rq);
+
+ schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
+
+ ld_moved = 0;
+ if (busiest->nr_running > 1) {
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+ /* this_rq->clock is already updated */
+ update_rq_clock(busiest);
+ ld_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, CPU_NEWLY_IDLE,
+ &all_pinned);
+ double_unlock_balance(this_rq, busiest);
+
+ if (unlikely(all_pinned)) {
+ cpu_clear(cpu_of(busiest), *cpus);
+ if (!cpus_empty(*cpus))
+ goto redo;
+ }
+ }
+
+ if (!ld_moved) {
+ schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ return -1;
+ } else
+ sd->nr_balance_failed = 0;
+
+ update_shares_locked(this_rq, sd);
+ return ld_moved;
+
+out_balanced:
+ schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+ return -1;
+ sd->nr_balance_failed = 0;
+
+ return 0;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static void idle_balance(int this_cpu, struct rq *this_rq)
+{
+ struct sched_domain *sd;
+ int pulled_task = -1;
+ unsigned long next_balance = jiffies + HZ;
+ cpumask_t tmpmask;
+
+ for_each_domain(this_cpu, sd) {
+ unsigned long interval;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE)
+ /* If we've pulled tasks over, stop searching: */
+ pulled_task = load_balance_newidle(this_cpu, this_rq,
+ sd, &tmpmask);
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ if (time_after(next_balance, sd->last_balance + interval))
+ next_balance = sd->last_balance + interval;
+ if (pulled_task)
+ break;
+ }
+ if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
+ /*
+ * We are going idle. next_balance may be set based on
+ * a busy processor. So reset next_balance.
+ */
+ this_rq->next_balance = next_balance;
+ }
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes running tasks
+ * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
+ * running on each physical CPU where possible, and avoids physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ */
+static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+{
+ int target_cpu = busiest_rq->push_cpu;
+ struct sched_domain *sd;
+ struct rq *target_rq;
+
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ return;
+
+ target_rq = cpu_rq(target_cpu);
+
+ /*
+ * This condition is "impossible"; if it occurs,
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+ update_rq_clock(busiest_rq);
+ update_rq_clock(target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ for_each_domain(target_cpu, sd) {
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpu_isset(busiest_cpu, sd->span))
+ break;
+ }
+
+ if (likely(sd)) {
+ schedstat_inc(sd, alb_count);
+
+ if (move_one_task(target_rq, target_cpu, busiest_rq,
+ sd, CPU_IDLE))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ double_unlock_balance(busiest_rq, target_rq);
+}
+
+#ifdef CONFIG_NO_HZ
+static struct {
+ atomic_t load_balancer;
+ cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+ .load_balancer = ATOMIC_INIT(-1),
+ .cpu_mask = CPU_MASK_NONE,
+};
+
+/*
+ * This routine tries to nominate an ilb (idle load balancing) owner among
+ * the cpus whose ticks are stopped. The ilb owner does the idle load
+ * balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep until the next wakeup
+ * event arrives.
+ *
+ * For the ilb owner, the tick is not stopped, and it is used for idle
+ * load balancing. The ilb owner remains part of nohz.cpu_mask.
+ *
+ * While stopping the tick, this cpu becomes the ilb owner if there is
+ * no other owner, and it remains the owner until it becomes busy or
+ * until all cpus in the system stop their ticks, at which point there
+ * is no need for an ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner during
+ * the next busy scheduler_tick().
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
+
+ if (stop_tick) {
+ cpu_set(cpu, nohz.cpu_mask);
+ cpu_rq(cpu)->in_nohz_recently = 1;
+
+ /*
+ * If we are going offline and still the leader, give up!
+ */
+ if (!cpu_active(cpu) &&
+ atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ return 0;
+ }
+
+ /* time for ilb owner also to sleep */
+ if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ atomic_set(&nohz.load_balancer, -1);
+ return 0;
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+ return 1;
+ } else if (atomic_read(&nohz.load_balancer) == cpu)
+ return 1;
+ } else {
+ if (!cpu_isset(cpu, nohz.cpu_mask))
+ return 0;
+
+ cpu_clear(cpu, nohz.cpu_mask);
+
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ }
+ return 0;
+}
+#endif
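+
+/*
+ * Editor's note -- illustrative sketch, not part of the imported 2.6.28
+ * source.  The atomic_cmpxchg() based owner election performed by
+ * select_nohz_load_balancer() above boils down to the following
+ * stand-alone C11 analogue; the names owner and claim_ownership are
+ * invented for this example:
+ *
+ *	#include <stdatomic.h>
+ *
+ *	static atomic_int owner = ATOMIC_VAR_INIT(-1);	// -1: no owner yet
+ *
+ *	// Returns 1 if @cpu is (or just became) the owner, 0 otherwise.
+ *	static int claim_ownership(int cpu)
+ *	{
+ *		int expected = -1;
+ *
+ *		if (atomic_compare_exchange_strong(&owner, &expected, cpu))
+ *			return 1;		// won the race for ownership
+ *		return expected == cpu;		// were we already the owner?
+ *	}
+ */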
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+{
+ int balance = 1;
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long interval;
+ struct sched_domain *sd;
+ /* Earliest time when we have to do rebalance again */
+ unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
+ int need_serialize;
+ cpumask_t tmp;
+
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ interval = sd->balance_interval;
+ if (idle != CPU_IDLE)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ if (unlikely(!interval))
+ interval = 1;
+ if (interval > HZ*NR_CPUS/10)
+ interval = HZ*NR_CPUS/10;
+
+ need_serialize = sd->flags & SD_SERIALIZE;
+
+ if (need_serialize) {
+ if (!spin_trylock(&balancing))
+ goto out;
+ }
+
+ if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+ /*
+ * We've pulled tasks over so either we're no
+ * longer idle, or one of our SMT siblings is
+ * not idle.
+ */
+ idle = CPU_NOT_IDLE;
+ }
+ sd->last_balance = jiffies;
+ }
+ if (need_serialize)
+ spin_unlock(&balancing);
+out:
+ if (time_after(next_balance, sd->last_balance + interval)) {
+ next_balance = sd->last_balance + interval;
+ update_next_balance = 1;
+ }
+
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!balance)
+ break;
+ }
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the cpu is attached to a null domain, for example, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In the CONFIG_NO_HZ case, the idle load balancing owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int this_cpu = smp_processor_id();
+ struct rq *this_rq = cpu_rq(this_cpu);
+ enum cpu_idle_type idle = this_rq->idle_at_tick ?
+ CPU_IDLE : CPU_NOT_IDLE;
+
+ rebalance_domains(this_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * If this cpu is the owner for idle load balancing, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped.
+ */
+ if (this_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == this_cpu) {
+ cpumask_t cpus = nohz.cpu_mask;
+ struct rq *rq;
+ int balance_cpu;
+
+ cpu_clear(this_cpu, cpus);
+ for_each_cpu_mask_nr(balance_cpu, cpus) {
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for the other cpus. The next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(struct rq *rq, int cpu)
+{
+#ifdef CONFIG_NO_HZ
+ /*
+ * If we were in the nohz mode recently and busy at the current
+ * scheduler tick, then check if we need to nominate a new idle
+ * load balancer.
+ */
+ if (rq->in_nohz_recently && !rq->idle_at_tick) {
+ rq->in_nohz_recently = 0;
+
+ if (atomic_read(&nohz.load_balancer) == cpu) {
+ cpu_clear(cpu, nohz.cpu_mask);
+ atomic_set(&nohz.load_balancer, -1);
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /*
+ * simple selection for now: Nominate the
+ * first cpu in the nohz list to be the next
+ * ilb owner.
+ *
+ * TBD: Traverse the sched domains and nominate
+ * the nearest cpu in the nohz.cpu_mask.
+ */
+ int ilb = first_cpu(nohz.cpu_mask);
+
+ if (ilb < nr_cpu_ids)
+ resched_cpu(ilb);
+ }
+ }
+
+ /*
+ * If this cpu is idle and doing idle load balancing for all the
+ * cpus with ticks stopped, is it time for that to stop?
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+ cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ resched_cpu(cpu);
+ return;
+ }
+
+ /*
+ * If this cpu is idle and the idle load balancing is done by
+ * someone else, then there is no need to raise the SCHED_SOFTIRQ
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+ cpu_isset(cpu, nohz.cpu_mask))
+ return;
+#endif
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
+}
+
+#else /* CONFIG_SMP */
+
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void idle_balance(int cpu, struct rq *rq)
+{
+}
+
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p, in case that task is currently running.
+ */
+unsigned long long task_delta_exec(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ if (task_current(rq, p)) {
+ u64 delta_exec;
+
+ update_rq_clock(rq);
+ delta_exec = rq->clock - p->se.exec_start;
+ if ((s64)delta_exec > 0)
+ ns = delta_exec;
+ }
+
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ cputime64_t tmp;
+
+ p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
+
+ /* Add user time to cpustat. */
+ tmp = cputime_to_cputime64(cputime);
+ if (TASK_NICE(p) > 0)
+ cpustat->nice = cputime64_add(cpustat->nice, tmp);
+ else
+ cpustat->user = cputime64_add(cpustat->user, tmp);
+ /* Account for user time used */
+ acct_update_integrals(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime)
+{
+ cputime64_t tmp;
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ tmp = cputime_to_cputime64(cputime);
+
+ p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
+ p->gtime = cputime_add(p->gtime, cputime);
+
+ cpustat->user = cputime64_add(cpustat->user, tmp);
+ cpustat->guest = cputime64_add(cpustat->guest, tmp);
+}
+
+/*
+ * Account scaled user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->utimescaled = cputime_add(p->utimescaled, cputime);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+ cputime_t cputime)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ struct rq *rq = this_rq();
+ cputime64_t tmp;
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime);
+ return;
+ }
+
+ p->stime = cputime_add(p->stime, cputime);
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ tmp = cputime_to_cputime64(cputime);
+ if (hardirq_count() - hardirq_offset)
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ else if (softirq_count())
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ else if (p != rq->idle)
+ cpustat->system = cputime64_add(cpustat->system, tmp);
+ else if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+ else
+ cpustat->idle = cputime64_add(cpustat->idle, tmp);
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
+ * Account scaled system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+{
+ p->stimescaled = cputime_add(p->stimescaled, cputime);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @p: the process from which the cpu time has been stolen
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(struct task_struct *p, cputime_t steal)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ cputime64_t tmp = cputime_to_cputime64(steal);
+ struct rq *rq = this_rq();
+
+ if (p == rq->idle) {
+ p->stime = cputime_add(p->stime, steal);
+ account_group_system_time(p, steal);
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+ else
+ cpustat->idle = cputime64_add(cpustat->idle, tmp);
+ } else
+ cpustat->steal = cputime64_add(cpustat->steal, tmp);
+}
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+ return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+ u64 temp;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+
+ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+ return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ clock_t stime;
+
+ /*
+ * Use CFS's precise accounting. (we subtract utime from
+ * the total, to make sure the total observed by userspace
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+ if (stime >= 0)
+ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+ return p->prev_stime;
+}
+#endif
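+
+/*
+ * Editor's note -- illustrative sketch, not part of the imported 2.6.28
+ * source.  The proportional split performed by task_utime() above (scale
+ * the precise CFS runtime by utime / (utime + stime)) can be written as
+ * the following stand-alone helper; the name split_utime and its
+ * parameters are invented for this example:
+ *
+ *	#include <stdint.h>
+ *
+ *	static uint64_t split_utime(uint64_t runtime, uint64_t utime,
+ *				    uint64_t stime)
+ *	{
+ *		uint64_t total = utime + stime;
+ *
+ *		if (!total)		// nothing accounted yet: credit it all
+ *			return runtime;
+ *		return runtime * utime / total;
+ *	}
+ *
+ * task_utime() additionally clamps the result via p->prev_utime so the
+ * value reported to user space never goes backwards.
+ */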
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr = rq->curr;
+
+ sched_clock_tick();
+
+ spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ update_cpu_load(rq);
+ curr->sched_class->task_tick(rq, curr, 0);
+ spin_unlock(&rq->lock);
+
+#ifdef CONFIG_SMP
+ rq->idle_at_tick = idle_cpu(cpu);
+ trigger_load_balance(rq, cpu);
+#endif
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+ if (in_lock_functions(addr)) {
+ addr = CALLER_ADDR2;
+ if (in_lock_functions(addr))
+ addr = CALLER_ADDR3;
+ }
+ return addr;
+}
+
+void __kprobes add_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ if (preempt_count() == val)
+ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void __kprobes sub_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+ struct pt_regs *regs = get_irq_regs();
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ print_modules();
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+ /*
+ * Test if we are atomic. Since do_exit() needs to call into
+ * schedule() atomically, we ignore that path for now.
+ * Otherwise, whine if we are scheduling when we should not be.
+ */
+ if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+ __schedule_bug(prev);
+
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+ schedstat_inc(this_rq(), sched_count);
+#ifdef CONFIG_SCHEDSTATS
+ if (unlikely(prev->lock_depth >= 0)) {
+ schedstat_inc(this_rq(), bkl_count);
+ schedstat_inc(prev, sched_info.bkl_count);
+ }
+#endif
+}
+
+/*
+ * Pick up the highest-prio task:
+ */
+static inline struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev)
+{
+ const struct sched_class *class;
+ struct task_struct *p;
+
+ /*
+ * Optimization: we know that if all tasks are in
+ * the fair class we can call that function directly:
+ */
+ if (likely(rq->nr_running == rq->cfs.nr_running)) {
+ p = fair_sched_class.pick_next_task(rq);
+ if (likely(p))
+ return p;
+ }
+
+ class = sched_class_highest;
+ for ( ; ; ) {
+ p = class->pick_next_task(rq);
+ if (p)
+ return p;
+ /*
+ * Will never be NULL as the idle class always
+ * returns a non-NULL p:
+ */
+ class = class->next;
+ }
+}
+
+/*
+ * schedule() is the main scheduler function.
+ */
+asmlinkage void __sched schedule(void)
+{
+ struct task_struct *prev, *next;
+ unsigned long *switch_count;
+ struct rq *rq;
+ int cpu;
+
+need_resched:
+ preempt_disable();
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ rcu_qsctr_inc(cpu);
+ prev = rq->curr;
+ switch_count = &prev->nivcsw;
+
+ release_kernel_lock(prev);
+need_resched_nonpreemptible:
+
+ schedule_debug(prev);
+
+ if (sched_feat(HRTICK))
+ hrtick_clear(rq);
+
+ spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ clear_tsk_need_resched(prev);
+
+ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (unlikely(signal_pending_state(prev->state, prev)))
+ prev->state = TASK_RUNNING;
+ else
+ deactivate_task(rq, prev, 1);
+ switch_count = &prev->nvcsw;
+ }
+
+#ifdef CONFIG_SMP
+ if (prev->sched_class->pre_schedule)
+ prev->sched_class->pre_schedule(rq, prev);
+#endif
+
+ if (unlikely(!rq->nr_running))
+ idle_balance(cpu, rq);
+
+ prev->sched_class->put_prev_task(rq, prev);
+ next = pick_next_task(rq, prev);
+
+ if (likely(prev != next)) {
+ sched_info_switch(prev, next);
+
+ rq->nr_switches++;
+ rq->curr = next;
+ ++*switch_count;
+
+ context_switch(rq, prev, next); /* unlocks the rq */
+ /*
+ * the context switch might have flipped the stack from under
+ * us, hence refresh the local variables.
+ */
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ } else
+ spin_unlock_irq(&rq->lock);
+
+ if (unlikely(reacquire_kernel_lock(current) < 0))
+ goto need_resched_nonpreemptible;
+
+ preempt_enable_no_resched();
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ goto need_resched;
+}
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_PREEMPT
+/*
+ * This is the entry point to schedule() for in-kernel preemption
+ * triggered from preempt_enable(). Kernel preemption off the
+ * return-from-interrupt path is handled by preempt_schedule_irq()
+ * below, which calls schedule() directly.
+ */
+asmlinkage void __sched preempt_schedule(void)
+{
+ struct thread_info *ti = current_thread_info();
+
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(ti->preempt_count || irqs_disabled()))
+ return;
+
+ do {
+ add_preempt_count(PREEMPT_ACTIVE);
+ schedule();
+ sub_preempt_count(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+}
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * This is the entry point to schedule() for kernel preemption
+ * off of irq context.
+ * Note that this is called and returns with irqs disabled. This
+ * protects us against recursive calls from irq context.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+ struct thread_info *ti = current_thread_info();
+
+ /* Catch callers which need to be fixed */
+ BUG_ON(ti->preempt_count || !irqs_disabled());
+
+ do {
+ add_preempt_count(PREEMPT_ACTIVE);
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ sub_preempt_count(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
+{
+ return try_to_wake_up(curr->private, mode, sync);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive is a small
+ * positive number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int sync, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, sync, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+{
+ __wake_up_common(q, mode, 1, 0, NULL);
+}
+
+/**
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs in that the waker knows it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - i.e. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ */
+void
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ unsigned long flags;
+ int sync = 1;
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(!nr_exclusive))
+ sync = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x, long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ wait.flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
+unsigned long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Returns: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ int ret = 1;
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irq(&x->wait.lock);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Returns: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ int ret = 1;
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irq(&x->wait.lock);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
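+
+/*
+ * Editor's note -- illustrative usage sketch, not part of the imported
+ * 2.6.28 source.  A typical pattern pairs complete() in an interrupt
+ * handler with wait_for_completion_timeout() in process context; the
+ * identifiers my_dev, my_irq_handler, my_send_cmd and start_hw_command
+ * are hypothetical:
+ *
+ *	#include <linux/completion.h>
+ *	#include <linux/errno.h>
+ *	#include <linux/interrupt.h>
+ *
+ *	struct my_dev {
+ *		struct completion cmd_done;
+ *	};
+ *
+ *	static irqreturn_t my_irq_handler(int irq, void *data)
+ *	{
+ *		struct my_dev *dev = data;
+ *
+ *		complete(&dev->cmd_done);	// wake exactly one waiter
+ *		return IRQ_HANDLED;
+ *	}
+ *
+ *	static int my_send_cmd(struct my_dev *dev)
+ *	{
+ *		init_completion(&dev->cmd_done);
+ *		start_hw_command(dev);		// hypothetical hardware kick
+ *		if (!wait_for_completion_timeout(&dev->cmd_done, HZ))
+ *			return -ETIMEDOUT;	// 0 means the timeout expired
+ *		return 0;
+ *	}
+ */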
+
+static long __sched
+sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+{
+ unsigned long flags;
+ wait_queue_t wait;
+
+ init_waitqueue_entry(&wait, current);
+
+ __set_current_state(state);
+
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, &wait);
+ spin_unlock(&q->lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&q->lock);
+ __remove_wait_queue(q, &wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ return timeout;
+}
+
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+ sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+ return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void __sched sleep_on(wait_queue_head_t *q)
+{
+ sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(sleep_on);
+
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+ return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(sleep_on_timeout);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ unsigned long flags;
+ int oldprio, on_rq, running;
+ struct rq *rq;
+ const struct sched_class *prev_class = p->sched_class;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
+
+ oldprio = p->prio;
+ on_rq = p->se.on_rq;
+ running = task_current(rq, p);
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+
+ if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq) {
+ enqueue_task(rq, p, 0);
+
+ check_class_changed(rq, p, prev_class, oldprio, running);
+ }
+ task_rq_unlock(rq, &flags);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ int old_prio, delta, on_rq;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling as long as the task is
+ * SCHED_FIFO/SCHED_RR:
+ */
+ if (task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ goto out_unlock;
+ }
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
+
+ if (on_rq) {
+ enqueue_task(rq, p, 0);
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ resched_task(rq->curr);
+ }
+out_unlock:
+ task_rq_unlock(rq, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = 20 - nice;
+
+ return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ if (increment < -40)
+ increment = -40;
+ if (increment > 40)
+ increment = 40;
+
+ nice = PRIO_TO_NICE(current->static_prio) + increment;
+ if (nice < -20)
+ nice = -20;
+ if (nice > 19)
+ nice = 19;
+
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
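+
+/*
+ * Editor's note -- illustrative user-space counterpart of the syscall
+ * above, not part of the imported 2.6.28 source:
+ *
+ *	#include <errno.h>
+ *	#include <stdio.h>
+ *	#include <unistd.h>
+ *
+ *	int main(void)
+ *	{
+ *		int val;
+ *
+ *		errno = 0;
+ *		val = nice(5);			// lower our priority by 5
+ *		if (val == -1 && errno != 0)
+ *			perror("nice");
+ *		else
+ *			printf("new nice value: %d\n", val);
+ *		return 0;
+ *	}
+ */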
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks show up as negative values (offset by -MAX_RT_PRIO);
+ * normal tasks map their nice range onto 0..39, with nice 0 at 20.
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+ return TASK_NICE(p);
+}
+EXPORT_SYMBOL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+ return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void
+__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+{
+ BUG_ON(p->se.on_rq);
+
+ p->policy = policy;
+ switch (p->policy) {
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ p->sched_class = &fair_sched_class;
+ break;
+ case SCHED_FIFO:
+ case SCHED_RR:
+ p->sched_class = &rt_sched_class;
+ break;
+ }
+
+ p->rt_priority = prio;
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+ set_load_weight(p);
+}
+
+static int __sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param, bool user)
+{
+ int retval, oldprio, oldpolicy = -1, on_rq, running;
+ unsigned long flags;
+ const struct sched_class *prev_class = p->sched_class;
+ struct rq *rq;
+
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
+recheck:
+ /* double check policy once rq lock held */
+ if (policy < 0)
+ policy = oldpolicy = p->policy;
+ else if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLE)
+ return -EINVAL;
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
+ */
+ if (param->sched_priority < 0 ||
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ return -EINVAL;
+ if (rt_policy(policy) != (param->sched_priority != 0))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (user && !capable(CAP_SYS_NICE)) {
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio;
+
+ if (!lock_task_sighand(p, &flags))
+ return -ESRCH;
+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ unlock_task_sighand(p, &flags);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+ /*
+ * Like positive nice levels, dont allow tasks to
+ * move out of SCHED_IDLE either:
+ */
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
+ return -EPERM;
+
+ /* can't change another user's priorities */
+ if ((current->euid != p->euid) &&
+ (current->euid != p->uid))
+ return -EPERM;
+ }
+
+ if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0)
+ return -EPERM;
+#endif
+
+ retval = security_task_setscheduler(p, policy, param);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ spin_lock_irqsave(&p->pi_lock, flags);
+ /*
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ rq = __task_rq_lock(p);
+ /* recheck policy now with rq lock held */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+ goto recheck;
+ }
+ update_rq_clock(rq);
+ on_rq = p->se.on_rq;
+ running = task_current(rq, p);
+ if (on_rq)
+ deactivate_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+
+ oldprio = p->prio;
+ __setscheduler(rq, p, policy, param->sched_priority);
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq) {
+ activate_task(rq, p, 0);
+
+ check_class_changed(rq, p, prev_class, oldprio, running);
+ }
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ rt_mutex_adjust_pi(p);
+
+ return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may already be dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param)
+{
+ return __sched_setscheduler(p, policy, param, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ struct sched_param *param)
+{
+ return __sched_setscheduler(p, policy, param, false);
+}
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+ struct sched_param __user *, param)
+{
+ /* negative values for policy are not valid */
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, -1, param);
+}
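+
+/*
+ * Editor's note -- illustrative user-space use of the syscalls above via
+ * the glibc wrappers, not part of the imported 2.6.28 source.  Switching
+ * to SCHED_FIFO normally requires CAP_SYS_NICE (or a non-zero
+ * RLIMIT_RTPRIO), as enforced by __sched_setscheduler():
+ *
+ *	#include <sched.h>
+ *	#include <stdio.h>
+ *
+ *	int main(void)
+ *	{
+ *		struct sched_param sp = { .sched_priority = 10 };
+ *
+ *		if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
+ *			perror("sched_setscheduler");
+ *		printf("policy: %d\n", sched_getscheduler(0));
+ *		return 0;
+ *	}
+ */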
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy;
+ }
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ lp.sched_priority = p->rt_priority;
+ read_unlock(&tasklist_lock);
+
+ /*
+ * This one might sleep; we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+ return retval;
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+{
+ cpumask_t cpus_allowed;
+ cpumask_t new_mask = *in_mask;
+ struct task_struct *p;
+ int retval;
+
+ get_online_cpus();
+ read_lock(&tasklist_lock);
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ put_online_cpus();
+ return -ESRCH;
+ }
+
+ /*
+ * It is not safe to call set_cpus_allowed with the
+ * tasklist_lock held. We will bump the task_struct's
+ * usage count and then drop tasklist_lock.
+ */
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ retval = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+
+ retval = security_task_setscheduler(p, 0, NULL);
+ if (retval)
+ goto out_unlock;
+
+ cpuset_cpus_allowed(p, &cpus_allowed);
+ cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
+ retval = set_cpus_allowed_ptr(p, &new_mask);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, &cpus_allowed);
+ if (!cpus_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ new_mask = cpus_allowed;
+ goto again;
+ }
+ }
+out_unlock:
+ put_task_struct(p);
+ put_online_cpus();
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ cpumask_t *new_mask)
+{
+ if (len < sizeof(cpumask_t)) {
+ memset(new_mask, 0, sizeof(cpumask_t));
+ } else if (len > sizeof(cpumask_t)) {
+ len = sizeof(cpumask_t);
+ }
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_t new_mask;
+ int retval;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+ if (retval)
+ return retval;
+
+ return sched_setaffinity(pid, &new_mask);
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+ struct task_struct *p;
+ int retval;
+
+ get_online_cpus();
+ read_lock(&tasklist_lock);
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ put_online_cpus();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_t mask;
+
+ if (len < sizeof(cpumask_t))
+ return -EINVAL;
+
+ ret = sched_getaffinity(pid, &mask);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
+ return -EFAULT;
+
+ return sizeof(cpumask_t);
+}
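+
+/*
+ * Editor's note -- illustrative user-space use of the affinity syscalls
+ * above through the glibc cpu_set_t helpers, not part of the imported
+ * 2.6.28 source:
+ *
+ *	#define _GNU_SOURCE
+ *	#include <sched.h>
+ *	#include <stdio.h>
+ *
+ *	int main(void)
+ *	{
+ *		cpu_set_t mask;
+ *
+ *		CPU_ZERO(&mask);
+ *		CPU_SET(0, &mask);		// pin ourselves to cpu 0
+ *		if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
+ *			perror("sched_setaffinity");
+ *
+ *		if (sched_getaffinity(0, sizeof(mask), &mask) == 0)
+ *			printf("bound to cpu 0: %d\n",
+ *			       CPU_ISSET(0, &mask) ? 1 : 0);
+ *		return 0;
+ *	}
+ */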
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ struct rq *rq = this_rq_lock();
+
+ schedstat_inc(rq, yld_count);
+ current->sched_class->yield_task(rq);
+
+ /*
+ * Since we are going to call schedule() anyway, there's
+ * no need to preempt or enable interrupts:
+ */
+ __release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ _raw_spin_unlock(&rq->lock);
+ preempt_enable_no_resched();
+
+ schedule();
+
+ return 0;
+}
+
+static void __cond_resched(void)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+ __might_sleep(__FILE__, __LINE__);
+#endif
+ /*
+ * The BKS might be reacquired before we have dropped
+ * PREEMPT_ACTIVE, which could trigger a second
+ * cond_resched() call.
+ */
+ do {
+ add_preempt_count(PREEMPT_ACTIVE);
+ schedule();
+ sub_preempt_count(PREEMPT_ACTIVE);
+ } while (need_resched());
+}
+
+int __sched _cond_resched(void)
+{
+ if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
+ system_state == SYSTEM_RUNNING) {
+ __cond_resched();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t *lock)
+{
+ int resched = need_resched() && system_state == SYSTEM_RUNNING;
+ int ret = 0;
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched && need_resched())
+ __cond_resched();
+ else
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(cond_resched_lock);
+
+int __sched cond_resched_softirq(void)
+{
+ BUG_ON(!in_softirq());
+
+ if (need_resched() && system_state == SYSTEM_RUNNING) {
+ local_bh_enable();
+ __cond_resched();
+ local_bh_disable();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * This is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void __sched io_schedule(void)
+{
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
+
+ delayacct_blkio_start();
+ atomic_inc(&rq->nr_iowait);
+ schedule();
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+}
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
+ long ret;
+
+ delayacct_blkio_start();
+ atomic_inc(&rq->nr_iowait);
+ ret = schedule_timeout(timeout);
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_USER_RT_PRIO-1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct task_struct *p;
+ unsigned int time_slice;
+ int retval;
+ struct timespec t;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ /*
+ * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
+ * tasks that are on an otherwise idle runqueue:
+ */
+ time_slice = 0;
+ if (p->policy == SCHED_RR) {
+ time_slice = DEF_TIMESLICE;
+ } else if (p->policy != SCHED_FIFO) {
+ struct sched_entity *se = &p->se;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ if (rq->cfs.load.weight)
+ time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ task_rq_unlock(rq, &flags);
+ }
+ read_unlock(&tasklist_lock);
+ jiffies_to_timespec(time_slice, &t);
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+ return retval;
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
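+
+/*
+ * Userspace example (illustrative): query the timeslice of the calling
+ * thread; pid 0 means "current" here (see find_process_by_pid()).
+ *
+ * struct timespec ts;
+ * if (sched_rr_get_interval(0, &ts) == 0)
+ * printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
+ *
+ * A SCHED_RR task reports DEF_TIMESLICE, a SCHED_FIFO task reports 0, and a
+ * CFS task reports its current sched_slice() (or 0 on an otherwise idle
+ * runqueue).
+ */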
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+ unsigned long free = 0;
+ unsigned state;
+
+ state = p->state ? __ffs(p->state) + 1 : 0;
+ printk(KERN_INFO "%-13.13s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running ");
+ else
+ printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running task ");
+ else
+ printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ {
+ unsigned long *n = end_of_stack(p);
+ while (!*n)
+ n++;
+ free = (unsigned long)n - (unsigned long)end_of_stack(p);
+ }
+#endif
+ printk(KERN_CONT "%5lu %5d %6d\n", free,
+ task_pid_nr(p), task_pid_nr(p->real_parent));
+
+ show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#else
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#endif
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take a lot of time:
+ */
+ touch_nmi_watchdog();
+ if (!state_filter || (p->state & state_filter))
+ sched_show_task(p);
+ } while_each_thread(g, p);
+
+ touch_all_softlockup_watchdogs();
+
+#ifdef CONFIG_SCHED_DEBUG
+ sysrq_sched_debug_show();
+#endif
+ read_unlock(&tasklist_lock);
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (state_filter == -1)
+ debug_show_all_locks();
+}
+
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{
+ idle->sched_class = &idle_sched_class;
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+
+ __sched_fork(idle);
+ idle->se.exec_start = sched_clock();
+
+ idle->prio = idle->normal_prio = MAX_PRIO;
+ idle->cpus_allowed = cpumask_of_cpu(cpu);
+ __set_task_cpu(idle, cpu);
+
+ rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ idle->oncpu = 1;
+#endif
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+#if defined(CONFIG_PREEMPT)
+ task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
+#else
+ task_thread_info(idle)->preempt_count = 0;
+#endif
+ /*
+ * The idle tasks have their own, simple scheduling class:
+ */
+ idle->sched_class = &idle_sched_class;
+}
+
+/*
+ * In a system that switches off the HZ timer nohz_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For system
+ * which do not switch off the HZ timer nohz_cpu_mask should
+ * always be CPU_MASK_NONE.
+ */
+cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static inline void sched_init_granularity(void)
+{
+ unsigned int factor = 1 + ilog2(num_online_cpus());
+ const unsigned long limit = 200000000;
+
+ sysctl_sched_min_granularity *= factor;
+ if (sysctl_sched_min_granularity > limit)
+ sysctl_sched_min_granularity = limit;
+
+ sysctl_sched_latency *= factor;
+ if (sysctl_sched_latency > limit)
+ sysctl_sched_latency = limit;
+
+ sysctl_sched_wakeup_granularity *= factor;
+
+ sysctl_sched_shares_ratelimit *= factor;
+}
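+
+/*
+ * Worked example of the scaling above: with 8 online CPUs,
+ * factor = 1 + ilog2(8) = 4, so each tunable is multiplied by 4 and
+ * sysctl_sched_min_granularity/sysctl_sched_latency are then clamped to the
+ * 200000000 ns (200 ms) limit. With a single CPU, factor = 1 and the
+ * defaults are left unchanged.
+ */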
+
+#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we queue a struct migration_req structure (sketched below) in the
+ * source CPU's runqueue and wake up that CPU's migration thread.
+ * 2) we wait on the request's completion => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ * thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ * task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 6) migration thread completes the request.
+ * 7) we wake up and the migration is done.
+ */
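+
+/*
+ * For reference, the request handed between the two sides is a
+ * struct migration_req; it is defined earlier in this file and is sketched
+ * here from the way it is used below:
+ *
+ * struct migration_req {
+ * struct list_head list; (link on rq->migration_queue)
+ * struct task_struct *task; (the task to move)
+ * int dest_cpu; (where to move it)
+ * struct completion done; (completed by the migration thread)
+ * };
+ */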
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+{
+ struct migration_req req;
+ unsigned long flags;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+ if (!cpus_intersects(*new_mask, cpu_online_map)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+ !cpus_equal(p->cpus_allowed, *new_mask))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ p->cpus_allowed = *new_mask;
+ p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+ }
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpu_isset(task_cpu(p), *new_mask))
+ goto out;
+
+ if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, &flags);
+ wake_up_process(rq->migration_thread);
+ wait_for_completion(&req.done);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ }
+out:
+ task_rq_unlock(rq, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ struct rq *rq_dest, *rq_src;
+ int ret = 0, on_rq;
+
+ if (unlikely(!cpu_active(dest_cpu)))
+ return ret;
+
+ rq_src = cpu_rq(src_cpu);
+ rq_dest = cpu_rq(dest_cpu);
+
+ double_rq_lock(rq_src, rq_dest);
+ /* Already moved. */
+ if (task_cpu(p) != src_cpu)
+ goto done;
+ /* Affinity changed (again). */
+ if (!cpu_isset(dest_cpu, p->cpus_allowed))
+ goto fail;
+
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ deactivate_task(rq_src, p, 0);
+
+ set_task_cpu(p, dest_cpu);
+ if (on_rq) {
+ activate_task(rq_dest, p, 0);
+ check_preempt_curr(rq_dest, p, 0);
+ }
+done:
+ ret = 1;
+fail:
+ double_rq_unlock(rq_src, rq_dest);
+ return ret;
+}
+
+/*
+ * migration_thread - this is a highprio system thread that performs
+ * thread migration by bumping a thread off its CPU and then 'pushing' it
+ * onto another runqueue.
+ */
+static int migration_thread(void *data)
+{
+ int cpu = (long)data;
+ struct rq *rq;
+
+ rq = cpu_rq(cpu);
+ BUG_ON(rq->migration_thread != current);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ struct migration_req *req;
+ struct list_head *head;
+
+ spin_lock_irq(&rq->lock);
+
+ if (cpu_is_offline(cpu)) {
+ spin_unlock_irq(&rq->lock);
+ goto wait_to_die;
+ }
+
+ if (rq->active_balance) {
+ active_load_balance(rq, cpu);
+ rq->active_balance = 0;
+ }
+
+ head = &rq->migration_queue;
+
+ if (list_empty(head)) {
+ spin_unlock_irq(&rq->lock);
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ continue;
+ }
+ req = list_entry(head->next, struct migration_req, list);
+ list_del_init(head->next);
+
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ local_irq_enable();
+
+ complete(&req->done);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+
+wait_to_die:
+ /* Wait for kthread_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ int ret;
+
+ local_irq_disable();
+ ret = __migrate_task(p, src_cpu, dest_cpu);
+ local_irq_enable();
+ return ret;
+}
+
+/*
+ * Figure out where task on dead CPU should go, use force if necessary.
+ * NOTE: interrupts should be disabled by the caller
+ */
+static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+{
+ unsigned long flags;
+ cpumask_t mask;
+ struct rq *rq;
+ int dest_cpu;
+
+ do {
+ /* On same node? */
+ mask = node_to_cpumask(cpu_to_node(dead_cpu));
+ cpus_and(mask, mask, p->cpus_allowed);
+ dest_cpu = any_online_cpu(mask);
+
+ /* On any allowed CPU? */
+ if (dest_cpu >= nr_cpu_ids)
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+
+ /* No more Mr. Nice Guy. */
+ if (dest_cpu >= nr_cpu_ids) {
+ cpumask_t cpus_allowed;
+
+ cpuset_cpus_allowed_locked(p, &cpus_allowed);
+ /*
+ * Try to stay on the same cpuset, where the
+ * current cpuset may be a subset of all cpus.
+ * The cpuset_cpus_allowed_locked() variant of
+ * cpuset_cpus_allowed() will not block. It must be
+ * called within calls to cpuset_lock/cpuset_unlock.
+ */
+ rq = task_rq_lock(p, &flags);
+ p->cpus_allowed = cpus_allowed;
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+ task_rq_unlock(rq, &flags);
+
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave the kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, dead_cpu);
+ }
+ }
+ } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+}
+
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not strictly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(struct rq *rq_src)
+{
+ struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ double_rq_lock(rq_src, rq_dest);
+ rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+ rq_src->nr_uninterruptible = 0;
+ double_rq_unlock(rq_src, rq_dest);
+ local_irq_restore(flags);
+}
+
+/* Run through task list and migrate tasks from the dead cpu. */
+static void migrate_live_tasks(int src_cpu)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+
+ do_each_thread(t, p) {
+ if (p == current)
+ continue;
+
+ if (task_cpu(p) == src_cpu)
+ move_task_off_dead_cpu(src_cpu, p);
+ } while_each_thread(t, p);
+
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Schedules the idle task to be the next runnable task on the current CPU.
+ * It does so by boosting its priority to the highest possible.
+ * Used by the CPU offline code.
+ */
+void sched_idle_next(void)
+{
+ int this_cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(this_cpu);
+ struct task_struct *p = rq->idle;
+ unsigned long flags;
+
+ /* cpu has to be offline */
+ BUG_ON(cpu_online(this_cpu));
+
+ /*
+ * Strictly not necessary since the rest of the CPUs are stopped by now
+ * and interrupts are disabled on the current cpu.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+
+ __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
+
+ update_rq_clock(rq);
+ activate_task(rq, p, 0);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ BUG_ON(cpu_online(smp_processor_id()));
+
+ if (mm != &init_mm)
+ switch_mm(mm, &init_mm, current);
+ mmdrop(mm);
+}
+
+/* called under rq->lock with disabled interrupts */
+static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+{
+ struct rq *rq = cpu_rq(dead_cpu);
+
+ /* Must be exiting, otherwise would be on tasklist. */
+ BUG_ON(!p->exit_state);
+
+ /* Cannot have done final schedule yet: would have vanished. */
+ BUG_ON(p->state == TASK_DEAD);
+
+ get_task_struct(p);
+
+ /*
+ * Drop lock around migration; if someone else moves it,
+ * that's OK. No task can be added to this CPU, so iteration is
+ * fine.
+ */
+ spin_unlock_irq(&rq->lock);
+ move_task_off_dead_cpu(dead_cpu, p);
+ spin_lock_irq(&rq->lock);
+
+ put_task_struct(p);
+}
+
+/* release_task() removes task from tasklist, so we won't find dead tasks. */
+static void migrate_dead_tasks(unsigned int dead_cpu)
+{
+ struct rq *rq = cpu_rq(dead_cpu);
+ struct task_struct *next;
+
+ for ( ; ; ) {
+ if (!rq->nr_running)
+ break;
+ update_rq_clock(rq);
+ next = pick_next_task(rq, rq->curr);
+ if (!next)
+ break;
+ next->sched_class->put_prev_task(rq, next);
+ migrate_dead(dead_cpu, next);
+
+ }
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {0, },
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {0, },
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ mode_t mode, proc_handler *proc_handler)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(13);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[11], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[12] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_online_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_online_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+ if (sd_sysctl_header)
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
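+
+/*
+ * The tables built above end up under /proc/sys/kernel/sched_domain/, one
+ * directory per online CPU and per domain level, e.g.:
+ *
+ * /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
+ * /proc/sys/kernel/sched_domain/cpu0/domain0/flags
+ * /proc/sys/kernel/sched_domain/cpu0/domain1/name
+ *
+ * (illustrative paths; the exact set depends on the machine's topology and
+ * on CONFIG_SCHED_DEBUG and CONFIG_SYSCTL being enabled)
+ */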
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+static void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ const struct sched_class *class;
+
+ cpu_set(rq->cpu, rq->rd->online);
+ rq->online = 1;
+
+ for_each_class(class) {
+ if (class->rq_online)
+ class->rq_online(rq);
+ }
+ }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ const struct sched_class *class;
+
+ for_each_class(class) {
+ if (class->rq_offline)
+ class->rq_offline(rq);
+ }
+
+ cpu_clear(rq->cpu, rq->rd->online);
+ rq->online = 0;
+ }
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int __cpuinit
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ struct task_struct *p;
+ int cpu = (long)hcpu;
+ unsigned long flags;
+ struct rq *rq;
+
+ switch (action) {
+
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
+ if (IS_ERR(p))
+ return NOTIFY_BAD;
+ kthread_bind(p, cpu);
+ /* Must be high prio: stop_machine expects to yield to it. */
+ rq = task_rq_lock(p, &flags);
+ __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
+ task_rq_unlock(rq, &flags);
+ cpu_rq(cpu)->migration_thread = p;
+ break;
+
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ /* Strictly unnecessary, as first user will wake it. */
+ wake_up_process(cpu_rq(cpu)->migration_thread);
+
+ /* Update our root-domain */
+ rq = cpu_rq(cpu);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpu_isset(cpu, rq->rd->span));
+
+ set_rq_online(rq);
+ }
+ spin_unlock_irqrestore(&rq->lock, flags);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ if (!cpu_rq(cpu)->migration_thread)
+ break;
+ /* Unbind it from offline cpu so it can run. Fall thru. */
+ kthread_bind(cpu_rq(cpu)->migration_thread,
+ any_online_cpu(cpu_online_map));
+ kthread_stop(cpu_rq(cpu)->migration_thread);
+ cpu_rq(cpu)->migration_thread = NULL;
+ break;
+
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
+ migrate_live_tasks(cpu);
+ rq = cpu_rq(cpu);
+ kthread_stop(rq->migration_thread);
+ rq->migration_thread = NULL;
+ /* Idle task back to normal (off runqueue, low prio) */
+ spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ deactivate_task(rq, rq->idle, 0);
+ rq->idle->static_prio = MAX_PRIO;
+ __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
+ rq->idle->sched_class = &idle_sched_class;
+ migrate_dead_tasks(cpu);
+ spin_unlock_irq(&rq->lock);
+ cpuset_unlock();
+ migrate_nr_uninterruptible(rq);
+ BUG_ON(rq->nr_running != 0);
+
+ /*
+ * No need to migrate the tasks: it was best-effort if
+ * they didn't take sched_hotcpu_mutex. Just wake up
+ * the requestors.
+ */
+ spin_lock_irq(&rq->lock);
+ while (!list_empty(&rq->migration_queue)) {
+ struct migration_req *req;
+
+ req = list_entry(rq->migration_queue.next,
+ struct migration_req, list);
+ list_del_init(&req->list);
+ spin_unlock_irq(&rq->lock);
+ complete(&req->done);
+ spin_lock_irq(&rq->lock);
+ }
+ spin_unlock_irq(&rq->lock);
+ break;
+
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ /* Update our root-domain */
+ rq = cpu_rq(cpu);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ spin_unlock_irqrestore(&rq->lock, flags);
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_live_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block __cpuinitdata migration_notifier = {
+ .notifier_call = migration_call,
+ .priority = 10
+};
+
+static int __init migration_init(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int err;
+
+ /* Start one for the boot CPU: */
+ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
+ migration_call(&migration_notifier, CPU_ONLINE, cpu);
+ register_cpu_notifier(&migration_notifier);
+
+ return err;
+}
+early_initcall(migration_init);
+#endif
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+{
+ switch (lvl) {
+ case SD_LV_NONE:
+ return "NONE";
+ case SD_LV_SIBLING:
+ return "SIBLING";
+ case SD_LV_MC:
+ return "MC";
+ case SD_LV_CPU:
+ return "CPU";
+ case SD_LV_NODE:
+ return "NODE";
+ case SD_LV_ALLNODES:
+ return "ALLNODES";
+ case SD_LV_MAX:
+ return "MAX";
+
+ }
+ return "MAX";
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ cpumask_t *groupmask)
+{
+ struct sched_group *group = sd->groups;
+ char str[256];
+
+ cpulist_scnprintf(str, sizeof(str), sd->span);
+ cpus_clear(*groupmask);
+
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
+ return -1;
+ }
+
+ printk(KERN_CONT "span %s level %s\n",
+ str, sd_level_to_string(sd->level));
+
+ if (!cpu_isset(cpu, sd->span)) {
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
+ }
+ if (!cpu_isset(cpu, group->cpumask)) {
+ printk(KERN_ERR "ERROR: domain->groups does not contain"
+ " CPU%d\n", cpu);
+ }
+
+ printk(KERN_DEBUG "%*s groups:", level + 1, "");
+ do {
+ if (!group) {
+ printk("\n");
+ printk(KERN_ERR "ERROR: group is NULL\n");
+ break;
+ }
+
+ if (!group->__cpu_power) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: domain->cpu_power not "
+ "set\n");
+ break;
+ }
+
+ if (!cpus_weight(group->cpumask)) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: empty group\n");
+ break;
+ }
+
+ if (cpus_intersects(*groupmask, group->cpumask)) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: repeated CPUs\n");
+ break;
+ }
+
+ cpus_or(*groupmask, *groupmask, group->cpumask);
+
+ cpulist_scnprintf(str, sizeof(str), group->cpumask);
+ printk(KERN_CONT " %s", str);
+
+ group = group->next;
+ } while (group != sd->groups);
+ printk(KERN_CONT "\n");
+
+ if (!cpus_equal(sd->span, *groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ cpumask_t *groupmask;
+ int level = 0;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+ groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ if (!groupmask) {
+ printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
+ return;
+ }
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+ kfree(groupmask);
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpus_weight(sd->span) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUPOWER |
+ SD_SHARE_PKG_RESOURCES)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_IDLE |
+ SD_WAKE_AFFINE |
+ SD_WAKE_BALANCE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpus_equal(sd->span, parent->span))
+ return 0;
+
+ /* Does parent contain flags not in child? */
+ /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+ if (cflags & SD_WAKE_AFFINE)
+ pflags &= ~SD_WAKE_BALANCE;
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next) {
+ pflags &= ~(SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUPOWER |
+ SD_SHARE_PKG_RESOURCES);
+ }
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ struct root_domain *old_rd = rq->rd;
+
+ if (cpu_isset(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpu_clear(rq->cpu, old_rd->span);
+
+ if (atomic_dec_and_test(&old_rd->refcount))
+ kfree(old_rd);
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpu_set(rq->cpu, rd->span);
+ if (cpu_isset(rq->cpu, cpu_online_map))
+ set_rq_online(rq);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ cpus_clear(rd->span);
+ cpus_clear(rd->online);
+
+ cpupri_init(&rd->cpupri);
+}
+
+static void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ init_rootdomain(rd);
+
+ return rd;
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ sd = sd->parent;
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ rcu_assign_pointer(rq->sd, sd);
+}
+
+/* cpus with isolated domains */
+static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+ static int __initdata ints[NR_CPUS];
+ int i;
+
+ str = get_options(str, ARRAY_SIZE(ints), ints);
+ cpus_clear(cpu_isolated_map);
+ for (i = 1; i <= ints[0]; i++)
+ if (ints[i] < NR_CPUS)
+ cpu_set(ints[i], cpu_isolated_map);
+ return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
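+
+/*
+ * Example: booting with "isolcpus=2,3" puts CPUs 2 and 3 into
+ * cpu_isolated_map, so arch_init_sched_domains() below leaves them out of
+ * the scheduler domains; they then only run tasks explicitly bound to them
+ * (e.g. via sched_setaffinity()).
+ */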
+
+/*
+ * init_sched_build_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group (along with its sched group) a
+ * CPU belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
+ * (due to the fact that we keep track of groups covered with a cpumask_t).
+ *
+ * init_sched_build_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ */
+static void
+init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
+ int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg,
+ cpumask_t *tmpmask),
+ cpumask_t *covered, cpumask_t *tmpmask)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ int i;
+
+ cpus_clear(*covered);
+
+ for_each_cpu_mask_nr(i, *span) {
+ struct sched_group *sg;
+ int group = group_fn(i, cpu_map, &sg, tmpmask);
+ int j;
+
+ if (cpu_isset(i, *covered))
+ continue;
+
+ cpus_clear(sg->cpumask);
+ sg->__cpu_power = 0;
+
+ for_each_cpu_mask_nr(j, *span) {
+ if (group_fn(j, cpu_map, NULL, tmpmask) != group)
+ continue;
+
+ cpu_set(j, *covered);
+ cpu_set(j, sg->cpumask);
+ }
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ last->next = first;
+}
+
+#define SD_NODES_PER_DOMAIN 16
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, nodemask_t *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ /* Start at @node */
+ n = (node + i) % nr_node_ids;
+
+ if (!nr_cpus_node(n))
+ continue;
+
+ /* Skip already used nodes */
+ if (node_isset(n, *used_nodes))
+ continue;
+
+ /* Simple min distance search */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ node_set(best_node, *used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @span: resulting cpumask
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static void sched_domain_node_span(int node, cpumask_t *span)
+{
+ nodemask_t used_nodes;
+ node_to_cpumask_ptr(nodemask, node);
+ int i;
+
+ cpus_clear(*span);
+ nodes_clear(used_nodes);
+
+ cpus_or(*span, *span, *nodemask);
+ node_set(node, used_nodes);
+
+ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+ int next_node = find_next_best_node(node, &used_nodes);
+
+ node_to_cpumask_ptr_next(nodemask, next_node);
+ cpus_or(*span, *span, *nodemask);
+ }
+}
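+
+/*
+ * Illustrative example of the span construction above: on a hypothetical
+ * 4-node machine where node 0's distance to nodes 1, 2 and 3 increases in
+ * that order, sched_domain_node_span(0, ...) starts with node 0's CPUs and
+ * then greedily adds the CPUs of the nearest not-yet-used node (1, then 2,
+ * ...), stopping once SD_NODES_PER_DOMAIN nodes have been included.
+ */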
+#endif /* CONFIG_NUMA */
+
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
+/*
+ * SMT sched-domains:
+ */
+#ifdef CONFIG_SCHED_SMT
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+
+static int
+cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
+ cpumask_t *unused)
+{
+ if (sg)
+ *sg = &per_cpu(sched_group_cpus, cpu);
+ return cpu;
+}
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * multi-core sched-domains:
+ */
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+#endif /* CONFIG_SCHED_MC */
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int
+cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
+ cpumask_t *mask)
+{
+ int group;
+
+ *mask = per_cpu(cpu_sibling_map, cpu);
+ cpus_and(*mask, *mask, *cpu_map);
+ group = first_cpu(*mask);
+ if (sg)
+ *sg = &per_cpu(sched_group_core, group);
+ return group;
+}
+#elif defined(CONFIG_SCHED_MC)
+static int
+cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
+ cpumask_t *unused)
+{
+ if (sg)
+ *sg = &per_cpu(sched_group_core, cpu);
+ return cpu;
+}
+#endif
+
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+
+static int
+cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
+ cpumask_t *mask)
+{
+ int group;
+#ifdef CONFIG_SCHED_MC
+ *mask = cpu_coregroup_map(cpu);
+ cpus_and(*mask, *mask, *cpu_map);
+ group = first_cpu(*mask);
+#elif defined(CONFIG_SCHED_SMT)
+ *mask = per_cpu(cpu_sibling_map, cpu);
+ cpus_and(*mask, *mask, *cpu_map);
+ group = first_cpu(*mask);
+#else
+ group = cpu;
+#endif
+ if (sg)
+ *sg = &per_cpu(sched_group_phys, group);
+ return group;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
+ */
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group ***sched_group_nodes_bycpu;
+
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg, cpumask_t *nodemask)
+{
+ int group;
+
+ *nodemask = node_to_cpumask(cpu_to_node(cpu));
+ cpus_and(*nodemask, *nodemask, *cpu_map);
+ group = first_cpu(*nodemask);
+
+ if (sg)
+ *sg = &per_cpu(sched_group_allnodes, group);
+ return group;
+}
+
+static void init_numa_sched_groups_power(struct sched_group *group_head)
+{
+ struct sched_group *sg = group_head;
+ int j;
+
+ if (!sg)
+ return;
+ do {
+ for_each_cpu_mask_nr(j, sg->cpumask) {
+ struct sched_domain *sd;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+ }
+ sg = sg->next;
+ } while (sg != group_head);
+}
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_NUMA
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+{
+ int cpu, i;
+
+ for_each_cpu_mask_nr(cpu, *cpu_map) {
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ *nodemask = node_to_cpumask(i);
+ cpus_and(*nodemask, *nodemask, *cpu_map);
+ if (cpus_empty(*nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
+}
+#else /* !CONFIG_NUMA */
+static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+{
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of a sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be the same
+ * unless there are asymmetries in the topology. If there are asymmetries, a
+ * group having more cpu_power will pick up more load than a group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+ struct sched_domain *child;
+ struct sched_group *group;
+
+ WARN_ON(!sd || !sd->groups);
+
+ if (cpu != first_cpu(sd->groups->cpumask))
+ return;
+
+ child = sd->child;
+
+ sd->groups->__cpu_power = 0;
+
+ /*
+ * For the perf policy, if the groups in the child domain share resources
+ * (for example cores sharing some portion of the cache hierarchy,
+ * or SMT), then set this domain's group cpu_power such that each group
+ * can handle only one task when there are other idle groups in the
+ * same sched domain.
+ */
+ if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+ (child->flags &
+ (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+ return;
+ }
+
+ /*
+ * add cpu_power of each child group to this group's cpu_power
+ */
+ group = child->groups;
+ do {
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
+ group = group->next;
+ } while (group != child->groups);
+}
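+
+/*
+ * Worked example for the power setup above, under the default "performance"
+ * policy (SD_POWERSAVINGS_BALANCE clear): the lowest-level domain, and any
+ * domain whose child level shares CPU power or package resources (SMT
+ * siblings, or cores sharing a cache), gets exactly SCHED_LOAD_SCALE per
+ * group, i.e. it is sized for one task while other groups in the domain are
+ * idle. A higher-level domain without such sharing in its child instead
+ * sums the cpu_power of the child groups.
+ */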
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type) sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type) do { } while (0)
+#endif
+
+#define SD_INIT(sd, type) sd_init_##type(sd)
+
+#define SD_INIT_FUNC(type) \
+static noinline void sd_init_##type(struct sched_domain *sd) \
+{ \
+ memset(sd, 0, sizeof(*sd)); \
+ *sd = SD_##type##_INIT; \
+ sd->level = SD_LV_##type; \
+ SD_INIT_NAME(sd, type); \
+}
+
+SD_INIT_FUNC(CPU)
+#ifdef CONFIG_NUMA
+ SD_INIT_FUNC(ALLNODES)
+ SD_INIT_FUNC(NODE)
+#endif
+#ifdef CONFIG_SCHED_SMT
+ SD_INIT_FUNC(SIBLING)
+#endif
+#ifdef CONFIG_SCHED_MC
+ SD_INIT_FUNC(MC)
+#endif
+
+/*
+ * To minimize stack usage, kmalloc room for cpumasks and share the
+ * space as the usage in build_sched_domains() dictates. Used only
+ * if the amount of space is significant.
+ */
+struct allmasks {
+ cpumask_t tmpmask; /* make this one first */
+ union {
+ cpumask_t nodemask;
+ cpumask_t this_sibling_map;
+ cpumask_t this_core_map;
+ };
+ cpumask_t send_covered;
+
+#ifdef CONFIG_NUMA
+ cpumask_t domainspan;
+ cpumask_t covered;
+ cpumask_t notcovered;
+#endif
+};
+
+#if NR_CPUS > 128
+#define SCHED_CPUMASK_ALLOC 1
+#define SCHED_CPUMASK_FREE(v) kfree(v)
+#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
+#else
+#define SCHED_CPUMASK_ALLOC 0
+#define SCHED_CPUMASK_FREE(v)
+#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
+#endif
+
+#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
+ ((unsigned long)(a) + offsetof(struct allmasks, v))
+
+static int default_relax_domain_level = -1;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ unsigned long val;
+
+ val = simple_strtoul(str, NULL, 0);
+ if (val < SD_LV_MAX)
+ default_relax_domain_level = val;
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ else
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+ if (request < sd->level) {
+ /* turn off idle balance on this domain */
+ sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+ } else {
+ /* turn on idle balance on this domain */
+ sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+ }
+}
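+
+/*
+ * Example (assuming the enum ordering shown in sd_level_to_string() above,
+ * i.e. NONE=0, SIBLING=1, MC=2, CPU=3, ...): booting with
+ * "relax_domain_level=2" keeps newidle/wake-idle balancing for the SIBLING
+ * and MC domains (level <= 2) but clears those flags for the CPU, NODE and
+ * ALLNODES domains, limiting how far an idle CPU searches for work.
+ */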
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int __build_sched_domains(const cpumask_t *cpu_map,
+ struct sched_domain_attr *attr)
+{
+ int i;
+ struct root_domain *rd;
+ SCHED_CPUMASK_DECLARE(allmasks);
+ cpumask_t *tmpmask;
+#ifdef CONFIG_NUMA
+ struct sched_group **sched_group_nodes = NULL;
+ int sd_allnodes = 0;
+
+ /*
+ * Allocate the per-node list of sched groups
+ */
+ sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
+ GFP_KERNEL);
+ if (!sched_group_nodes) {
+ printk(KERN_WARNING "Can not alloc sched group node list\n");
+ return -ENOMEM;
+ }
+#endif
+
+ rd = alloc_rootdomain();
+ if (!rd) {
+ printk(KERN_WARNING "Cannot alloc root domain\n");
+#ifdef CONFIG_NUMA
+ kfree(sched_group_nodes);
+#endif
+ return -ENOMEM;
+ }
+
+#if SCHED_CPUMASK_ALLOC
+ /* get space for all scratch cpumask variables */
+ allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+ if (!allmasks) {
+ printk(KERN_WARNING "Cannot alloc cpumask array\n");
+ kfree(rd);
+#ifdef CONFIG_NUMA
+ kfree(sched_group_nodes);
+#endif
+ return -ENOMEM;
+ }
+#endif
+ tmpmask = (cpumask_t *)allmasks;
+
+
+#ifdef CONFIG_NUMA
+ sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
+
+ /*
+ * Set up domains for cpus specified by the cpu_map.
+ */
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ struct sched_domain *sd = NULL, *p;
+ SCHED_CPUMASK_VAR(nodemask, allmasks);
+
+ *nodemask = node_to_cpumask(cpu_to_node(i));
+ cpus_and(*nodemask, *nodemask, *cpu_map);
+
+#ifdef CONFIG_NUMA
+ if (cpus_weight(*cpu_map) >
+ SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+ sd = &per_cpu(allnodes_domains, i);
+ SD_INIT(sd, ALLNODES);
+ set_domain_attribute(sd, attr);
+ sd->span = *cpu_map;
+ cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
+ p = sd;
+ sd_allnodes = 1;
+ } else
+ p = NULL;
+
+ sd = &per_cpu(node_domains, i);
+ SD_INIT(sd, NODE);
+ set_domain_attribute(sd, attr);
+ sched_domain_node_span(cpu_to_node(i), &sd->span);
+ sd->parent = p;
+ if (p)
+ p->child = sd;
+ cpus_and(sd->span, sd->span, *cpu_map);
+#endif
+
+ p = sd;
+ sd = &per_cpu(phys_domains, i);
+ SD_INIT(sd, CPU);
+ set_domain_attribute(sd, attr);
+ sd->span = *nodemask;
+ sd->parent = p;
+ if (p)
+ p->child = sd;
+ cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+
+#ifdef CONFIG_SCHED_MC
+ p = sd;
+ sd = &per_cpu(core_domains, i);
+ SD_INIT(sd, MC);
+ set_domain_attribute(sd, attr);
+ sd->span = cpu_coregroup_map(i);
+ cpus_and(sd->span, sd->span, *cpu_map);
+ sd->parent = p;
+ p->child = sd;
+ cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+ p = sd;
+ sd = &per_cpu(cpu_domains, i);
+ SD_INIT(sd, SIBLING);
+ set_domain_attribute(sd, attr);
+ sd->span = per_cpu(cpu_sibling_map, i);
+ cpus_and(sd->span, sd->span, *cpu_map);
+ sd->parent = p;
+ p->child = sd;
+ cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+#endif
+ }
+
+#ifdef CONFIG_SCHED_SMT
+ /* Set up CPU (sibling) groups */
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
+ SCHED_CPUMASK_VAR(send_covered, allmasks);
+
+ *this_sibling_map = per_cpu(cpu_sibling_map, i);
+ cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
+ if (i != first_cpu(*this_sibling_map))
+ continue;
+
+ init_sched_build_groups(this_sibling_map, cpu_map,
+ &cpu_to_cpu_group,
+ send_covered, tmpmask);
+ }
+#endif
+
+#ifdef CONFIG_SCHED_MC
+ /* Set up multi-core groups */
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ SCHED_CPUMASK_VAR(this_core_map, allmasks);
+ SCHED_CPUMASK_VAR(send_covered, allmasks);
+
+ *this_core_map = cpu_coregroup_map(i);
+ cpus_and(*this_core_map, *this_core_map, *cpu_map);
+ if (i != first_cpu(*this_core_map))
+ continue;
+
+ init_sched_build_groups(this_core_map, cpu_map,
+ &cpu_to_core_group,
+ send_covered, tmpmask);
+ }
+#endif
+
+ /* Set up physical groups */
+ for (i = 0; i < nr_node_ids; i++) {
+ SCHED_CPUMASK_VAR(nodemask, allmasks);
+ SCHED_CPUMASK_VAR(send_covered, allmasks);
+
+ *nodemask = node_to_cpumask(i);
+ cpus_and(*nodemask, *nodemask, *cpu_map);
+ if (cpus_empty(*nodemask))
+ continue;
+
+ init_sched_build_groups(nodemask, cpu_map,
+ &cpu_to_phys_group,
+ send_covered, tmpmask);
+ }
+
+#ifdef CONFIG_NUMA
+ /* Set up node groups */
+ if (sd_allnodes) {
+ SCHED_CPUMASK_VAR(send_covered, allmasks);
+
+ init_sched_build_groups(cpu_map, cpu_map,
+ &cpu_to_allnodes_group,
+ send_covered, tmpmask);
+ }
+
+ for (i = 0; i < nr_node_ids; i++) {
+ /* Set up node groups */
+ struct sched_group *sg, *prev;
+ SCHED_CPUMASK_VAR(nodemask, allmasks);
+ SCHED_CPUMASK_VAR(domainspan, allmasks);
+ SCHED_CPUMASK_VAR(covered, allmasks);
+ int j;
+
+ *nodemask = node_to_cpumask(i);
+ cpus_clear(*covered);
+
+ cpus_and(*nodemask, *nodemask, *cpu_map);
+ if (cpus_empty(*nodemask)) {
+ sched_group_nodes[i] = NULL;
+ continue;
+ }
+
+ sched_domain_node_span(i, domainspan);
+ cpus_and(*domainspan, *domainspan, *cpu_map);
+
+ sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+ if (!sg) {
+ printk(KERN_WARNING "Can not alloc domain group for "
+ "node %d\n", i);
+ goto error;
+ }
+ sched_group_nodes[i] = sg;
+ for_each_cpu_mask_nr(j, *nodemask) {
+ struct sched_domain *sd;
+
+ sd = &per_cpu(node_domains, j);
+ sd->groups = sg;
+ }
+ sg->__cpu_power = 0;
+ sg->cpumask = *nodemask;
+ sg->next = sg;
+ cpus_or(*covered, *covered, *nodemask);
+ prev = sg;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ SCHED_CPUMASK_VAR(notcovered, allmasks);
+ int n = (i + j) % nr_node_ids;
+ node_to_cpumask_ptr(pnodemask, n);
+
+ cpus_complement(*notcovered, *covered);
+ cpus_and(*tmpmask, *notcovered, *cpu_map);
+ cpus_and(*tmpmask, *tmpmask, *domainspan);
+ if (cpus_empty(*tmpmask))
+ break;
+
+ cpus_and(*tmpmask, *tmpmask, *pnodemask);
+ if (cpus_empty(*tmpmask))
+ continue;
+
+ sg = kmalloc_node(sizeof(struct sched_group),
+ GFP_KERNEL, i);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ goto error;
+ }
+ sg->__cpu_power = 0;
+ sg->cpumask = *tmpmask;
+ sg->next = prev->next;
+ cpus_or(*covered, *covered, *tmpmask);
+ prev->next = sg;
+ prev = sg;
+ }
+ }
+#endif
+
+ /* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ struct sched_domain *sd = &per_cpu(cpu_domains, i);
+
+ init_sched_groups_power(i, sd);
+ }
+#endif
+#ifdef CONFIG_SCHED_MC
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ struct sched_domain *sd = &per_cpu(core_domains, i);
+
+ init_sched_groups_power(i, sd);
+ }
+#endif
+
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ struct sched_domain *sd = &per_cpu(phys_domains, i);
+
+ init_sched_groups_power(i, sd);
+ }
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < nr_node_ids; i++)
+ init_numa_sched_groups_power(sched_group_nodes[i]);
+
+ if (sd_allnodes) {
+ struct sched_group *sg;
+
+ cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+ tmpmask);
+ init_numa_sched_groups_power(sg);
+ }
+#endif
+
+ /* Attach the domains */
+ for_each_cpu_mask_nr(i, *cpu_map) {
+ struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+ sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+ sd = &per_cpu(core_domains, i);
+#else
+ sd = &per_cpu(phys_domains, i);
+#endif
+ cpu_attach_domain(sd, rd, i);
+ }
+
+ SCHED_CPUMASK_FREE((void *)allmasks);
+ return 0;
+
+#ifdef CONFIG_NUMA
+error:
+ free_sched_groups(cpu_map, tmpmask);
+ SCHED_CPUMASK_FREE((void *)allmasks);
+ kfree(rd);
+ return -ENOMEM;
+#endif
+}
+
+static int build_sched_domains(const cpumask_t *cpu_map)
+{
+ return __build_sched_domains(cpu_map, NULL);
+}
+
+static cpumask_t *doms_cur; /* current sched domains */
+static int ndoms_cur; /* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attributes of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask_t) fails, then fall back to a single sched domain,
+ * as determined by the single cpumask_t fallback_doms.
+ */
+static cpumask_t fallback_doms;
+
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
+{
+ int err;
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+ dattr_cur = NULL;
+ err = build_sched_domains(doms_cur);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
+ cpumask_t *tmpmask)
+{
+ free_sched_groups(cpu_map, tmpmask);
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+ cpumask_t tmpmask;
+ int i;
+
+ unregister_sched_domain_sysctl();
+
+ for_each_cpu_mask_nr(i, *cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ synchronize_sched();
+ arch_destroy_sched_domains(cpu_map, &tmpmask);
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* fast path */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * The masks must not intersect (i.e. must not overlap); we set up one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * ownership of it and will kfree it when done with it. If the caller
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fall back to
+ * the single partition 'fallback_doms'; this also forces the domains
+ * to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* always unregister in case we don't destroy any domains */
+ unregister_sched_domain_sysctl();
+
+ n = doms_new ? ndoms_new : 0;
+
+ /* Destroy deleted domains */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n; j++) {
+ if (cpus_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* no match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur + i);
+match1:
+ ;
+ }
+
+ if (doms_new == NULL) {
+ ndoms_cur = 0;
+ doms_new = &fallback_doms;
+ cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+ dattr_new = NULL;
+ }
+
+ /* Build new domains */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < ndoms_cur; j++) {
+ if (cpus_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* no match - add a new doms_new */
+ __build_sched_domains(doms_new + i,
+ dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains */
+ if (doms_cur != &fallback_doms)
+ kfree(doms_cur);
+ kfree(dattr_cur); /* kfree(NULL) is safe */
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
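+
+/*
+ * Illustrative caller sketch for partition_sched_domains() (the masks here
+ * are hypothetical; the real callers are the cpuset code and the hotplug
+ * notifiers below):
+ *
+ * cpumask_t *doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);
+ * (fill doms[0] and doms[1] with two non-overlapping CPU masks)
+ * get_online_cpus();
+ * partition_sched_domains(2, doms, NULL); (takes ownership of 'doms')
+ * put_online_cpus();
+ */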
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+ get_online_cpus();
+
+ /* Destroy domains first to force the rebuild */
+ partition_sched_domains(0, NULL, NULL);
+
+ rebuild_sched_domains();
+ put_online_cpus();
+
+ return 0;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+ int ret;
+
+ if (buf[0] != '0' && buf[0] != '1')
+ return -EINVAL;
+
+ if (smt)
+ sched_smt_power_savings = (buf[0] == '1');
+ else
+ sched_mc_power_savings = (buf[0] == '1');
+
+ ret = arch_reinit_sched_domains();
+
+ return ret ? ret : count;
+}
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ char *page)
+{
+ return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+ const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 0);
+}
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+ sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ char *page)
+{
+ return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+ const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 1);
+}
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+ sched_smt_power_savings_show,
+ sched_smt_power_savings_store);
+#endif
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+ int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+ if (smt_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (!err && mc_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_mc_power_savings.attr);
+#endif
+ return err;
+}
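+
+/*
+ * The attributes registered above live on the cpu sysdev class, so on a
+ * typical sysfs layout they appear as (illustrative paths):
+ *
+ * /sys/devices/system/cpu/sched_mc_power_savings
+ * /sys/devices/system/cpu/sched_smt_power_savings
+ *
+ * Writing '1' enables the respective power-savings balancing and '0'
+ * disables it; any other value returns -EINVAL, and each accepted write
+ * rebuilds the domains via arch_reinit_sched_domains().
+ */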
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+#ifndef CONFIG_CPUSETS
+/*
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
+ */
+static int update_sched_domains(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ partition_sched_domains(1, NULL, NULL);
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (int)(long)hcpu;
+
+ switch (action) {
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ disable_runtime(cpu_rq(cpu));
+ return NOTIFY_OK;
+
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ enable_runtime(cpu_rq(cpu));
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+void __init sched_init_smp(void)
+{
+ cpumask_t non_isolated_cpus;
+
+#if defined(CONFIG_NUMA)
+ sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
+ GFP_KERNEL);
+ BUG_ON(sched_group_nodes_bycpu == NULL);
+#endif
+ get_online_cpus();
+ mutex_lock(&sched_domains_mutex);
+ arch_init_sched_domains(&cpu_online_map);
+ cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
+ if (cpus_empty(non_isolated_cpus))
+ cpu_set(smp_processor_id(), non_isolated_cpus);
+ mutex_unlock(&sched_domains_mutex);
+ put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
+ /* XXX: Theoretical race here - CPU may be hotplugged now */
+ hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+ /* RT runtime code needs to handle some hotplug events */
+ hotcpu_notifier(update_runtime, 0);
+
+ init_hrtick();
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+ BUG();
+ sched_init_granularity();
+}
+#else
+void __init sched_init_smp(void)
+{
+ sched_init_granularity();
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+ return in_lock_functions(addr) ||
+ (addr >= (unsigned long)__sched_text_start
+ && addr < (unsigned long)__sched_text_end);
+}
+
+static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+{
+ cfs_rq->tasks_timeline = RB_ROOT;
+ INIT_LIST_HEAD(&cfs_rq->tasks);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->rq = rq;
+#endif
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+}
+
+static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
+#ifdef CONFIG_SMP
+ rt_rq->rt_nr_migratory = 0;
+ rt_rq->overloaded = 0;
+#endif
+
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ spin_lock_init(&rt_rq->rt_runtime_lock);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
+#endif
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu, int add,
+ struct sched_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+ tg->cfs_rq[cpu] = cfs_rq;
+ init_cfs_rq(cfs_rq, rq);
+ cfs_rq->tg = tg;
+ if (add)
+ list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+ tg->se[cpu] = se;
+ /* se could be NULL for init_task_group */
+ if (!se)
+ return;
+
+ if (!parent)
+ se->cfs_rq = &rq->cfs;
+ else
+ se->cfs_rq = parent->my_q;
+
+ se->my_q = cfs_rq;
+ se->load.weight = tg->shares;
+ se->load.inv_weight = 0;
+ se->parent = parent;
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu, int add,
+ struct sched_rt_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ tg->rt_rq[cpu] = rt_rq;
+ init_rt_rq(rt_rq, rq);
+ rt_rq->tg = tg;
+ rt_rq->rt_se = rt_se;
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ if (add)
+ list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+
+ tg->rt_se[cpu] = rt_se;
+ if (!rt_se)
+ return;
+
+ if (!parent)
+ rt_se->rt_rq = &rq->rt;
+ else
+ rt_se->rt_rq = parent->my_q;
+
+ rt_se->my_q = rt_rq;
+ rt_se->parent = parent;
+ INIT_LIST_HEAD(&rt_se->run_list);
+}
+#endif
+
+void __init sched_init(void)
+{
+ int i, j;
+ unsigned long alloc_size = 0, ptr;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_USER_SCHED
+ alloc_size *= 2;
+#endif
+ /*
+ * As sched_init() is called before page_alloc is setup,
+ * we use alloc_bootmem().
+ */
+ if (alloc_size) {
+ ptr = (unsigned long)alloc_bootmem(alloc_size);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ init_task_group.se = (struct sched_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+#ifdef CONFIG_USER_SCHED
+ root_task_group.se = (struct sched_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ root_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ init_task_group.rt_rq = (struct rt_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+#ifdef CONFIG_USER_SCHED
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
+ }
+
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+#endif
+
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_rt_bandwidth(&init_task_group.rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+#ifdef CONFIG_USER_SCHED
+ init_rt_bandwidth(&root_task_group.rt_bandwidth,
+ global_rt_period(), RUNTIME_INF);
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_GROUP_SCHED
+ list_add(&init_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&init_task_group.children);
+
+#ifdef CONFIG_USER_SCHED
+ INIT_LIST_HEAD(&root_task_group.children);
+ init_task_group.parent = &root_task_group;
+ list_add(&init_task_group.siblings, &root_task_group.children);
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
+
+ for_each_possible_cpu(i) {
+ struct rq *rq;
+
+ rq = cpu_rq(i);
+ spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
+ init_cfs_rq(&rq->cfs, rq);
+ init_rt_rq(&rq->rt, rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ init_task_group.shares = init_task_group_load;
+ INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+ /*
+ * How much cpu bandwidth does init_task_group get?
+ *
+	 * In case of task-groups formed through the cgroup filesystem, it
+ * gets 100% of the cpu resources in the system. This overall
+ * system cpu resource is divided among the tasks of
+ * init_task_group and its child task-groups in a fair manner,
+ * based on each entity's (task or task-group's) weight
+ * (se->load.weight).
+ *
+	 * In other words, if init_task_group has 10 tasks (of weight
+	 * 1024) and two child groups A0 and A1 (of weight 1024 each),
+ * then A0's share of the cpu resource is:
+ *
+ * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ *
+ * We achieve this by letting init_task_group's tasks sit
+ * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+ */
+ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
+#elif defined CONFIG_USER_SCHED
+ root_task_group.shares = NICE_0_LOAD;
+ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
+ /*
+	 * In case of task-groups formed through the user id of tasks,
+ * init_task_group represents tasks belonging to root user.
+ * Hence it forms a sibling of all subsequent groups formed.
+ * In this case, init_task_group gets only a fraction of overall
+ * system cpu resource, based on the weight assigned to root
+ * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
+ * by letting tasks of init_task_group sit in a separate cfs_rq
+ * (init_cfs_rq) and having one entity represent this group of
+ * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
+ */
+ init_tg_cfs_entry(&init_task_group,
+ &per_cpu(init_cfs_rq, i),
+ &per_cpu(init_sched_entity, i), i, 1,
+ root_task_group.se[i]);
+
+#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#ifdef CONFIG_RT_GROUP_SCHED
+ INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+ init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
+#elif defined CONFIG_USER_SCHED
+ init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
+ init_tg_rt_entry(&init_task_group,
+ &per_cpu(init_rt_rq, i),
+ &per_cpu(init_sched_rt_entity, i), i, 1,
+ root_task_group.rt_se[i]);
+#endif
+#endif
+
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ rq->cpu_load[j] = 0;
+#ifdef CONFIG_SMP
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->active_balance = 0;
+ rq->next_balance = jiffies;
+ rq->push_cpu = 0;
+ rq->cpu = i;
+ rq->online = 0;
+ rq->migration_thread = NULL;
+ INIT_LIST_HEAD(&rq->migration_queue);
+ rq_attach_root(rq, &def_root_domain);
+#endif
+ init_rq_hrtick(rq);
+ atomic_set(&rq->nr_iowait, 0);
+ }
+
+ set_load_weight(&init_task);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+ plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
+#endif
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ atomic_inc(&init_mm.mm_count);
+ enter_lazy_tlb(&init_mm, current);
+
+ /*
+ * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread; however, somewhere below it might be,
+	 * and because we are the idle thread, we just pick up running again
+ * when this runqueue becomes "idle".
+ */
+ init_idle(current, smp_processor_id());
+ /*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ scheduler_running = 1;
+}
+
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+void __might_sleep(char *file, int line)
+{
+#ifdef in_atomic
+ static unsigned long prev_jiffy; /* ratelimiting */
+
+ if ((!in_atomic() && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
+#endif
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void normalize_task(struct rq *rq, struct task_struct *p)
+{
+ int on_rq;
+
+ update_rq_clock(rq);
+ on_rq = p->se.on_rq;
+ if (on_rq)
+ deactivate_task(rq, p, 0);
+ __setscheduler(rq, p, SCHED_NORMAL, 0);
+ if (on_rq) {
+ activate_task(rq, p, 0);
+ resched_task(rq->curr);
+ }
+}
+
+void normalize_rt_tasks(void)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ struct rq *rq;
+
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, p) {
+ /*
+ * Only normalize user tasks:
+ */
+ if (!p->mm)
+ continue;
+
+ p->se.exec_start = 0;
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_start = 0;
+ p->se.sleep_start = 0;
+ p->se.block_start = 0;
+#endif
+
+ if (!rt_task(p)) {
+ /*
+ * Renice negative nice level userspace
+ * tasks back to 0:
+ */
+ if (TASK_NICE(p) < 0 && p->mm)
+ set_user_nice(p, 0);
+ continue;
+ }
+
+ spin_lock(&p->pi_lock);
+ rq = __task_rq_lock(p);
+
+ normalize_task(rq, p);
+
+ __task_rq_unlock(rq);
+ spin_unlock(&p->pi_lock);
+ } while_each_thread(g, p);
+
+ read_unlock_irqrestore(&tasklist_lock, flags);
+}
+
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+struct task_struct *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPUs synchronized and interrupts disabled; the
+ * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before re-enabling interrupts
+ * and restarting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void free_fair_sched_group(struct task_group *tg)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+}
+
+static
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se, *parent_se;
+ struct rq *rq;
+ int i;
+
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ tg->shares = NICE_0_LOAD;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
+ cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kmalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ if (!se)
+ goto err;
+
+ parent_se = parent ? parent->se[i] : NULL;
+ init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+ }
+
+ return 1;
+
+ err:
+ return 0;
+}
+
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+ list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
+ &cpu_rq(cpu)->leaf_cfs_rq_list);
+}
+
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+ list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+}
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+static inline void free_fair_sched_group(struct task_group *tg)
+{
+}
+
+static inline
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+ for_each_possible_cpu(i) {
+ if (tg->rt_rq)
+ kfree(tg->rt_rq[i]);
+ if (tg->rt_se)
+ kfree(tg->rt_se[i]);
+ }
+
+ kfree(tg->rt_rq);
+ kfree(tg->rt_se);
+}
+
+static
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct rt_rq *rt_rq;
+ struct sched_rt_entity *rt_se, *parent_se;
+ struct rq *rq;
+ int i;
+
+ tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->rt_rq)
+ goto err;
+ tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->rt_se)
+ goto err;
+
+ init_rt_bandwidth(&tg->rt_bandwidth,
+ ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
+ rt_rq = kmalloc_node(sizeof(struct rt_rq),
+ GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ if (!rt_rq)
+ goto err;
+
+ rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
+ GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ if (!rt_se)
+ goto err;
+
+ parent_se = parent ? parent->rt_se[i] : NULL;
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+ }
+
+ return 1;
+
+ err:
+ return 0;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+ list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
+ &cpu_rq(cpu)->leaf_rt_rq_list);
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+ list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
+}
+#else /* !CONFIG_RT_GROUP_SCHED */
+static inline void free_rt_sched_group(struct task_group *tg)
+{
+}
+
+static inline
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_GROUP_SCHED
+static void free_sched_group(struct task_group *tg)
+{
+ free_fair_sched_group(tg);
+ free_rt_sched_group(tg);
+ kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+ unsigned long flags;
+ int i;
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ if (!alloc_fair_sched_group(tg, parent))
+ goto err;
+
+ if (!alloc_rt_sched_group(tg, parent))
+ goto err;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i) {
+ register_fair_sched_group(tg, i);
+ register_rt_sched_group(tg, i);
+ }
+ list_add_rcu(&tg->list, &task_groups);
+
+ WARN_ON(!parent); /* root should already exist */
+
+ tg->parent = parent;
+ INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void free_sched_group_rcu(struct rcu_head *rhp)
+{
+ /* now it should be safe to free those cfs_rqs */
+ free_sched_group(container_of(rhp, struct task_group, rcu));
+}
+
+/* Destroy runqueue etc associated with a task group */
+void sched_destroy_group(struct task_group *tg)
+{
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i) {
+ unregister_fair_sched_group(tg, i);
+ unregister_rt_sched_group(tg, i);
+ }
+ list_del_rcu(&tg->list);
+ list_del_rcu(&tg->siblings);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+
+	/* wait for possible concurrent references to cfs_rqs to complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+/*
+ * Change a task's runqueue when it moves between groups.
+ * The caller of this function should have put the task in its new group
+ * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
+ * reflect its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int on_rq, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ update_rq_clock(rq);
+
+ running = task_current(rq, tsk);
+ on_rq = tsk->se.on_rq;
+
+ if (on_rq)
+ dequeue_task(rq, tsk, 0);
+ if (unlikely(running))
+ tsk->sched_class->put_prev_task(rq, tsk);
+
+ set_task_rq(tsk, task_cpu(tsk));
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->moved_group)
+ tsk->sched_class->moved_group(tsk);
+#endif
+
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ if (on_rq)
+ enqueue_task(rq, tsk, 0);
+
+ task_rq_unlock(rq, &flags);
+}
+#endif /* CONFIG_GROUP_SCHED */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ int on_rq;
+
+ on_rq = se->on_rq;
+ if (on_rq)
+ dequeue_entity(cfs_rq, se, 0);
+
+ se->load.weight = shares;
+ se->load.inv_weight = 0;
+
+ if (on_rq)
+ enqueue_entity(cfs_rq, se, 0);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __set_se_shares(se, shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static DEFINE_MUTEX(shares_mutex);
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int i;
+ unsigned long flags;
+
+ /*
+ * We can't change the weight of the root cgroup.
+ */
+ if (!tg->se[0])
+ return -EINVAL;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ else if (shares > MAX_SHARES)
+ shares = MAX_SHARES;
+
+ mutex_lock(&shares_mutex);
+ if (tg->shares == shares)
+ goto done;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i)
+ unregister_fair_sched_group(tg, i);
+ list_del_rcu(&tg->siblings);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+
+ /* wait for any ongoing reference to this group to finish */
+ synchronize_sched();
+
+ /*
+ * Now we are free to modify the group's share on each cpu
+ * w/o tripping rebalance_share or load_balance_fair.
+ */
+ tg->shares = shares;
+ for_each_possible_cpu(i) {
+ /*
+ * force a rebalance
+ */
+ cfs_rq_set_shares(tg->cfs_rq[i], 0);
+ set_se_shares(tg->se[i], shares);
+ }
+
+ /*
+ * Enable load balance activity on this group, by inserting it back on
+ * each cpu's rq->leaf_cfs_rq_list.
+ */
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i)
+ register_fair_sched_group(tg, i);
+ list_add_rcu(&tg->siblings, &tg->parent->children);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+done:
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
+unsigned long sched_group_shares(struct task_group *tg)
+{
+ return tg->shares;
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 20;
+
+ return div64_u64(runtime << 20, period);
+}
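+
+/*
+ * For example, a 1s period with 950ms of runtime gives
+ * to_ratio(1000000000, 950000000) = (950000000 << 20) / 1000000000
+ * ~= 0.95 * 2^20 ~= 996147, i.e. the allowed bandwidth is kept as a
+ * 20-bit fixed-point fraction of the period.
+ */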
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+
+ do_each_thread(g, p) {
+ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ return 1;
+ } while_each_thread(g, p);
+
+ return 0;
+}
+
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+};
+
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
+ }
+
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
+}
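+
+/*
+ * To illustrate the check above: if a group is allotted 400ms per 1s
+ * period (a ratio of ~0.4 * 2^20) and two of its children each ask for
+ * 300ms per 1s (~0.3 * 2^20 each), the children sum to ~0.6 * 2^20,
+ * which exceeds the parent's total, so the update is rejected with
+ * -EINVAL.
+ */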
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ return walk_tg_tree(tg_schedulable, tg_nop, &data);
+}
+
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
+{
+ int i, err = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
+ goto unlock;
+
+ spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ unlock:
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+
+ return tg_set_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (rt_period == 0)
+ return -EINVAL;
+
+ return tg_set_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+ u64 runtime, period;
+ int ret = 0;
+
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ runtime = global_rt_runtime();
+ period = global_rt_period();
+
+ /*
+ * Sanity check on the sysctl variables.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+ unsigned long flags;
+ int i;
+
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = global_rt_runtime();
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+ return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+int sched_rt_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_constraints();
+ if (ret) {
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ } else {
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period =
+ ns_to_ktime(global_rt_period());
+ }
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
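+
+/*
+ * The handler above backs the sched_rt_period_us/sched_rt_runtime_us
+ * sysctls (typically exposed under /proc/sys/kernel/). For example,
+ * shrinking the global RT budget to 900ms out of every 1s period:
+ *
+ *   echo 900000 > /proc/sys/kernel/sched_rt_runtime_us
+ *
+ * The new values only stick if sched_rt_global_constraints() accepts
+ * them; otherwise the old period/runtime pair is restored.
+ */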
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/* return corresponding task_group object of a cgroup */
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
+ struct task_group, css);
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct task_group *tg, *parent;
+
+ if (!cgrp->parent) {
+ /* This is early initialization for the top cgroup */
+ return &init_task_group.css;
+ }
+
+ parent = cgroup_tg(cgrp->parent);
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+
+ return &tg->css;
+}
+
+static void
+cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_destroy_group(tg);
+}
+
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+ return -EINVAL;
+#else
+ /* We don't support RT-tasks being in separate groups */
+ if (tsk->sched_class != &fair_sched_class)
+ return -EINVAL;
+#endif
+
+ return 0;
+}
+
+static void
+cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cont, struct task_struct *tsk)
+{
+ sched_move_task(tsk);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+ u64 shareval)
+{
+ return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+}
+
+static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ return (u64) tg->shares;
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+ s64 val)
+{
+ return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+}
+
+static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return sched_group_rt_runtime(cgroup_tg(cgrp));
+}
+
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+ u64 rt_period_us)
+{
+ return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+ return sched_group_rt_period(cgroup_tg(cgrp));
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "shares",
+ .read_u64 = cpu_shares_read_u64,
+ .write_u64 = cpu_shares_write_u64,
+ },
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ {
+ .name = "rt_runtime_us",
+ .read_s64 = cpu_rt_runtime_read,
+ .write_s64 = cpu_rt_runtime_write,
+ },
+ {
+ .name = "rt_period_us",
+ .read_u64 = cpu_rt_period_read_uint,
+ .write_u64 = cpu_rt_period_write_uint,
+ },
+#endif
+};
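+
+/*
+ * With the "cpu" subsystem mounted, the entries above show up as
+ * per-cgroup control files, e.g. (the /cgroup mount point is only an
+ * example):
+ *
+ *   mount -t cgroup -o cpu none /cgroup
+ *   mkdir /cgroup/browsers
+ *   echo 512    > /cgroup/browsers/cpu.shares        (half of the 1024 default)
+ *   echo 500000 > /cgroup/browsers/cpu.rt_runtime_us (500ms RT budget)
+ */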
+
+static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
+}
+
+struct cgroup_subsys cpu_cgroup_subsys = {
+ .name = "cpu",
+ .create = cpu_cgroup_create,
+ .destroy = cpu_cgroup_destroy,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .populate = cpu_cgroup_populate,
+ .subsys_id = cpu_cgroup_subsys_id,
+ .early_init = 1,
+};
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_CPUACCT
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* track cpu usage of a group of tasks */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 *cpuusage;
+};
+
+struct cgroup_subsys cpuacct_subsys;
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_create(
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+
+ if (!ca)
+ return ERR_PTR(-ENOMEM);
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage) {
+ kfree(ca);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &ca->css;
+}
+
+/* destroy an existing cpu accounting group */
+static void
+cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+
+ /*
+ * Take rq->lock to make 64-bit addition safe on 32-bit
+ * platforms.
+ */
+ spin_lock_irq(&cpu_rq(i)->lock);
+ totalcpuusage += *cpuusage;
+ spin_unlock_irq(&cpu_rq(i)->lock);
+ }
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+ u64 reset)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_possible_cpu(i) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+
+ spin_lock_irq(&cpu_rq(i)->lock);
+ *cpuusage = 0;
+ spin_unlock_irq(&cpu_rq(i)->lock);
+ }
+out:
+ return err;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+};
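+
+/*
+ * With the "cpuacct" subsystem mounted, each cgroup exposes a single
+ * cpuacct.usage file reporting the group's accumulated CPU time in
+ * nanoseconds; writing 0 to it resets the counters (any other value is
+ * rejected with -EINVAL), e.g.:
+ *
+ *   cat /cgroup/mygroup/cpuacct.usage
+ *   echo 0 > /cgroup/mygroup/cpuacct.usage
+ */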
+
+static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
+}
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+
+ if (!cpuacct_subsys.active)
+ return;
+
+ ca = task_ca(tsk);
+ if (ca) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
+
+ *cpuusage += cputime;
+ }
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+ .name = "cpuacct",
+ .create = cpuacct_create,
+ .destroy = cpuacct_destroy,
+ .populate = cpuacct_populate,
+ .subsys_id = cpuacct_subsys_id,
+};
+#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 0000000..a0b0852
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,266 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Updates and enhancements:
+ * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
+ * Based on code by:
+ * Ingo Molnar <mingo@redhat.com>
+ * Guillaume Chazarain <guichaz@gmail.com>
+ *
+ * Create a semi-stable clock from a mixture of other events, including:
+ * - gtod
+ * - sched_clock()
+ * - explicit idle events
+ *
+ * We use gtod as base and the unstable clock deltas. The deltas are filtered,
+ * making it monotonic and keeping it within an expected window.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
+ * consistent between cpus (never more than 2 jiffies difference).
+ */
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+ return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
+
+static __read_mostly int sched_clock_running;
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+
+struct sched_clock_data {
+ /*
+ * Raw spinlock - this is a special case: this might be called
+	 * from within instrumentation code, so we don't want to do any
+ * instrumentation ourselves.
+ */
+ raw_spinlock_t lock;
+
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+ return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+ return &per_cpu(sched_clock_data, cpu);
+}
+
+void sched_clock_init(void)
+{
+ u64 ktime_now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct sched_clock_data *scd = cpu_sdc(cpu);
+
+ scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ scd->tick_raw = 0;
+ scd->tick_gtod = ktime_now;
+ scd->clock = ktime_now;
+ }
+
+ sched_clock_running = 1;
+}
+
+/*
+ * min()/max() variants that take u64 wrap-around into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+ return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+ return (s64)(x - y) > 0 ? x : y;
+}
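+
+/*
+ * The (s64) cast keeps the comparison correct across u64 wrap-around:
+ * e.g. wrap_max(5, ULLONG_MAX - 10) evaluates (s64)(5 - (ULLONG_MAX - 10))
+ * = 16 > 0 and so returns 5, treating the value that has wrapped past
+ * zero as the later one.
+ */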
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ * - filter out backward motion
+ * - use the GTOD tick value to create a window to filter crazy TSC values
+ */
+static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+ s64 delta = now - scd->tick_raw;
+ u64 clock, min_clock, max_clock;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ /*
+ * scd->clock = clamp(scd->tick_gtod + delta,
+ * max(scd->tick_gtod, scd->clock),
+ * scd->tick_gtod + TICK_NSEC);
+ */
+
+ clock = scd->tick_gtod + delta;
+ min_clock = wrap_max(scd->tick_gtod, scd->clock);
+ max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+
+ clock = wrap_max(clock, min_clock);
+ clock = wrap_min(clock, max_clock);
+
+ scd->clock = clock;
+
+ return scd->clock;
+}
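+
+/*
+ * Example of the clamping above: if the last tick recorded tick_gtod = T
+ * and the raw clock claims three ticks have since passed
+ * (delta = 3 * TICK_NSEC), the candidate T + 3 * TICK_NSEC exceeds
+ * max_clock and is pulled back to (roughly) T + TICK_NSEC; conversely a
+ * raw clock that went backwards is clamped up to
+ * max(tick_gtod, previous clock), so the result never moves backwards.
+ */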
+
+static void lock_double_clock(struct sched_clock_data *data1,
+ struct sched_clock_data *data2)
+{
+ if (data1 < data2) {
+ __raw_spin_lock(&data1->lock);
+ __raw_spin_lock(&data2->lock);
+ } else {
+ __raw_spin_lock(&data2->lock);
+ __raw_spin_lock(&data1->lock);
+ }
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+ struct sched_clock_data *scd = cpu_sdc(cpu);
+ u64 now, clock, this_clock, remote_clock;
+
+ if (unlikely(!sched_clock_running))
+ return 0ull;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ now = sched_clock();
+
+ if (cpu != raw_smp_processor_id()) {
+ struct sched_clock_data *my_scd = this_scd();
+
+ lock_double_clock(scd, my_scd);
+
+ this_clock = __update_sched_clock(my_scd, now);
+ remote_clock = scd->clock;
+
+ /*
+ * Use the opportunity that we have both locks
+ * taken to couple the two clocks: we take the
+ * larger time as the latest time for both
+ * runqueues. (this creates monotonic movement)
+ */
+ if (likely((s64)(remote_clock - this_clock) < 0)) {
+ clock = this_clock;
+ scd->clock = clock;
+ } else {
+ /*
+ * Should be rare, but possible:
+ */
+ clock = remote_clock;
+ my_scd->clock = remote_clock;
+ }
+
+ __raw_spin_unlock(&my_scd->lock);
+ } else {
+ __raw_spin_lock(&scd->lock);
+ clock = __update_sched_clock(scd, now);
+ }
+
+ __raw_spin_unlock(&scd->lock);
+
+ return clock;
+}
+
+void sched_clock_tick(void)
+{
+ struct sched_clock_data *scd = this_scd();
+ u64 now, now_gtod;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ now_gtod = ktime_to_ns(ktime_get());
+ now = sched_clock();
+
+ __raw_spin_lock(&scd->lock);
+ scd->tick_raw = now;
+ scd->tick_gtod = now_gtod;
+ __update_sched_clock(scd, now);
+ __raw_spin_unlock(&scd->lock);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+ sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+ if (timekeeping_suspended)
+ return;
+
+ sched_clock_tick();
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+ sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+ if (unlikely(!sched_clock_running))
+ return 0;
+
+ return sched_clock();
+}
+
+#endif
+
+unsigned long long cpu_clock(int cpu)
+{
+ unsigned long long clock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ clock = sched_clock_cpu(cpu);
+ local_irq_restore(flags);
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 0000000..52154fe
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
+/*
+ * kernel/sched_cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007-2008 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in one of the
+ * following states:
+ *
+ * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+ int cpupri;
+
+ if (prio == CPUPRI_INVALID)
+ cpupri = CPUPRI_INVALID;
+ else if (prio == MAX_PRIO)
+ cpupri = CPUPRI_IDLE;
+ else if (prio >= MAX_RT_PRIO)
+ cpupri = CPUPRI_NORMAL;
+ else
+ cpupri = MAX_RT_PRIO - prio + 1;
+
+ return cpupri;
+}
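+
+/*
+ * Sample conversions (with the usual MAX_RT_PRIO == 100, MAX_PRIO == 140):
+ *
+ *   task->prio 140 (idle)              -> CPUPRI_IDLE   (0)
+ *   task->prio 100..139 (SCHED_OTHER)  -> CPUPRI_NORMAL (1)
+ *   task->prio 99 (lowest RT priority) -> 2
+ *   task->prio 0 (highest RT priority) -> 101
+ */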
+
+#define for_each_cpupri_active(array, idx) \
+ for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
+ idx < CPUPRI_NR_PRIORITIES; \
+ idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ cpumask_t *lowest_mask)
+{
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
+
+ for_each_cpupri_active(cp->pri_active, idx) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ cpumask_t mask;
+
+ if (idx >= task_pri)
+ break;
+
+ cpus_and(mask, p->cpus_allowed, vec->mask);
+
+ if (cpus_empty(mask))
+ continue;
+
+ *lowest_mask = mask;
+ return 1;
+ }
+
+ return 0;
+}
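+
+/*
+ * For instance, a waking task whose convert_prio() value is 12 can be
+ * routed to any CPU in p->cpus_allowed sitting at a lower level: an idle
+ * CPU (level 0) or one running only SCHED_OTHER work (level 1) is found
+ * before a CPU busy with a lower-priority RT task (levels 2..11), since
+ * the scan starts from the lowest active level.
+ */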
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ unsigned long flags;
+
+ newpri = convert_prio(newpri);
+
+ BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+ if (newpri == oldpri)
+ return;
+
+ /*
+ * If the cpu was currently mapped to a different value, we
+ * first need to unmap the old value
+ */
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ vec->count--;
+ if (!vec->count)
+ clear_bit(oldpri, cp->pri_active);
+ cpu_clear(cpu, vec->mask);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
+
+ if (likely(newpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ cpu_set(cpu, vec->mask);
+ vec->count++;
+ if (vec->count == 1)
+ set_bit(newpri, cp->pri_active);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
+
+ *currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ spin_lock_init(&vec->lock);
+ vec->count = 0;
+ cpus_clear(vec->mask);
+ }
+
+ for_each_possible_cpu(i)
+ cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 0000000..f25811b
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE 0
+#define CPUPRI_NORMAL 1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+ spinlock_t lock;
+ int count;
+ cpumask_t mask;
+};
+
+struct cpupri {
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ long pri_active[CPUPRI_NR_PRI_WORDS];
+ int cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int cpupri_find(struct cpupri *cp,
+ struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init() do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
new file mode 100644
index 0000000..26ed8e3
--- /dev/null
+++ b/kernel/sched_debug.c
@@ -0,0 +1,469 @@
+/*
+ * kernel/sched_debug.c
+ *
+ * Print the CFS rbtree
+ *
+ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+/*
+ * Ease the printing of nsec fields:
+ */
+static long long nsec_high(unsigned long long nsec)
+{
+ if ((long long)nsec < 0) {
+ nsec = -nsec;
+ do_div(nsec, 1000000);
+ return -nsec;
+ }
+ do_div(nsec, 1000000);
+
+ return nsec;
+}
+
+static unsigned long nsec_low(unsigned long long nsec)
+{
+ if ((long long)nsec < 0)
+ nsec = -nsec;
+
+ return do_div(nsec, 1000000);
+}
+
+#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
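+
+/*
+ * SPLIT_NS() feeds the "%Ld.%06ld" formats used below: a value of
+ * 1234567890 ns prints as "1234.567890", i.e. milliseconds with a
+ * six-digit fractional part.
+ */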
+
+static void
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+{
+ if (rq->curr == p)
+ SEQ_printf(m, "R");
+ else
+ SEQ_printf(m, " ");
+
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ p->comm, p->pid,
+ SPLIT_NS(p->se.vruntime),
+ (long long)(p->nvcsw + p->nivcsw),
+ p->prio);
+#ifdef CONFIG_SCHEDSTATS
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.sum_sleep_runtime));
+#else
+ SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
+ 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+ {
+ char path[64];
+
+ cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
+ SEQ_printf(m, " %s", path);
+ }
+#endif
+ SEQ_printf(m, "\n");
+}
+
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+
+ SEQ_printf(m,
+ "\nrunnable tasks:\n"
+ " task PID tree-key switches prio"
+ " exec-runtime sum-exec sum-sleep\n"
+ "------------------------------------------------------"
+ "----------------------------------------------------\n");
+
+ read_lock_irqsave(&tasklist_lock, flags);
+
+ do_each_thread(g, p) {
+ if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+ continue;
+
+ print_task(m, rq, p);
+ } while_each_thread(g, p);
+
+ read_unlock_irqrestore(&tasklist_lock, flags);
+}
+
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+ s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+ spread, rq0_min_vruntime, spread0;
+ struct rq *rq = &per_cpu(runqueues, cpu);
+ struct sched_entity *last;
+ unsigned long flags;
+
+#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+ char path[128] = "";
+ struct cgroup *cgroup = NULL;
+ struct task_group *tg = cfs_rq->tg;
+
+ if (tg)
+ cgroup = tg->css.cgroup;
+
+ if (cgroup)
+ cgroup_path(cgroup, path, sizeof(path));
+
+ SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#else
+ SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
+ SPLIT_NS(cfs_rq->exec_clock));
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (cfs_rq->rb_leftmost)
+ MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ last = __pick_last_entity(cfs_rq);
+ if (last)
+ max_vruntime = last->vruntime;
+ min_vruntime = cfs_rq->min_vruntime;
+ rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+ SPLIT_NS(MIN_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
+ SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
+ SPLIT_NS(max_vruntime));
+ spread = max_vruntime - MIN_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
+ SPLIT_NS(spread));
+ spread0 = min_vruntime - rq0_min_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
+ SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+
+ SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
+ cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
+}
+
+void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
+{
+#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
+ char path[128] = "";
+ struct cgroup *cgroup = NULL;
+ struct task_group *tg = rt_rq->tg;
+
+ if (tg)
+ cgroup = tg->css.cgroup;
+
+ if (cgroup)
+ cgroup_path(cgroup, path, sizeof(path));
+
+ SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
+#else
+ SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
+
+
+#define P(x) \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
+
+ P(rt_nr_running);
+ P(rt_throttled);
+ PN(rt_time);
+ PN(rt_runtime);
+
+#undef PN
+#undef P
+}
+
+static void print_cpu(struct seq_file *m, int cpu)
+{
+ struct rq *rq = &per_cpu(runqueues, cpu);
+
+#ifdef CONFIG_X86
+ {
+ unsigned int freq = cpu_khz ? : 1;
+
+ SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ cpu, freq / 1000, (freq % 1000));
+ }
+#else
+ SEQ_printf(m, "\ncpu#%d\n", cpu);
+#endif
+
+#define P(x) \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
+
+ P(nr_running);
+ SEQ_printf(m, " .%-30s: %lu\n", "load",
+ rq->load.weight);
+ P(nr_switches);
+ P(nr_load_updates);
+ P(nr_uninterruptible);
+ SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
+ PN(next_balance);
+ P(curr->pid);
+ PN(clock);
+ P(cpu_load[0]);
+ P(cpu_load[1]);
+ P(cpu_load[2]);
+ P(cpu_load[3]);
+ P(cpu_load[4]);
+#undef P
+#undef PN
+
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
+
+ P(yld_exp_empty);
+ P(yld_act_empty);
+ P(yld_both_empty);
+ P(yld_count);
+
+ P(sched_switch);
+ P(sched_count);
+ P(sched_goidle);
+
+ P(ttwu_count);
+ P(ttwu_local);
+
+ P(bkl_count);
+
+#undef P
+#endif
+ print_cfs_stats(m, cpu);
+ print_rt_stats(m, cpu);
+
+ print_rq(m, rq, cpu);
+}
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+
+#define P(x) \
+ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(sysctl_sched_latency);
+ PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_child_runs_first);
+ P(sysctl_sched_features);
+#undef PN
+#undef P
+
+ for_each_online_cpu(cpu)
+ print_cpu(m, cpu);
+
+ SEQ_printf(m, "\n");
+
+ return 0;
+}
+
+static void sysrq_sched_debug_show(void)
+{
+ sched_debug_show(NULL, NULL);
+}
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_debug_show, NULL);
+}
+
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_sched_debug_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
+ if (!pe)
+ return -ENOMEM;
+ return 0;
+}
+
+__initcall(init_sched_debug_procfs);
+
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+ unsigned long nr_switches;
+ unsigned long flags;
+ int num_threads = 1;
+
+ if (lock_task_sighand(p, &flags)) {
+ num_threads = atomic_read(&p->signal->count);
+ unlock_task_sighand(p, &flags);
+ }
+
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+ SEQ_printf(m,
+ "---------------------------------------------------------\n");
+#define __P(F) \
+ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+ PN(se.exec_start);
+ PN(se.vruntime);
+ PN(se.sum_exec_runtime);
+ PN(se.avg_overlap);
+
+ nr_switches = p->nvcsw + p->nivcsw;
+
+#ifdef CONFIG_SCHEDSTATS
+ PN(se.wait_start);
+ PN(se.sleep_start);
+ PN(se.block_start);
+ PN(se.sleep_max);
+ PN(se.block_max);
+ PN(se.exec_max);
+ PN(se.slice_max);
+ PN(se.wait_max);
+ PN(se.wait_sum);
+ P(se.wait_count);
+ P(sched_info.bkl_count);
+ P(se.nr_migrations);
+ P(se.nr_migrations_cold);
+ P(se.nr_failed_migrations_affine);
+ P(se.nr_failed_migrations_running);
+ P(se.nr_failed_migrations_hot);
+ P(se.nr_forced_migrations);
+ P(se.nr_forced2_migrations);
+ P(se.nr_wakeups);
+ P(se.nr_wakeups_sync);
+ P(se.nr_wakeups_migrate);
+ P(se.nr_wakeups_local);
+ P(se.nr_wakeups_remote);
+ P(se.nr_wakeups_affine);
+ P(se.nr_wakeups_affine_attempts);
+ P(se.nr_wakeups_passive);
+ P(se.nr_wakeups_idle);
+
+ {
+ u64 avg_atom, avg_per_cpu;
+
+ avg_atom = p->se.sum_exec_runtime;
+ if (nr_switches)
+ do_div(avg_atom, nr_switches);
+ else
+ avg_atom = -1LL;
+
+ avg_per_cpu = p->se.sum_exec_runtime;
+ if (p->se.nr_migrations) {
+ avg_per_cpu = div64_u64(avg_per_cpu,
+ p->se.nr_migrations);
+ } else {
+ avg_per_cpu = -1LL;
+ }
+
+ __PN(avg_atom);
+ __PN(avg_per_cpu);
+ }
+#endif
+ __P(nr_switches);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "nr_voluntary_switches", (long long)p->nvcsw);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "nr_involuntary_switches", (long long)p->nivcsw);
+
+ P(se.load.weight);
+ P(policy);
+ P(prio);
+#undef PN
+#undef __PN
+#undef P
+#undef __P
+
+ {
+ unsigned int this_cpu = raw_smp_processor_id();
+ u64 t0, t1;
+
+ t0 = cpu_clock(this_cpu);
+ t1 = cpu_clock(this_cpu);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "clock-delta", (long long)(t1-t0));
+ }
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_max = 0;
+ p->se.wait_sum = 0;
+ p->se.wait_count = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+ p->se.nr_migrations = 0;
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+ p->sched_info.bkl_count = 0;
+#endif
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
+ p->nvcsw = 0;
+ p->nivcsw = 0;
+}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
new file mode 100644
index 0000000..c91973d
--- /dev/null
+++ b/kernel/sched_fair.c
@@ -0,0 +1,1759 @@
+/*
+ * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/latencytop.h>
+
+/*
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length': timeslices in CFS are of variable length
+ * and there is no persistent notion of a timeslice as in
+ * traditional, time-slice based scheduling concepts.
+ *
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches (cs) field)
+ */
+unsigned int sysctl_sched_latency = 20000000ULL;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_min_granularity = 4000000ULL;
+
+/*
+ * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity.
+ */
+static unsigned int sched_nr_latency = 5;
+
+/*
+ * After fork, the child runs first (the default). If this is set to 0,
+ * the parent will (try to) run first.
+ */
+const_debug unsigned int sysctl_sched_child_runs_first = 1;
+
+/*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the aggressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
+
+/*
+ * SCHED_OTHER wake-up granularity.
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
+static const struct sched_class fair_sched_class;
+
+/**************************************************************
+ * CFS operations on generic schedulable entities:
+ */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se) (!se->my_q)
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+ for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return cfs_rq->tg->cfs_rq[this_cpu];
+}
+
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ if (se->cfs_rq == pse->cfs_rq)
+ return 1;
+
+ return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return se->parent;
+}
+
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+ int depth = 0;
+
+ for_each_sched_entity(se)
+ depth++;
+
+ return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+ int se_depth, pse_depth;
+
+ /*
+	 * A preemption test can only be made between sibling entities that are
+	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
+	 * hierarchy of both tasks until we find ancestors that are siblings
+	 * under a common parent.
+ */
+
+ /* First walk up until both entities are at same depth */
+ se_depth = depth_se(*se);
+ pse_depth = depth_se(*pse);
+
+ while (se_depth > pse_depth) {
+ se_depth--;
+ *se = parent_entity(*se);
+ }
+
+ while (pse_depth > se_depth) {
+ pse_depth--;
+ *pse = parent_entity(*pse);
+ }
+
+ while (!is_same_group(*se, *pse)) {
+ *se = parent_entity(*se);
+ *pse = parent_entity(*pse);
+ }
+}
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+
+#define entity_is_task(se) 1
+
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+ return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return NULL;
+}
+
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ */
+
+static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
+
+static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta < 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
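+
+/*
+ * Note on the signed comparison above (an illustrative aside): the s64 cast
+ * keeps max_vruntime()/min_vruntime() correct across u64 wrap-around. For
+ * example, if vruntime has just wrapped past zero while min_vruntime still
+ * sits near the top of the u64 range, the signed difference is a small
+ * positive number, so the freshly wrapped value is still treated as the
+ * newer (larger) vruntime.
+ */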
+
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return se->vruntime - cfs_rq->min_vruntime;
+}
+
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (cfs_rq->curr)
+ vruntime = cfs_rq->curr->vruntime;
+
+ if (cfs_rq->rb_leftmost) {
+ struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+ struct sched_entity,
+ run_node);
+
+ if (!cfs_rq->curr)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_entity *entry;
+ s64 key = entity_key(cfs_rq, se);
+ int leftmost = 1;
+
+ /*
+ * Find the right place in the rbtree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_entity, run_node);
+ /*
+		 * We don't care about collisions. Nodes with
+ * the same key stay together.
+ */
+ if (key < entity_key(cfs_rq, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Maintain a cache of leftmost tree entries (it is frequently
+ * used):
+ */
+ if (leftmost)
+ cfs_rq->rb_leftmost = &se->run_node;
+
+ rb_link_node(&se->run_node, parent, link);
+ rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->rb_leftmost == &se->run_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&se->run_node);
+ cfs_rq->rb_leftmost = next_node;
+ }
+
+ rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *left = cfs_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
+}
+
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+
+ if (!last)
+ return NULL;
+
+ return rb_entry(last, struct sched_entity, run_node);
+}
+
+/**************************************************************
+ * Scheduling class statistics methods:
+ */
+
+#ifdef CONFIG_SCHED_DEBUG
+int sched_nr_latency_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+ sysctl_sched_min_granularity);
+
+ return 0;
+}
+#endif
+
+/*
+ * delta *= P[w / rw]
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ delta = calc_delta_mine(delta,
+ se->load.weight, &cfs_rq_of(se)->load);
+ }
+
+ return delta;
+}
+
+/*
+ * delta /= w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
+
+/*
+ * The idea is to set a period in which each task runs once.
+ *
+ * When there are too many tasks (more than sched_nr_latency) we have to stretch
+ * this period because otherwise the slices get too small.
+ *
+ * p = (nr <= nl) ? l : l*nr/nl
+ */
+static u64 __sched_period(unsigned long nr_running)
+{
+ u64 period = sysctl_sched_latency;
+ unsigned long nr_latency = sched_nr_latency;
+
+ if (unlikely(nr_running > nr_latency)) {
+ period = sysctl_sched_min_granularity;
+ period *= nr_running;
+ }
+
+ return period;
+}
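+
+/*
+ * Worked example (illustrative, assuming the default tunables above before
+ * the ilog(ncpus) boot-time scaling): with sysctl_sched_latency = 20ms,
+ * sysctl_sched_min_granularity = 4ms and sched_nr_latency = 5, up to 5
+ * runnable tasks share the 20ms period, while e.g. 8 tasks stretch it to
+ * 8 * 4ms = 32ms so that no slice drops below the minimum granularity.
+ */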
+
+/*
+ * We calculate the wall-time slice from the period by taking a part
+ * proportional to the weight.
+ *
+ * s = p*P[w/rw]
+ */
+static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long nr_running = cfs_rq->nr_running;
+
+ if (unlikely(!se->on_rq))
+ nr_running++;
+
+ return calc_delta_weight(__sched_period(nr_running), se);
+}
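+
+/*
+ * Illustrative example (assuming the 20ms default period and the nice-0
+ * load weight of 1024): with two runnable nice-0 tasks the cfs_rq weight
+ * is 2048, so each task's slice is 20ms * 1024/2048 = 10ms; a heavier,
+ * negatively niced task would get a proportionally larger share of the
+ * same period.
+ */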
+
+/*
+ * We calculate the vruntime slice of a to-be-inserted task.
+ *
+ * vs = s/w
+ */
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return calc_delta_fair(sched_slice(cfs_rq, se), se);
+}
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static inline void
+__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
+ unsigned long delta_exec)
+{
+ unsigned long delta_exec_weighted;
+
+ schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+ delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+ curr->vruntime += delta_exec_weighted;
+ update_min_vruntime(cfs_rq);
+}
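+
+/*
+ * Illustrative note: calc_delta_fair() scales the executed time by
+ * NICE_0_LOAD / weight, so a nice-0 task's vruntime advances 1:1 with its
+ * execution time, while a task of twice the weight advances only half as
+ * fast; that slower vruntime progress is what earns heavier tasks more
+ * CPU time.
+ */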
+
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 now = rq_of(cfs_rq)->clock;
+ unsigned long delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ /*
+ * Get the amount of time the current task was running
+ * since the last time we changed load (this cannot
+ * overflow on 32 bits):
+ */
+ delta_exec = (unsigned long)(now - curr->exec_start);
+
+ __update_curr(cfs_rq, curr, delta_exec);
+ curr->exec_start = now;
+
+ if (entity_is_task(curr)) {
+ struct task_struct *curtask = task_of(curr);
+
+ cpuacct_charge(curtask, delta_exec);
+ account_group_exec_runtime(curtask, delta_exec);
+ }
+}
+
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+}
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * Are we enqueueing a waiting task? (for current tasks
+ * a dequeue/enqueue event is a NOP)
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_start(cfs_rq, se);
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ schedstat_set(se->wait_max, max(se->wait_max,
+ rq_of(cfs_rq)->clock - se->wait_start));
+ schedstat_set(se->wait_count, se->wait_count + 1);
+ schedstat_set(se->wait_sum, se->wait_sum +
+ rq_of(cfs_rq)->clock - se->wait_start);
+ schedstat_set(se->wait_start, 0);
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * Mark the end of the wait period if dequeueing a
+ * waiting task:
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_end(cfs_rq, se);
+}
+
+/*
+ * We are picking a new current task - update its stats:
+ */
+static inline void
+update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * We are starting a new run period:
+ */
+ se->exec_start = rq_of(cfs_rq)->clock;
+}
+
+/**************************************************
+ * Scheduling class queueing methods:
+ */
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+ cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+ if (entity_is_task(se)) {
+ add_cfs_task_weight(cfs_rq, se->load.weight);
+ list_add(&se->group_node, &cfs_rq->tasks);
+ }
+ cfs_rq->nr_running++;
+ se->on_rq = 1;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+ if (entity_is_task(se)) {
+ add_cfs_task_weight(cfs_rq, -se->load.weight);
+ list_del_init(&se->group_node);
+ }
+ cfs_rq->nr_running--;
+ se->on_rq = 0;
+}
+
+static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (se->sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ struct task_struct *tsk = task_of(se);
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > se->sleep_max))
+ se->sleep_max = delta;
+
+ se->sleep_start = 0;
+ se->sum_sleep_runtime += delta;
+
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ }
+ if (se->block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ struct task_struct *tsk = task_of(se);
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > se->block_max))
+ se->block_max = delta;
+
+ se->block_start = 0;
+ se->sum_sleep_runtime += delta;
+
+ /*
+		 * Blocking time is in units of nanoseconds, so shift by 20 to
+ * get a milliseconds-range estimation of the amount of
+ * time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+#endif
+}
+
+static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ s64 d = se->vruntime - cfs_rq->min_vruntime;
+
+ if (d < 0)
+ d = -d;
+
+ if (d > 3*sysctl_sched_latency)
+ schedstat_inc(cfs_rq, nr_spread_over);
+#endif
+}
+
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ /*
+	 * The 'current' period is already promised to the current tasks;
+	 * however, the extra weight of the new task will slow them down a
+	 * little, so place the new task so that it fits in the slot that
+	 * stays open at the end.
+ */
+ if (initial && sched_feat(START_DEBIT))
+ vruntime += sched_vslice(cfs_rq, se);
+
+ if (!initial) {
+		/* sleeps up to a single latency don't count. */
+ if (sched_feat(NEW_FAIR_SLEEPERS)) {
+ unsigned long thresh = sysctl_sched_latency;
+
+ /*
+ * Convert the sleeper threshold into virtual time.
+ * SCHED_IDLE is a special sub-class. We care about
+ * fairness only relative to other SCHED_IDLE tasks,
+ * all of which have the same weight.
+ */
+ if (sched_feat(NORMALIZED_SLEEPER) &&
+ task_of(se)->policy != SCHED_IDLE)
+ thresh = calc_delta_fair(thresh, se);
+
+ vruntime -= thresh;
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ vruntime = max_vruntime(se->vruntime, vruntime);
+ }
+
+ se->vruntime = vruntime;
+}
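+
+/*
+ * Illustrative example: with START_DEBIT a newly forked task has its
+ * vruntime pushed one vslice past min_vruntime, so it cannot immediately
+ * preempt the tasks already promised the current period. A waking sleeper
+ * is instead placed at most one latency period (20ms by default) before
+ * min_vruntime, which grants a bounded wakeup bonus without letting long
+ * sleeps accumulate unlimited credit.
+ */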
+
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ account_entity_enqueue(cfs_rq, se);
+
+ if (wakeup) {
+ place_entity(cfs_rq, se, 0);
+ enqueue_sleeper(cfs_rq, se);
+ }
+
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+}
+
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+}
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ update_stats_dequeue(cfs_rq, se);
+ if (sleep) {
+#ifdef CONFIG_SCHEDSTATS
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->sleep_start = rq_of(cfs_rq)->clock;
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->block_start = rq_of(cfs_rq)->clock;
+ }
+#endif
+ }
+
+ clear_buddies(cfs_rq, se);
+
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ account_entity_dequeue(cfs_rq, se);
+ update_min_vruntime(cfs_rq);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void
+check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ unsigned long ideal_runtime, delta_exec;
+
+ ideal_runtime = sched_slice(cfs_rq, curr);
+ delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ if (delta_exec > ideal_runtime)
+ resched_task(rq_of(cfs_rq)->curr);
+}
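+
+/*
+ * Illustrative example: a task whose sched_slice() works out to 10ms gets
+ * marked for rescheduling once it has executed 10ms beyond
+ * prev_sum_exec_runtime (which set_next_entity() records when the task is
+ * picked), so the periodic tick enforces the wall-time slice computed above.
+ */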
+
+static void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /* 'current' is not kept within the tree. */
+ if (se->on_rq) {
+ /*
+		 * Any task has to be enqueued before it gets to execute on
+ * a CPU. So account for the time it spent waiting on the
+ * runqueue.
+ */
+ update_stats_wait_end(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
+ }
+
+ update_stats_curr_start(cfs_rq, se);
+ cfs_rq->curr = se;
+#ifdef CONFIG_SCHEDSTATS
+ /*
+ * Track our maximum slice length, if the CPU's load is at
+	 * least twice that of our own weight (i.e. don't track it
+ * when there are only lesser-weight tasks around):
+ */
+ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ se->slice_max = max(se->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ }
+#endif
+ se->prev_sum_exec_runtime = se->sum_exec_runtime;
+}
+
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_next_entity(cfs_rq);
+
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+ return cfs_rq->next;
+
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+ return cfs_rq->last;
+
+ return se;
+}
+
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+ /*
+ * If still on the runqueue then deactivate_task()
+ * was not called and update_curr() has to be done:
+ */
+ if (prev->on_rq)
+ update_curr(cfs_rq);
+
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
+ /* Put 'current' back into the tree. */
+ __enqueue_entity(cfs_rq, prev);
+ }
+ cfs_rq->curr = NULL;
+}
+
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+ /*
+ * queued ticks are scheduled to match the slice, so don't bother
+ * validating it and just reschedule.
+ */
+ if (queued) {
+ resched_task(rq_of(cfs_rq)->curr);
+ return;
+ }
+ /*
+ * don't let the period tick interfere with the hrtick preemption
+ */
+ if (!sched_feat(DOUBLE_TICK) &&
+ hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+ return;
+#endif
+
+ if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+ check_preempt_tick(cfs_rq, curr);
+}
+
+/**************************************************
+ * CFS operations on tasks:
+ */
+
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ WARN_ON(task_rq(p) != rq);
+
+ if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+ u64 slice = sched_slice(cfs_rq, se);
+ u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ s64 delta = slice - ran;
+
+ if (delta < 0) {
+ if (rq->curr == p)
+ resched_task(p);
+ return;
+ }
+
+ /*
+		 * Don't schedule slices shorter than 10000ns; that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ if (rq->curr != p)
+ delta = max_t(s64, 10000LL, delta);
+
+ hrtick_start(rq, delta);
+ }
+}
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+
+ if (curr->sched_class != &fair_sched_class)
+ return;
+
+ if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ hrtick_start_fair(rq, curr);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
+#endif
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ break;
+ cfs_rq = cfs_rq_of(se);
+ enqueue_entity(cfs_rq, se, wakeup);
+ wakeup = 1;
+ }
+
+ hrtick_update(rq);
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dequeue_entity(cfs_rq, se, sleep);
+ /* Don't dequeue parent if it has other entities besides us */
+ if (cfs_rq->load.weight)
+ break;
+ sleep = 1;
+ }
+
+ hrtick_update(rq);
+}
+
+/*
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *rightmost, *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(cfs_rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ return;
+ }
+ /*
+ * Find the rightmost entry in the rbtree:
+ */
+ rightmost = __pick_last_entity(cfs_rq);
+ /*
+ * Already in the rightmost position?
+ */
+ if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
+ return;
+
+ /*
+ * Minimally necessary key value to be last in the tree:
+ * Upon rescheduling, sched_class::put_prev_task() will place
+ * 'current' within the tree based on its new key value.
+ */
+ se->vruntime = rightmost->vruntime + 1;
+}
+
+/*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available. The span of cpus to
+ * search starts with the closest cpus and moves further out as needed,
+ * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map).
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+ cpumask_t tmp;
+ struct sched_domain *sd;
+ int i;
+
+ /*
+ * If it is idle, then it is the best cpu to run this task.
+ *
+	 * This cpu is also the best if it already has more than one task.
+	 * Siblings must also be busy (in most cases) as they didn't already
+	 * pick up the extra load from this cpu, hence we need not check
+	 * sibling runqueue info. This avoids the checks and cache-miss
+	 * penalties associated with that.
+ */
+ if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
+ return cpu;
+
+ for_each_domain(cpu, sd) {
+ if ((sd->flags & SD_WAKE_IDLE)
+ || ((sd->flags & SD_WAKE_IDLE_FAR)
+ && !task_hot(p, task_rq(p)->clock, sd))) {
+ cpus_and(tmp, sd->span, p->cpus_allowed);
+ cpus_and(tmp, tmp, cpu_active_map);
+ for_each_cpu_mask_nr(i, tmp) {
+ if (idle_cpu(i)) {
+ if (i != task_cpu(p)) {
+ schedstat_inc(p,
+ se.nr_wakeups_idle);
+ }
+ return i;
+ }
+ }
+ } else {
+ break;
+ }
+ }
+ return cpu;
+}
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+ return cpu;
+}
+#endif
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip; some tracing showed that when balancing
+ * between cgroup:/ and cgroup:/foo the number of affine wakeups increased
+ * significantly. Therefore, try to bias the error in the direction of
+ * failing the affine wakeup.
+ *
+ */
+static long effective_load(struct task_group *tg, int cpu,
+ long wl, long wg)
+{
+ struct sched_entity *se = tg->se[cpu];
+
+ if (!tg->parent)
+ return wl;
+
+ /*
+ * By not taking the decrease of shares on the other cpu into
+ * account our error leans towards reducing the affine wakeups.
+ */
+ if (!wl && sched_feat(ASYM_EFF_LOAD))
+ return wl;
+
+ for_each_sched_entity(se) {
+ long S, rw, s, a, b;
+ long more_w;
+
+ /*
+ * Instead of using this increment, also add the difference
+ * between when the shares were last updated and now.
+ */
+ more_w = se->my_q->load.weight - se->my_q->rq_weight;
+ wl += more_w;
+ wg += more_w;
+
+ S = se->my_q->tg->shares;
+ s = se->my_q->shares;
+ rw = se->my_q->rq_weight;
+
+ a = S*(rw + wl);
+ b = S*rw + s*wg;
+
+ wl = s*(a-b);
+
+ if (likely(b))
+ wl /= b;
+
+ /*
+ * Assume the group is already running and will
+ * thus already be accounted for in the weight.
+ *
+		 * That is, moving shares between CPUs does not
+ * alter the group weight.
+ */
+ wg = 0;
+ }
+
+ return wl;
+}
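+
+/*
+ * Restated (roughly): at each level the loop above computes the change in
+ * the group entity's weight as seen by its parent when wl is added on this
+ * cpu and wg to the group as a whole:
+ *
+ *	wl' = s * (S*(rw + wl) - (S*rw + s*wg)) / (S*rw + s*wg)
+ *
+ * and then propagates that delta upwards with wg cleared, since moving
+ * shares between cpus does not change the total group weight.
+ */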
+
+#else
+
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+ unsigned long wl, unsigned long wg)
+{
+ return wl;
+}
+
+#endif
+
+static int
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
+ struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+ int idx, unsigned long load, unsigned long this_load,
+ unsigned int imbalance)
+{
+ struct task_struct *curr = this_rq->curr;
+ struct task_group *tg;
+ unsigned long tl = this_load;
+ unsigned long tl_per_task;
+ unsigned long weight;
+ int balanced;
+
+ if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
+ return 0;
+
+ if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+ p->se.avg_overlap > sysctl_sched_migration_cost))
+ sync = 0;
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync) {
+ tg = task_group(current);
+ weight = current->se.load.weight;
+
+ tl += effective_load(tg, this_cpu, -weight, -weight);
+ load += effective_load(tg, prev_cpu, 0, -weight);
+ }
+
+ tg = task_group(p);
+ weight = p->se.load.weight;
+
+ balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+ imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
+
+ /*
+ * If the currently running task will sleep within
+ * a reasonable amount of time then attract this newly
+ * woken task:
+ */
+ if (sync && balanced)
+ return 1;
+
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+ if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+ tl_per_task)) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
+
+ return 1;
+ }
+ return 0;
+}
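+
+/*
+ * Illustrative example of the 'balanced' test above (assuming a typical
+ * imbalance_pct of 125): select_task_rq_fair() passes
+ * imbalance = 100 + (125 - 100)/2 = 112, so the wakeup is considered
+ * balanced as long as this_cpu's load with p's weight added does not
+ * exceed prev_cpu's load by more than roughly 12%.
+ */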
+
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+ struct sched_domain *sd, *this_sd = NULL;
+ int prev_cpu, this_cpu, new_cpu;
+ unsigned long load, this_load;
+ struct rq *this_rq;
+ unsigned int imbalance;
+ int idx;
+
+ prev_cpu = task_cpu(p);
+ this_cpu = smp_processor_id();
+ this_rq = cpu_rq(this_cpu);
+ new_cpu = prev_cpu;
+
+ if (prev_cpu == this_cpu)
+ goto out;
+ /*
+ * 'this_sd' is the first domain that both
+ * this_cpu and prev_cpu are present in:
+ */
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(prev_cpu, sd->span)) {
+ this_sd = sd;
+ break;
+ }
+ }
+
+ if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ goto out;
+
+ /*
+ * Check for affine wakeup and passive balancing possibilities.
+ */
+ if (!this_sd)
+ goto out;
+
+ idx = this_sd->wake_idx;
+
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+ load, this_load, imbalance))
+ return this_cpu;
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
+ return this_cpu;
+ }
+ }
+
+out:
+ return wake_idle(new_cpu, p);
+}
+#endif /* CONFIG_SMP */
+
+static unsigned long wakeup_gran(struct sched_entity *se)
+{
+ unsigned long gran = sysctl_sched_wakeup_granularity;
+
+ /*
+	 * More easily preempt negatively niced (heavier) tasks, while not
+	 * making it harder for positively niced tasks.
+ */
+ if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+
+ return gran;
+}
+
+/*
+ * Should 'se' preempt 'curr'?
+ *
+ * |s1
+ * |s2
+ * |s3
+ * g
+ * |<--->|c
+ *
+ * w(c, s1) = -1
+ * w(c, s2) = 0
+ * w(c, s3) = 1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran(curr);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
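+
+/*
+ * In other words: a return value of 1 means 'se' has a vruntime more than
+ * one wakeup granularity smaller than 'curr' (preempt), 0 means it is
+ * smaller but within the granularity (no preemption on that basis alone),
+ * and -1 means 'curr' already has the smaller or equal vruntime.
+ */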
+
+static void set_last_buddy(struct sched_entity *se)
+{
+ if (likely(task_of(se)->policy != SCHED_IDLE)) {
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
+ }
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ if (likely(task_of(se)->policy != SCHED_IDLE)) {
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+ }
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_entity *se = &curr->se, *pse = &p->se;
+
+ if (unlikely(rt_prio(p->prio))) {
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
+ update_rq_clock(rq);
+ update_curr(cfs_rq);
+ resched_task(curr);
+ return;
+ }
+
+ if (unlikely(p->sched_class != &fair_sched_class))
+ return;
+
+ if (unlikely(se == pse))
+ return;
+
+ /*
+ * Only set the backward buddy when the current task is still on the
+ * rq. This can happen when a wakeup gets interleaved with schedule on
+ * the ->pre_schedule() or idle_balance() point, either of which can
+ * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class, for
+	 * obvious reasons it's a bad idea to schedule back to the idle thread.
+ */
+ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ set_last_buddy(se);
+ set_next_buddy(pse);
+
+ /*
+ * We can come here with TIF_NEED_RESCHED already set from new task
+ * wake up path.
+ */
+ if (test_tsk_need_resched(curr))
+ return;
+
+ /*
+ * Batch and idle tasks do not preempt (their preemption is driven by
+ * the tick):
+ */
+ if (unlikely(p->policy != SCHED_NORMAL))
+ return;
+
+ /* Idle tasks are by definition preempted by everybody. */
+ if (unlikely(curr->policy == SCHED_IDLE)) {
+ resched_task(curr);
+ return;
+ }
+
+ if (!sched_feat(WAKEUP_PREEMPT))
+ return;
+
+ if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+ (se->avg_overlap < sysctl_sched_migration_cost &&
+ pse->avg_overlap < sysctl_sched_migration_cost))) {
+ resched_task(curr);
+ return;
+ }
+
+ find_matching_se(&se, &pse);
+
+ while (se) {
+ BUG_ON(!pse);
+
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ resched_task(curr);
+ break;
+ }
+
+ se = parent_entity(se);
+ pse = parent_entity(pse);
+ }
+}
+
+static struct task_struct *pick_next_task_fair(struct rq *rq)
+{
+ struct task_struct *p;
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ struct sched_entity *se;
+
+ if (unlikely(!cfs_rq->nr_running))
+ return NULL;
+
+ do {
+ se = pick_next_entity(cfs_rq);
+ set_next_entity(cfs_rq, se);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+ hrtick_start_fair(rq, p);
+
+ return p;
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_entity *se = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
+ }
+}
+
+#ifdef CONFIG_SMP
+/**************************************************
+ * Fair scheduling class load-balancing methods:
+ */
+
+/*
+ * Load-balancing iterator. Note: while the runqueue stays locked
+ * during the whole iteration, the current task might be
+ * dequeued so the iterator has to be dequeue-safe. Here we
+ * achieve that by always pre-iterating before returning
+ * the current task:
+ */
+static struct task_struct *
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
+{
+ struct task_struct *p = NULL;
+ struct sched_entity *se;
+
+ if (next == &cfs_rq->tasks)
+ return NULL;
+
+ se = list_entry(next, struct sched_entity, group_node);
+ p = task_of(se);
+ cfs_rq->balance_iterator = next->next;
+
+ return p;
+}
+
+static struct task_struct *load_balance_start_fair(void *arg)
+{
+ struct cfs_rq *cfs_rq = arg;
+
+ return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
+}
+
+static struct task_struct *load_balance_next_fair(void *arg)
+{
+ struct cfs_rq *cfs_rq = arg;
+
+ return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
+}
+
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+ struct cfs_rq *cfs_rq)
+{
+ struct rq_iterator cfs_rq_iterator;
+
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+ cfs_rq_iterator.arg = cfs_rq;
+
+ return balance_tasks(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &cfs_rq_iterator);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ long rem_load_move = max_load_move;
+ int busiest_cpu = cpu_of(busiest);
+ struct task_group *tg;
+
+ rcu_read_lock();
+ update_h_load(busiest_cpu);
+
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+ unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+ unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+ u64 rem_load, moved_load;
+
+ /*
+ * empty group
+ */
+ if (!busiest_cfs_rq->task_weight)
+ continue;
+
+ rem_load = (u64)rem_load_move * busiest_weight;
+ rem_load = div_u64(rem_load, busiest_h_load + 1);
+
+ moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+ rem_load, sd, idle, all_pinned, this_best_prio,
+ tg->cfs_rq[busiest_cpu]);
+
+ if (!moved_load)
+ continue;
+
+ moved_load *= busiest_h_load;
+ moved_load = div_u64(moved_load, busiest_weight + 1);
+
+ rem_load_move -= moved_load;
+ if (rem_load_move < 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ return max_load_move - rem_load_move;
+}
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return __load_balance_fair(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &busiest->cfs);
+}
+#endif
+
+static int
+move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct cfs_rq *busy_cfs_rq;
+ struct rq_iterator cfs_rq_iterator;
+
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+
+ for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+ /*
+ * pass busy_cfs_rq argument into
+ * load_balance_[start|next]_fair iterators
+ */
+ cfs_rq_iterator.arg = busy_cfs_rq;
+ if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+ &cfs_rq_iterator))
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * scheduler tick hitting a task of our scheduling class:
+ */
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &curr->se;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ entity_tick(cfs_rq, se, queued);
+ }
+}
+
+#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
+
+/*
+ * Share the fairness runtime between parent and child, so that the
+ * total amount of pressure on the CPU stays equal: new tasks
+ * get a chance to run, but frequent forkers are not allowed to
+ * monopolize the CPU. Note: the parent runqueue is locked and
+ * the child is not running yet.
+ */
+static void task_new_fair(struct rq *rq, struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+ struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+ int this_cpu = smp_processor_id();
+
+ sched_info_queued(p);
+
+ update_curr(cfs_rq);
+ place_entity(cfs_rq, se, 1);
+
+ /* 'curr' will be NULL if the child belongs to a different group */
+ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
+ curr && curr->vruntime < se->vruntime) {
+ /*
+ * Upon rescheduling, sched_class::put_prev_task() will place
+ * 'current' within the tree based on its new key value.
+ */
+ swap(curr->vruntime, se->vruntime);
+ resched_task(rq->curr);
+ }
+
+ enqueue_task_fair(rq, p, 0);
+}
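+
+/*
+ * Illustrative note: because place_entity() charges the child a START_DEBIT
+ * vslice, the child normally ends up with the larger vruntime; the swap
+ * above then hands that debit to the parent, so the child genuinely runs
+ * first when sysctl_sched_child_runs_first is set.
+ */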
+
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void prio_changed_fair(struct rq *rq, struct task_struct *p,
+ int oldprio, int running)
+{
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+	 * this runqueue and our priority is higher than the current task's.
+ */
+ if (running) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * We switched to the sched_fair class.
+ */
+static void switched_to_fair(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (running)
+ resched_task(rq->curr);
+ else
+ check_preempt_curr(rq, p, 0);
+}
+
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+ struct sched_entity *se = &rq->curr->se;
+
+ for_each_sched_entity(se)
+ set_next_entity(cfs_rq_of(se), se);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void moved_group_fair(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ update_curr(cfs_rq);
+ place_entity(cfs_rq, &p->se, 1);
+}
+#endif
+
+/*
+ * All the scheduling class methods:
+ */
+static const struct sched_class fair_sched_class = {
+ .next = &idle_sched_class,
+ .enqueue_task = enqueue_task_fair,
+ .dequeue_task = dequeue_task_fair,
+ .yield_task = yield_task_fair,
+
+ .check_preempt_curr = check_preempt_wakeup,
+
+ .pick_next_task = pick_next_task_fair,
+ .put_prev_task = put_prev_task_fair,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_fair,
+
+ .load_balance = load_balance_fair,
+ .move_one_task = move_one_task_fair,
+#endif
+
+ .set_curr_task = set_curr_task_fair,
+ .task_tick = task_tick_fair,
+ .task_new = task_new_fair,
+
+ .prio_changed = prio_changed_fair,
+ .switched_to = switched_to_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .moved_group = moved_group_fair,
+#endif
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+static void print_cfs_stats(struct seq_file *m, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ print_cfs_rq(m, cpu, cfs_rq);
+ rcu_read_unlock();
+}
+#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
new file mode 100644
index 0000000..da5d93b
--- /dev/null
+++ b/kernel/sched_features.h
@@ -0,0 +1,15 @@
+SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 1)
+SCHED_FEAT(WAKEUP_PREEMPT, 1)
+SCHED_FEAT(START_DEBIT, 1)
+SCHED_FEAT(AFFINE_WAKEUPS, 1)
+SCHED_FEAT(CACHE_HOT_BUDDY, 1)
+SCHED_FEAT(SYNC_WAKEUPS, 1)
+SCHED_FEAT(HRTICK, 0)
+SCHED_FEAT(DOUBLE_TICK, 0)
+SCHED_FEAT(ASYM_GRAN, 1)
+SCHED_FEAT(LB_BIAS, 1)
+SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
+SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
new file mode 100644
index 0000000..8a21a2e
--- /dev/null
+++ b/kernel/sched_idletask.c
@@ -0,0 +1,128 @@
+/*
+ * idle-task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE tasks which are
+ * handled in sched_fair.c)
+ */
+
+#ifdef CONFIG_SMP
+static int select_task_rq_idle(struct task_struct *p, int sync)
+{
+	return task_cpu(p); /* IDLE tasks are never migrated */
+}
+#endif /* CONFIG_SMP */
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
+{
+ resched_task(rq->idle);
+}
+
+static struct task_struct *pick_next_task_idle(struct rq *rq)
+{
+ schedstat_inc(rq, sched_goidle);
+
+ return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+{
+ spin_unlock_irq(&rq->lock);
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+#ifdef CONFIG_SMP
+static unsigned long
+load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return 0;
+}
+
+static int
+move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ return 0;
+}
+#endif
+
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+static void switched_to_idle(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ /* Can this actually happen?? */
+ if (running)
+ resched_task(rq->curr);
+ else
+ check_preempt_curr(rq, p, 0);
+}
+
+static void prio_changed_idle(struct rq *rq, struct task_struct *p,
+ int oldprio, int running)
+{
+	/* This can happen for hot-plug CPUs */
+
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+	 * this runqueue and our priority is higher than the current task's.
+ */
+ if (running) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+static const struct sched_class idle_sched_class = {
+ /* .next is NULL */
+ /* no enqueue/yield_task for idle tasks */
+
+ /* dequeue is not valid, we print a debug message there: */
+ .dequeue_task = dequeue_task_idle,
+
+ .check_preempt_curr = check_preempt_curr_idle,
+
+ .pick_next_task = pick_next_task_idle,
+ .put_prev_task = put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+
+ .load_balance = load_balance_idle,
+ .move_one_task = move_one_task_idle,
+#endif
+
+ .set_curr_task = set_curr_task_idle,
+ .task_tick = task_tick_idle,
+
+ .prio_changed = prio_changed_idle,
+ .switched_to = switched_to_idle,
+
+ /* no .task_new for idle tasks */
+};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
new file mode 100644
index 0000000..d9ba9d5
--- /dev/null
+++ b/kernel/sched_rt.c
@@ -0,0 +1,1546 @@
+/*
+ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
+ * policies)
+ */
+
+#ifdef CONFIG_SMP
+
+static inline int rt_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->rto_count);
+}
+
+static inline void rt_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpu_set(rq->cpu, rq->rd->rto_mask);
+ /*
+ * Make sure the mask is visible before we set
+ * the overload count. That is checked to determine
+ * if we should look at the mask. It would be a shame
+ * if we looked at the mask, but the mask was not
+ * updated yet.
+ */
+ wmb();
+ atomic_inc(&rq->rd->rto_count);
+}
+
+static inline void rt_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ /* the order here really doesn't matter */
+ atomic_dec(&rq->rd->rto_count);
+ cpu_clear(rq->cpu, rq->rd->rto_mask);
+}
+
+static void update_rt_migration(struct rq *rq)
+{
+ if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
+ } else if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
+}
+#endif /* CONFIG_SMP */
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return !list_empty(&rt_se->run_list);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ if (!rt_rq->tg)
+ return RUNTIME_INF;
+
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
+}
+
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+ list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = rt_se->parent)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return rt_se->my_q;
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+ if (rt_rq->rt_nr_running) {
+ if (rt_se && !on_rt_rq(rt_se))
+ enqueue_rt_entity(rt_se);
+ if (rt_rq->highest_prio < curr->prio)
+ resched_task(curr);
+ }
+}
+
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+ if (rt_se && on_rt_rq(rt_se))
+ dequeue_rt_entity(rt_se);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ struct task_struct *p;
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
+#ifdef CONFIG_SMP
+static inline cpumask_t sched_rt_period_mask(void)
+{
+ return cpu_rq(smp_processor_id())->rd->span;
+}
+#else
+static inline cpumask_t sched_rt_period_mask(void)
+{
+ return cpu_online_map;
+}
+#endif
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &rt_rq->tg->rt_bandwidth;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(def_rt_bandwidth.rt_period);
+}
+
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->rt;
+}
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ if (rt_rq->rt_nr_running)
+ resched_task(rq_of_rt_rq(rt_rq)->curr);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
+
+static inline cpumask_t sched_rt_period_mask(void)
+{
+ return cpu_online_map;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &def_rt_bandwidth;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
+static int do_balance_runtime(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int i, weight, more = 0;
+ u64 rt_period;
+
+ weight = cpus_weight(rd->span);
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ rt_period = ktime_to_ns(rt_b->rt_period);
+ for_each_cpu_mask_nr(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ if (iter == rt_rq)
+ continue;
+
+ spin_lock(&iter->rt_runtime_lock);
+ /*
+ * Either all rqs have inf runtime and there's nothing to steal
+ * or __disable_runtime() below sets a specific rq to inf to
+		 * indicate it's been disabled and disallow stealing.
+ */
+ if (iter->rt_runtime == RUNTIME_INF)
+ goto next;
+
+ /*
+ * From runqueues with spare time, take 1/n part of their
+ * spare time, but no more than our period.
+ */
+ diff = iter->rt_runtime - iter->rt_time;
+ if (diff > 0) {
+ diff = div_u64((u64)diff, weight);
+ if (rt_rq->rt_runtime + diff > rt_period)
+ diff = rt_period - rt_rq->rt_runtime;
+ iter->rt_runtime -= diff;
+ rt_rq->rt_runtime += diff;
+ more = 1;
+ if (rt_rq->rt_runtime == rt_period) {
+ spin_unlock(&iter->rt_runtime_lock);
+ break;
+ }
+ }
+next:
+ spin_unlock(&iter->rt_runtime_lock);
+ }
+ spin_unlock(&rt_b->rt_runtime_lock);
+
+ return more;
+}
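+
+/*
+ * Illustrative example: on a root domain spanning 4 cpus (weight = 4), a
+ * neighbour with 40ms of unused rt runtime donates 40ms / 4 = 10ms to the
+ * starved rt_rq; the transfer is capped at the period and the loop stops
+ * early once the borrower's runtime reaches a full period.
+ */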
+
+/*
+ * Ensure this RQ takes back all the runtime it lent to its neighbours.
+ */
+static void __disable_runtime(struct rq *rq)
+{
+ struct root_domain *rd = rq->rd;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ for_each_leaf_rt_rq(rt_rq, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ s64 want;
+ int i;
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * Either we're all inf and nobody needs to borrow, or we're
+ * already disabled and thus have nothing to do, or we have
+ * exactly the right amount of runtime to take out.
+ */
+ if (rt_rq->rt_runtime == RUNTIME_INF ||
+ rt_rq->rt_runtime == rt_b->rt_runtime)
+ goto balanced;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+
+ /*
+ * Calculate the difference between what we started out with
+		 * and what we currently have; that's the amount of runtime
+		 * we lent out and now have to reclaim.
+ */
+ want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+ /*
+ * Greedy reclaim, take back as much as we can.
+ */
+ for_each_cpu_mask(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ /*
+ * Can't reclaim from ourselves or disabled runqueues.
+ */
+ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
+ continue;
+
+ spin_lock(&iter->rt_runtime_lock);
+ if (want > 0) {
+ diff = min_t(s64, iter->rt_runtime, want);
+ iter->rt_runtime -= diff;
+ want -= diff;
+ } else {
+ iter->rt_runtime -= want;
+ want -= want;
+ }
+ spin_unlock(&iter->rt_runtime_lock);
+
+ if (!want)
+ break;
+ }
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We cannot be left wanting - that would mean some runtime
+ * leaked out of the system.
+ */
+ BUG_ON(want);
+balanced:
+ /*
+ * Disable all the borrow logic by pretending we have inf
+ * runtime - in which case borrowing doesn't make sense.
+ */
+ rt_rq->rt_runtime = RUNTIME_INF;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ spin_unlock(&rt_b->rt_runtime_lock);
+ }
+}
+
+static void disable_runtime(struct rq *rq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __disable_runtime(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ /*
+ * Reset each runqueue's bandwidth settings
+ */
+ for_each_leaf_rt_rq(rt_rq, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_b->rt_runtime;
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ spin_unlock(&rt_b->rt_runtime_lock);
+ }
+}
+
+static void enable_runtime(struct rq *rq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __enable_runtime(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+ int more = 0;
+
+ if (rt_rq->rt_time > rt_rq->rt_runtime) {
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ more = do_balance_runtime(rt_rq);
+ spin_lock(&rt_rq->rt_runtime_lock);
+ }
+
+ return more;
+}
+#else /* !CONFIG_SMP */
+static inline int balance_runtime(struct rt_rq *rt_rq)
+{
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+{
+ int i, idle = 1;
+ cpumask_t span;
+
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return 1;
+
+ span = sched_rt_period_mask();
+ for_each_cpu_mask(i, span) {
+ int enqueue = 0;
+ struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ spin_lock(&rq->lock);
+ if (rt_rq->rt_time) {
+ u64 runtime;
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ if (rt_rq->rt_throttled)
+ balance_runtime(rt_rq);
+ runtime = rt_rq->rt_runtime;
+ rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+ rt_rq->rt_throttled = 0;
+ enqueue = 1;
+ }
+ if (rt_rq->rt_time || rt_rq->rt_nr_running)
+ idle = 0;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ } else if (rt_rq->rt_nr_running)
+ idle = 0;
+
+ if (enqueue)
+ sched_rt_rq_enqueue(rt_rq);
+ spin_unlock(&rq->lock);
+ }
+
+ return idle;
+}
+
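+/*
+ * Effective priority of an rt entity: for a group entity this is the
+ * highest priority queued within the group, for a task entity it is the
+ * task's own prio.
+ */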
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
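+/*
+ * Check, after rt_time has been charged, whether this rt_rq has exceeded
+ * its (possibly rebalanced) runtime. If so the queue is marked throttled
+ * and dequeued until the next period refresh; returns non-zero when the
+ * caller should reschedule.
+ */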
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+{
+ u64 runtime = sched_rt_runtime(rt_rq);
+
+ if (rt_rq->rt_throttled)
+ return rt_rq_throttled(rt_rq);
+
+ if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
+ return 0;
+
+ balance_runtime(rt_rq);
+ runtime = sched_rt_runtime(rt_rq);
+ if (runtime == RUNTIME_INF)
+ return 0;
+
+ if (rt_rq->rt_time > runtime) {
+ rt_rq->rt_throttled = 1;
+ if (rt_rq_throttled(rt_rq)) {
+ sched_rt_rq_dequeue(rt_rq);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static void update_curr_rt(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_rt_entity *rt_se = &curr->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 delta_exec;
+
+ if (!task_has_rt_policy(curr))
+ return;
+
+ delta_exec = rq->clock - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock;
+ cpuacct_charge(curr, delta_exec);
+
+ if (!rt_bandwidth_enabled())
+ return;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_rq = rt_rq_of_se(rt_se);
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+ rt_rq->rt_time += delta_exec;
+ if (sched_rt_runtime_exceeded(rt_rq))
+ resched_task(curr);
+ }
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+}
+
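+/*
+ * Book-keeping for a newly enqueued rt entity: bump rt_nr_running, keep
+ * the cached highest_prio (and cpupri on SMP) up to date, track migratable
+ * tasks and kick the rt bandwidth timer.
+ */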
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+#ifdef CONFIG_SMP
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+#endif
+
+ rt_rq->highest_prio = rt_se_prio(rt_se);
+#ifdef CONFIG_SMP
+ if (rq->online)
+ cpupri_set(&rq->rd->cpupri, rq->cpu,
+ rt_se_prio(rt_se));
+#endif
+ }
+#endif
+#ifdef CONFIG_SMP
+ if (rt_se->nr_cpus_allowed > 1) {
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ rq->rt.rt_nr_migratory++;
+ }
+
+ update_rt_migration(rq_of_rt_rq(rt_rq));
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+#else
+ start_rt_bandwidth(&def_rt_bandwidth);
+#endif
+}
+
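+/*
+ * Undo the book-keeping of inc_rt_tasks() on dequeue, recomputing the
+ * cached highest_prio from the priority bitmap when the departing entity
+ * was the highest one.
+ */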
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+#ifdef CONFIG_SMP
+ int highest_prio = rt_rq->highest_prio;
+#endif
+
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ WARN_ON(!rt_rq->rt_nr_running);
+ rt_rq->rt_nr_running--;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ if (rt_rq->rt_nr_running) {
+ struct rt_prio_array *array;
+
+ WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
+ if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
+ /* recalculate */
+ array = &rt_rq->active;
+ rt_rq->highest_prio =
+ sched_find_first_bit(array->bitmap);
+ } /* otherwise leave rq->highest prio alone */
+ } else
+ rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
+#ifdef CONFIG_SMP
+ if (rt_se->nr_cpus_allowed > 1) {
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ rq->rt.rt_nr_migratory--;
+ }
+
+ if (rt_rq->highest_prio != highest_prio) {
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (rq->online)
+ cpupri_set(&rq->rd->cpupri, rq->cpu,
+ rt_rq->highest_prio);
+ }
+
+ update_rt_migration(rq_of_rt_rq(rt_rq));
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+ /*
+ * Don't enqueue the group if it's throttled, or when it is empty.
+ * The latter is a consequence of the former when a child group
+ * gets throttled and the current group doesn't have any other
+ * active members.
+ */
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ return;
+
+ list_add_tail(&rt_se->run_list, queue);
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+
+ inc_rt_tasks(rt_se, rt_rq);
+}
+
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+
+ list_del_init(&rt_se->run_list);
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ dec_rt_tasks(rt_se, rt_rq);
+}
+
+/*
+ * Because the prio of an upper entry depends on the lower
+ * entries, we must remove entries top-down.
+ */
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+{
+ struct sched_rt_entity *back = NULL;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_se->back = back;
+ back = rt_se;
+ }
+
+ for (rt_se = back; rt_se; rt_se = rt_se->back) {
+ if (on_rt_rq(rt_se))
+ __dequeue_rt_entity(rt_se);
+ }
+}
+
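+/*
+ * Enqueue walks the hierarchy in two passes: dequeue_rt_stack() removes
+ * the whole stack top-down, then every level is re-inserted from the task
+ * entity upwards.
+ */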
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ dequeue_rt_stack(rt_se);
+ for_each_sched_rt_entity(rt_se)
+ __enqueue_rt_entity(rt_se);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ dequeue_rt_stack(rt_se);
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq && rt_rq->rt_nr_running)
+ __enqueue_rt_entity(rt_se);
+ }
+}
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ if (wakeup)
+ rt_se->timeout = 0;
+
+ enqueue_rt_entity(rt_se);
+
+ inc_cpu_load(rq, p->se.load.weight);
+}
+
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ update_curr_rt(rq);
+ dequeue_rt_entity(rt_se);
+
+ dec_cpu_load(rq, p->se.load.weight);
+}
+
+/*
+ * Put the task at the end of the run list without the overhead of dequeue
+ * followed by enqueue.
+ */
+static void
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
+{
+ if (on_rt_rq(rt_se)) {
+ struct rt_prio_array *array = &rt_rq->active;
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+ if (head)
+ list_move(&rt_se->run_list, queue);
+ else
+ list_move_tail(&rt_se->run_list, queue);
+ }
+}
+
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_rq = rt_rq_of_se(rt_se);
+ requeue_rt_entity(rt_rq, rt_se, head);
+ }
+}
+
+static void yield_task_rt(struct rq *rq)
+{
+ requeue_task_rt(rq, rq->curr, 0);
+}
+
+#ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
+static int select_task_rq_rt(struct task_struct *p, int sync)
+{
+ struct rq *rq = task_rq(p);
+
+ /*
+ * If the current task is an RT task, then
+ * try to see if we can wake this RT task up on another
+ * runqueue. Otherwise simply start this RT task
+ * on its current runqueue.
+ *
+ * We want to avoid overloading runqueues, even if
+ * the RT task is of higher priority than the current RT task.
+ * RT tasks behave differently from other tasks: if
+ * one gets preempted, we try to push it off to another queue.
+ * Trying to keep a preempting RT task on the same
+ * cache-hot CPU would force the running RT task onto
+ * a cold CPU, so we would waste all the cache of the lower
+ * RT task in hopes of saving some cache for an RT task
+ * that is just being woken and will probably have a
+ * cold cache anyway.
+ */
+ if (unlikely(rt_task(rq->curr)) &&
+ (p->rt.nr_cpus_allowed > 1)) {
+ int cpu = find_lowest_rq(p);
+
+ return (cpu == -1) ? task_cpu(p) : cpu;
+ }
+
+ /*
+ * Otherwise, just let it ride on the affined RQ and the
+ * post-schedule router will push the preempted task away
+ */
+ return task_cpu(p);
+}
+
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+ cpumask_t mask;
+
+ if (rq->curr->rt.nr_cpus_allowed == 1)
+ return;
+
+ if (p->rt.nr_cpus_allowed != 1
+ && cpupri_find(&rq->rd->cpupri, p, &mask))
+ return;
+
+ if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+ return;
+
+ /*
+ * There appear to be other cpus that can accept
+ * current and none that can run 'p', so let's reschedule
+ * to try and push current away:
+ */
+ requeue_task_rt(rq, p, 1);
+ resched_task(rq->curr);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
+{
+ if (p->prio < rq->curr->prio) {
+ resched_task(rq->curr);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If:
+ *
+ * - the newly woken task is of equal priority to the current task
+ * - the newly woken task is non-migratable while current is migratable
+ * - current will be preempted on the next reschedule
+ *
+ * we should check to see if current can readily move to a different
+ * cpu. If so, we will reschedule to allow the push logic to try
+ * to move current somewhere else, making room for our non-migratable
+ * task.
+ */
+ if (p->prio == rq->curr->prio && !need_resched())
+ check_preempt_equal_prio(rq, p);
+#endif
+}
+
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+ struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *next = NULL;
+ struct list_head *queue;
+ int idx;
+
+ idx = sched_find_first_bit(array->bitmap);
+ BUG_ON(idx >= MAX_RT_PRIO);
+
+ queue = array->queue + idx;
+ next = list_entry(queue->next, struct sched_rt_entity, run_list);
+
+ return next;
+}
+
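+/*
+ * Pick the highest-priority runnable RT task by descending through the
+ * group hierarchy until a task entity is reached.
+ */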
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct sched_rt_entity *rt_se;
+ struct task_struct *p;
+ struct rt_rq *rt_rq;
+
+ rt_rq = &rq->rt;
+
+ if (unlikely(!rt_rq->rt_nr_running))
+ return NULL;
+
+ if (rt_rq_throttled(rt_rq))
+ return NULL;
+
+ do {
+ rt_se = pick_next_rt_entity(rq, rt_rq);
+ BUG_ON(!rt_se);
+ rt_rq = group_rt_rq(rt_se);
+ } while (rt_rq);
+
+ p = rt_task_of(rt_se);
+ p->se.exec_start = rq->clock;
+ return p;
+}
+
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+{
+ update_curr_rt(rq);
+ p->se.exec_start = 0;
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
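+/*
+ * A task is a candidate for pushing/pulling if it is not currently
+ * running, may run on 'cpu' (or on any cpu when cpu < 0) and is allowed
+ * on more than one cpu.
+ */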
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+ (p->rt.nr_cpus_allowed > 1))
+ return 1;
+ return 0;
+}
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+{
+ struct task_struct *next = NULL;
+ struct sched_rt_entity *rt_se;
+ struct rt_prio_array *array;
+ struct rt_rq *rt_rq;
+ int idx;
+
+ for_each_leaf_rt_rq(rt_rq, rq) {
+ array = &rt_rq->active;
+ idx = sched_find_first_bit(array->bitmap);
+ next_idx:
+ if (idx >= MAX_RT_PRIO)
+ continue;
+ if (next && next->prio < idx)
+ continue;
+ list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ struct task_struct *p = rt_task_of(rt_se);
+ if (pick_rt_task(rq, p, cpu)) {
+ next = p;
+ break;
+ }
+ }
+ if (!next) {
+ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+ goto next_idx;
+ }
+ }
+
+ return next;
+}
+
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+
+static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+{
+ int first;
+
+ /* "this_cpu" is cheaper to preempt than a remote processor */
+ if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+ return this_cpu;
+
+ first = first_cpu(*mask);
+ if (first != NR_CPUS)
+ return first;
+
+ return -1;
+}
+
+static int find_lowest_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+ int this_cpu = smp_processor_id();
+ int cpu = task_cpu(task);
+
+ if (task->rt.nr_cpus_allowed == 1)
+ return -1; /* No other targets possible */
+
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return -1; /* No targets found */
+
+ /*
+ * Only consider CPUs that are usable for migration.
+ * I guess we might want to change cpupri_find() to ignore those
+ * in the first place.
+ */
+ cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
+ /*
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ *
+ * We prioritize the last cpu that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+ if (cpu_isset(cpu, *lowest_mask))
+ return cpu;
+
+ /*
+ * Otherwise, we consult the sched_domains span maps to figure
+ * out which cpu is logically closest to our hot cache data.
+ */
+ if (this_cpu == cpu)
+ this_cpu = -1; /* Skip this_cpu opt if the same */
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ cpumask_t domain_mask;
+ int best_cpu;
+
+ cpus_and(domain_mask, sd->span, *lowest_mask);
+
+ best_cpu = pick_optimal_cpu(this_cpu,
+ &domain_mask);
+ if (best_cpu != -1)
+ return best_cpu;
+ }
+ }
+
+ /*
+ * And finally, if there were no matches within the domains
+ * just give the caller *something* to work with from the compatible
+ * locations.
+ */
+ return pick_optimal_cpu(this_cpu, lowest_mask);
+}
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *lowest_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+ cpu = find_lowest_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ lowest_rq = cpu_rq(cpu);
+
+ /* if the prio of this runqueue changed, try again */
+ if (double_lock_balance(rq, lowest_rq)) {
+ /*
+ * We had to unlock the run queue. In
+ * the meantime, the task could have
+ * migrated already or had its affinity changed.
+ * Also make sure that it wasn't scheduled on its rq.
+ */
+ if (unlikely(task_rq(task) != rq ||
+ !cpu_isset(lowest_rq->cpu,
+ task->cpus_allowed) ||
+ task_running(rq, task) ||
+ !task->se.on_rq)) {
+
+ spin_unlock(&lowest_rq->lock);
+ lowest_rq = NULL;
+ break;
+ }
+ }
+
+ /* If this rq is still suitable use it. */
+ if (lowest_rq->rt.highest_prio > task->prio)
+ break;
+
+ /* try again */
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rq = NULL;
+ }
+
+ return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the
+ * non-running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *lowest_rq;
+ int ret = 0;
+ int paranoid = RT_MAX_TRIES;
+
+ if (!rq->rt.overloaded)
+ return 0;
+
+ next_task = pick_next_highest_task_rt(rq, -1);
+ if (!next_task)
+ return 0;
+
+ retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * It's possible that next_task slipped in with a
+ * higher priority than current. If that's the case,
+ * just reschedule current.
+ */
+ if (unlikely(next_task->prio < rq->curr->prio)) {
+ resched_task(rq->curr);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* find_lock_lowest_rq locks the rq if found */
+ lowest_rq = find_lock_lowest_rq(next_task, rq);
+ if (!lowest_rq) {
+ struct task_struct *task;
+ /*
+ * find_lock_lowest_rq releases rq->lock,
+ * so it is possible that next_task has changed.
+ * If it has, then try again.
+ */
+ task = pick_next_highest_task_rt(rq, -1);
+ if (unlikely(task != next_task) && task && paranoid--) {
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+ goto out;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, lowest_rq->cpu);
+ activate_task(lowest_rq, next_task, 0);
+
+ resched_task(lowest_rq->curr);
+
+ double_unlock_balance(rq, lowest_rq);
+
+ ret = 1;
+out:
+ put_task_struct(next_task);
+
+ return ret;
+}
+
+/*
+ * TODO: Currently we just use the second highest prio task on
+ * the queue, and stop when it can't migrate (or there are
+ * no more RT tasks). There may be a case where a lower-
+ * priority RT task has a different affinity than the
+ * higher RT task. In this case the lower RT task could
+ * possibly be able to migrate, whereas the higher priority
+ * RT task could not. We currently ignore this issue.
+ * Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+ /* push_rt_task will return true if it moved an RT */
+ while (push_rt_task(rq))
+ ;
+}
+
+static int pull_rt_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct task_struct *p, *next;
+ struct rq *src_rq;
+
+ if (likely(!rt_overloaded(this_rq)))
+ return 0;
+
+ next = pick_next_task_rt(this_rq);
+
+ for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+ /*
+ * We can potentially drop this_rq's lock in
+ * double_lock_balance, and another CPU could
+ * steal our next task - hence we must cause
+ * the caller to recalculate the next task
+ * in that case:
+ */
+ if (double_lock_balance(this_rq, src_rq)) {
+ struct task_struct *old_next = next;
+
+ next = pick_next_task_rt(this_rq);
+ if (next != old_next)
+ ret = 1;
+ }
+
+ /*
+ * Are there still pullable RT tasks?
+ */
+ if (src_rq->rt.rt_nr_running <= 1)
+ goto skip;
+
+ p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+ /*
+ * Do we have an RT task that preempts
+ * the to-be-scheduled task?
+ */
+ if (p && (!next || (p->prio < next->prio))) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!p->se.on_rq);
+
+ /*
+ * There's a chance that p is higher in priority
+ * than what's currently running on its cpu.
+ * This is just because p is waking up and hasn't
+ * had a chance to schedule. We only pull
+ * p if it is lower in priority than the
+ * current task on its run queue, or if
+ * this_rq's next task is lower in prio than
+ * the current task on that rq.
+ */
+ if (p->prio < src_rq->curr->prio ||
+ (next && next->prio < src_rq->curr->prio))
+ goto skip;
+
+ ret = 1;
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ /*
+ * We continue with the search, just in
+ * case there's an even higher prio task
+ * in another runqueue. (low likelihood
+ * but possible)
+ *
+ * Update next so that we won't pick a task
+ * on another cpu with a priority lower than
+ * (or equal to) the one we just picked.
+ */
+ next = p;
+
+ }
+ skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ return ret;
+}
+
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ pull_rt_task(rq);
+}
+
+static void post_schedule_rt(struct rq *rq)
+{
+ /*
+ * If we have more than one rt_task queued, then
+ * see if we can push the other rt_tasks off to other CPUs.
+ * Note we may release the rq lock, and since
+ * the lock was owned by prev, we need to release it
+ * first via finish_lock_switch and then reacquire it here.
+ */
+ if (unlikely(rq->rt.overloaded)) {
+ spin_lock_irq(&rq->lock);
+ push_rt_tasks(rq);
+ spin_unlock_irq(&rq->lock);
+ }
+}
+
+/*
+ * If we are not running and we are not going to reschedule soon, we should
+ * try to push tasks away now
+ */
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ rq->rt.overloaded)
+ push_rt_tasks(rq);
+}
+
+static unsigned long
+load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ /* don't touch RT tasks */
+ return 0;
+}
+
+static int
+move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ /* don't touch RT tasks */
+ return 0;
+}
+
+static void set_cpus_allowed_rt(struct task_struct *p,
+ const cpumask_t *new_mask)
+{
+ int weight = cpus_weight(*new_mask);
+
+ BUG_ON(!rt_task(p));
+
+ /*
+ * Update the migration status of the RQ if we have an RT task
+ * which is running AND changing its weight value.
+ */
+ if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
+ struct rq *rq = task_rq(p);
+
+ if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
+ rq->rt.rt_nr_migratory++;
+ } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
+ BUG_ON(!rq->rt.rt_nr_migratory);
+ rq->rt.rt_nr_migratory--;
+ }
+
+ update_rt_migration(rq);
+ }
+
+ p->cpus_allowed = *new_mask;
+ p->rt.nr_cpus_allowed = weight;
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_set_overload(rq);
+
+ __enable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_clear_overload(rq);
+
+ __disable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
+}
+
+/*
+ * When switching away from the rt queue, we bring ourselves to a position
+ * where we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ /*
+ * If there are other RT tasks then we will reschedule
+ * and the scheduling of the other RT tasks will handle
+ * the balancing. But if we are the last RT task
+ * we may need to handle the pulling of RT tasks
+ * now.
+ */
+ if (!rq->rt.rt_nr_running)
+ pull_rt_task(rq);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ int check_resched = 1;
+
+ /*
+ * If we are already running, then there's nothing
+ * that needs to be done. But if we are not running
+ * we may need to preempt the currently running task.
+ * If that currently running task is also an RT task
+ * then see if we can move to another run queue.
+ */
+ if (!running) {
+#ifdef CONFIG_SMP
+ if (rq->rt.overloaded && push_rt_task(rq) &&
+ /* Don't resched if we changed runqueues */
+ rq != task_rq(p))
+ check_resched = 0;
+#endif /* CONFIG_SMP */
+ if (check_resched && p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+ int oldprio, int running)
+{
+ if (running) {
+#ifdef CONFIG_SMP
+ /*
+ * If our priority decreases while running, we
+ * may need to pull tasks to this runqueue.
+ */
+ if (oldprio < p->prio)
+ pull_rt_task(rq);
+ /*
+ * If there's a higher priority task waiting to run
+ * then reschedule. Note, the above pull_rt_task
+ * can release the rq lock and p could migrate.
+ * Only reschedule if p is still on the same runqueue.
+ */
+ if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ resched_task(p);
+#else
+ /* For UP simply resched on drop of prio */
+ if (oldprio < p->prio)
+ resched_task(p);
+#endif /* CONFIG_SMP */
+ } else {
+ /*
+ * This task is not running, but if its priority
+ * is higher than that of the currently running task,
+ * then reschedule.
+ */
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
+}
+
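+/*
+ * RLIMIT_RTTIME watchdog: count the ticks this task has spent running as
+ * an RT task and, once the soft limit (converted to ticks) is exceeded,
+ * set the per-thread expiry so the posix cpu-timer code acts on it.
+ */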
+static void watchdog(struct rq *rq, struct task_struct *p)
+{
+ unsigned long soft, hard;
+
+ if (!p->signal)
+ return;
+
+ soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
+ hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
+
+ if (soft != RLIM_INFINITY) {
+ unsigned long next;
+
+ p->rt.timeout++;
+ next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
+ if (p->rt.timeout > next)
+ p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ }
+}
+
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
+{
+ update_curr_rt(rq);
+
+ watchdog(rq, p);
+
+ /*
+ * RR tasks need a special form of timeslice management.
+ * FIFO tasks have no timeslices.
+ */
+ if (p->policy != SCHED_RR)
+ return;
+
+ if (--p->rt.time_slice)
+ return;
+
+ p->rt.time_slice = DEF_TIMESLICE;
+
+ /*
+ * Requeue to the end of queue if we are not the only element
+ * on the queue:
+ */
+ if (p->rt.run_list.prev != p->rt.run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ }
+}
+
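+/*
+ * Reset the accounting baseline for the current task so update_curr_rt()
+ * measures its runtime from this point on.
+ */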
+static void set_curr_task_rt(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq->clock;
+}
+
+static const struct sched_class rt_sched_class = {
+ .next = &fair_sched_class,
+ .enqueue_task = enqueue_task_rt,
+ .dequeue_task = dequeue_task_rt,
+ .yield_task = yield_task_rt,
+
+ .check_preempt_curr = check_preempt_curr_rt,
+
+ .pick_next_task = pick_next_task_rt,
+ .put_prev_task = put_prev_task_rt,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_rt,
+
+ .load_balance = load_balance_rt,
+ .move_one_task = move_one_task_rt,
+ .set_cpus_allowed = set_cpus_allowed_rt,
+ .rq_online = rq_online_rt,
+ .rq_offline = rq_offline_rt,
+ .pre_schedule = pre_schedule_rt,
+ .post_schedule = post_schedule_rt,
+ .task_wake_up = task_wake_up_rt,
+ .switched_from = switched_from_rt,
+#endif
+
+ .set_curr_task = set_curr_task_rt,
+ .task_tick = task_tick_rt,
+
+ .prio_changed = prio_changed_rt,
+ .switched_to = switched_to_rt,
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+
+static void print_rt_stats(struct seq_file *m, int cpu)
+{
+ struct rt_rq *rt_rq;
+
+ rcu_read_lock();
+ for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+ print_rt_rq(m, cpu, rt_rq);
+ rcu_read_unlock();
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
new file mode 100644
index 0000000..7dbf72a
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,372 @@
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 14
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+ int cpu;
+ int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
+ char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+
+ if (mask_str == NULL)
+ return -ENOMEM;
+
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+ struct sched_domain *sd;
+ int dcount = 0;
+#endif
+
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_both_empty,
+ rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
+ rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_sched_info.cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+ seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+ /* domain-specific stats */
+ preempt_disable();
+ for_each_domain(cpu, sd) {
+ enum cpu_idle_type itype;
+
+ cpumask_scnprintf(mask_str, mask_len, sd->span);
+ seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+ itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
+ sd->lb_balanced[itype],
+ sd->lb_failed[itype],
+ sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
+ sd->lb_nobusyq[itype],
+ sd->lb_nobusyg[itype]);
+ }
+ seq_printf(seq,
+ " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
+ }
+ preempt_enable();
+#endif
+ }
+ kfree(mask_str);
+ return 0;
+}
+
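+/*
+ * Pre-size the seq_file buffer so a single read() can return the whole
+ * schedstat output even on machines with many cpus.
+ */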
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+ char *buf = kmalloc(size, GFP_KERNEL);
+ struct seq_file *m;
+ int res;
+
+ if (!buf)
+ return -ENOMEM;
+ res = single_open(file, show_schedstat, NULL);
+ if (!res) {
+ m = file->private_data;
+ m->buf = buf;
+ m->size = size;
+ } else
+ kfree(buf);
+ return res;
+}
+
+static const struct file_operations proc_schedstat_operations = {
+ .open = schedstat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_schedstat_init(void)
+{
+ proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ return 0;
+}
+module_init(proc_schedstat_init);
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{
+ if (rq) {
+ rq->rq_sched_info.run_delay += delta;
+ rq->rq_sched_info.pcount++;
+ }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_sched_info.cpu_time += delta;
+}
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_sched_info.run_delay += delta;
+}
+# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val) do { var = (val); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{}
+# define schedstat_inc(rq, field) do { } while (0)
+# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+#endif
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+ t->sched_info.last_queued = 0;
+}
+
+/*
+ * Called when a process is dequeued from the active array and given
+ * the cpu. We should note that with the exception of interactive
+ * tasks, the expired queue will become the active queue after the active
+ * queue is empty, without explicitly dequeuing and requeuing tasks in the
+ * expired queue. (Interactive tasks may be requeued directly to the
+ * active queue, thus delaying tasks in the expired queue from running;
+ * see scheduler_tick()).
+ *
+ * Though we are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
+ * delta taken on each cpu would annul the skew.
+ */
+static inline void sched_info_dequeued(struct task_struct *t)
+{
+ unsigned long long now = task_rq(t)->clock, delta = 0;
+
+ if (unlikely(sched_info_on()))
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+
+ rq_sched_info_dequeued(task_rq(t), delta);
+}
+
+/*
+ * Called when a task finally hits the cpu. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(struct task_struct *t)
+{
+ unsigned long long now = task_rq(t)->clock, delta = 0;
+
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+ t->sched_info.last_arrival = now;
+ t->sched_info.pcount++;
+
+ rq_sched_info_arrive(task_rq(t), delta);
+}
+
+/*
+ * Called when a process is queued into either the active or expired
+ * array. The time is noted and later used to determine how long we
+ * had to wait for us to reach the cpu. Since the expired queue will
+ * become the active queue after the active queue is empty, without dequeuing
+ * and requeuing any tasks, we are interested in queuing to either. It
+ * is unusual but not impossible for tasks to be dequeued and immediately
+ * requeued in the same or another array: this can happen in sched_yield(),
+ * set_user_nice(), and even load_balance() as it moves tasks from runqueue
+ * to runqueue.
+ *
+ * This function is only called from enqueue_task(), but it also only updates
+ * the timestamp if it is not already set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(struct task_struct *t)
+{
+ if (unlikely(sched_info_on()))
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = task_rq(t)->clock;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
+ */
+static inline void sched_info_depart(struct task_struct *t)
+{
+ unsigned long long delta = task_rq(t)->clock -
+ t->sched_info.last_arrival;
+
+ t->sched_info.cpu_time += delta;
+ rq_sched_info_depart(task_rq(t), delta);
+
+ if (t->state == TASK_RUNNING)
+ sched_info_queued(t);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+ struct rq *rq = task_rq(prev);
+
+ /*
+ * prev now departs the cpu. It's not interesting to record
+ * stats about how efficient we were at scheduling the idle
+ * process, however.
+ */
+ if (prev != rq->idle)
+ sched_info_depart(prev);
+
+ if (next != rq->idle)
+ sched_info_arrive(next);
+}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(sched_info_on()))
+ __sched_info_switch(prev, next);
+}
+#else
+#define sched_info_queued(t) do { } while (0)
+#define sched_info_reset_dequeued(t) do { } while (0)
+#define sched_info_dequeued(t) do { } while (0)
+#define sched_info_switch(t, next) do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct signal_struct *sig;
+
+ /* tsk == current, ensure it is safe to use ->signal */
+ if (unlikely(tsk->exit_state))
+ return;
+
+ sig = tsk->signal;
+ if (sig->cputime.totals) {
+ struct task_cputime *times;
+
+ times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+ times->utime = cputime_add(times->utime, cputime);
+ put_cpu_no_resched();
+ }
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct signal_struct *sig;
+
+ /* tsk == current, ensure it is safe to use ->signal */
+ if (unlikely(tsk->exit_state))
+ return;
+
+ sig = tsk->signal;
+ if (sig->cputime.totals) {
+ struct task_cputime *times;
+
+ times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+ times->stime = cputime_add(times->stime, cputime);
+ put_cpu_no_resched();
+ }
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ * of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+ unsigned long long ns)
+{
+ struct signal_struct *sig;
+
+ sig = tsk->signal;
+ /* see __exit_signal()->task_rq_unlock_wait() */
+ barrier();
+ if (unlikely(!sig))
+ return;
+
+ if (sig->cputime.totals) {
+ struct task_cputime *times;
+
+ times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+ times->sum_exec_runtime += ns;
+ put_cpu_no_resched();
+ }
+}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
new file mode 100644
index 0000000..57d4b13
--- /dev/null
+++ b/kernel/seccomp.c
@@ -0,0 +1,86 @@
+/*
+ * linux/kernel/seccomp.c
+ *
+ * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
+ *
+ * This defines a simple but solid secure-computing mode.
+ */
+
+#include <linux/seccomp.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+
+/* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 1
+
+/*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static int mode1_syscalls[] = {
+ __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+ 0, /* null terminated */
+};
+
+#ifdef CONFIG_COMPAT
+static int mode1_syscalls_32[] = {
+ __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
+ 0, /* null terminated */
+};
+#endif
+
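+/*
+ * Enforce seccomp for the current syscall: mode 1 permits only the
+ * syscalls listed above, anything else terminates the task with SIGKILL.
+ */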
+void __secure_computing(int this_syscall)
+{
+ int mode = current->seccomp.mode;
+ int * syscall;
+
+ switch (mode) {
+ case 1:
+ syscall = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ syscall = mode1_syscalls_32;
+#endif
+ do {
+ if (*syscall == this_syscall)
+ return;
+ } while (*++syscall);
+ break;
+ default:
+ BUG();
+ }
+
+#ifdef SECCOMP_DEBUG
+ dump_stack();
+#endif
+ do_exit(SIGKILL);
+}
+
+long prctl_get_seccomp(void)
+{
+ return current->seccomp.mode;
+}
+
+long prctl_set_seccomp(unsigned long seccomp_mode)
+{
+ long ret;
+
+ /* can set it only once to be even more secure */
+ ret = -EPERM;
+ if (unlikely(current->seccomp.mode))
+ goto out;
+
+ ret = -EINVAL;
+ if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
+ ret = 0;
+ }
+
+ out:
+ return ret;
+}
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
new file mode 100644
index 0000000..94a62c0
--- /dev/null
+++ b/kernel/semaphore.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks, which enforce
+ * rules that allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we
+ * have to disable interrupts when taking the lock. It turns out various
+ * parts of the kernel expect to be able to use down() on a semaphore in
+ * interrupt context when they know it will succeed, so we have to use
+ * irqsave variants for down(), down_interruptible() and down_killable()
+ * too.
+ *
+ * The ->count variable represents how many more tasks can acquire this
+ * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/ftrace.h>
+
+static noinline void __down(struct semaphore *sem);
+static noinline int __down_interruptible(struct semaphore *sem);
+static noinline int __down_killable(struct semaphore *sem);
+static noinline int __down_timeout(struct semaphore *sem, long jiffies);
+static noinline void __up(struct semaphore *sem);
+
+/**
+ * down - acquire the semaphore
+ * @sem: the semaphore to be acquired
+ *
+ * Acquires the semaphore. If no more tasks are allowed to acquire the
+ * semaphore, calling this function will put the task to sleep until the
+ * semaphore is released.
+ *
+ * Use of this function is deprecated, please use down_interruptible() or
+ * down_killable() instead.
+ */
+void down(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ __down(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(down);
+
+/**
+ * down_interruptible - acquire the semaphore unless interrupted
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a signal, this function will return -EINTR.
+ * If the semaphore is successfully acquired, this function returns 0.
+ */
+int down_interruptible(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_interruptible(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_interruptible);
+
+/**
+ * down_killable - acquire the semaphore unless killed
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a fatal signal, this function will return
+ * -EINTR. If the semaphore is successfully acquired, this function returns
+ * 0.
+ */
+int down_killable(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_killable(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_killable);
+
+/**
+ * down_trylock - try to acquire the semaphore, without waiting
+ * @sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
+ * been acquired successfully or 1 if it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * Unlike mutex_trylock, this function can be used from interrupt context,
+ * and the semaphore can be released by any task or interrupt.
+ */
+int down_trylock(struct semaphore *sem)
+{
+ unsigned long flags;
+ int count;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ count = sem->count - 1;
+ if (likely(count >= 0))
+ sem->count = count;
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return (count < 0);
+}
+EXPORT_SYMBOL(down_trylock);
+
+/**
+ * down_timeout - acquire the semaphore within a specified time
+ * @sem: the semaphore to be acquired
+ * @jiffies: how long to wait before failing
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the semaphore is not released within the specified number of jiffies,
+ * this function returns -ETIME. It returns 0 if the semaphore was acquired.
+ */
+int down_timeout(struct semaphore *sem, long jiffies)
+{
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_timeout(sem, jiffies);
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_timeout);
+
+/**
+ * up - release the semaphore
+ * @sem: the semaphore to release
+ *
+ * Release the semaphore. Unlike mutexes, up() may be called from any
+ * context and even by tasks which have never called down().
+ */
+void up(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(list_empty(&sem->wait_list)))
+ sem->count++;
+ else
+ __up(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(up);
+
+/* Functions for the contended case */
+
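+/*
+ * One of these lives on the stack of each sleeping task and is linked
+ * into sem->wait_list; 'up' is set once the waiter has been granted the
+ * semaphore by __up().
+ */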
+struct semaphore_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ int up;
+};
+
+/*
+ * Because this function is inlined, the 'state' parameter will be
+ * constant, and thus optimised away by the compiler. Likewise the
+ * 'timeout' parameter for the cases without timeouts.
+ */
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ struct task_struct *task = current;
+ struct semaphore_waiter waiter;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+ waiter.task = task;
+ waiter.up = 0;
+
+ for (;;) {
+ if (signal_pending_state(state, task))
+ goto interrupted;
+ if (timeout <= 0)
+ goto timed_out;
+ __set_task_state(task, state);
+ spin_unlock_irq(&sem->lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&sem->lock);
+ if (waiter.up)
+ return 0;
+ }
+
+ timed_out:
+ list_del(&waiter.list);
+ return -ETIME;
+
+ interrupted:
+ list_del(&waiter.list);
+ return -EINTR;
+}
+
+static noinline void __sched __down(struct semaphore *sem)
+{
+ __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_interruptible(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_killable(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
+{
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
+}
+
+static noinline void __sched __up(struct semaphore *sem)
+{
+ struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
+ struct semaphore_waiter, list);
+ list_del(&waiter->list);
+ waiter->up = 1;
+ wake_up_process(waiter->task);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
new file mode 100644
index 0000000..28859a9
--- /dev/null
+++ b/kernel/signal.c
@@ -0,0 +1,2592 @@
+/*
+ * linux/kernel/signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
+ *
+ * 2003-06-02 Jim Houston - Concurrent Computer Corp.
+ * Changes to use preallocated sigqueue structures
+ * to allow signals to be sent reliably.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/tty.h>
+#include <linux/binfmts.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/ptrace.h>
+#include <linux/signal.h>
+#include <linux/signalfd.h>
+#include <linux/tracehook.h>
+#include <linux/capability.h>
+#include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
+#include <trace/sched.h>
+
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/siginfo.h>
+#include "audit.h" /* audit_signal_info() */
+
+/*
+ * SLAB caches for signal bits.
+ */
+
+static struct kmem_cache *sigqueue_cachep;
+
+static void __user *sig_handler(struct task_struct *t, int sig)
+{
+ return t->sighand->action[sig - 1].sa.sa_handler;
+}
+
+static int sig_handler_ignored(void __user *handler, int sig)
+{
+ /* Is it explicitly or implicitly ignored? */
+ return handler == SIG_IGN ||
+ (handler == SIG_DFL && sig_kernel_ignore(sig));
+}
+
+static int sig_ignored(struct task_struct *t, int sig)
+{
+ void __user *handler;
+
+ /*
+ * Blocked signals are never ignored, since the
+ * signal handler may change by the time it is
+ * unblocked.
+ */
+ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
+ return 0;
+
+ handler = sig_handler(t, sig);
+ if (!sig_handler_ignored(handler, sig))
+ return 0;
+
+ /*
+ * Tracers may want to know about even ignored signals.
+ */
+ return !tracehook_consider_ignored_signal(t, sig, handler);
+}
+
+/*
+ * Re-calculate pending state from the set of locally pending
+ * signals, globally pending signals, and blocked signals.
+ */
+static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+{
+ unsigned long ready;
+ long i;
+
+ switch (_NSIG_WORDS) {
+ default:
+ for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+ ready |= signal->sig[i] &~ blocked->sig[i];
+ break;
+
+ case 4: ready = signal->sig[3] &~ blocked->sig[3];
+ ready |= signal->sig[2] &~ blocked->sig[2];
+ ready |= signal->sig[1] &~ blocked->sig[1];
+ ready |= signal->sig[0] &~ blocked->sig[0];
+ break;
+
+ case 2: ready = signal->sig[1] &~ blocked->sig[1];
+ ready |= signal->sig[0] &~ blocked->sig[0];
+ break;
+
+ case 1: ready = signal->sig[0] &~ blocked->sig[0];
+ }
+ return ready != 0;
+}
+
+#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
+
+static int recalc_sigpending_tsk(struct task_struct *t)
+{
+ if (t->signal->group_stop_count > 0 ||
+ PENDING(&t->pending, &t->blocked) ||
+ PENDING(&t->signal->shared_pending, &t->blocked)) {
+ set_tsk_thread_flag(t, TIF_SIGPENDING);
+ return 1;
+ }
+ /*
+ * We must never clear the flag in another thread, or in current
+ * when it's possible the current syscall is returning -ERESTART*.
+ * So we don't clear it here; only callers who know they should do so.
+ */
+ return 0;
+}
+
+/*
+ * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
+ * This is superfluous when called on current, the wakeup is a harmless no-op.
+ */
+void recalc_sigpending_and_wake(struct task_struct *t)
+{
+ if (recalc_sigpending_tsk(t))
+ signal_wake_up(t, 0);
+}
+
+void recalc_sigpending(void)
+{
+ if (unlikely(tracehook_force_sigpending()))
+ set_thread_flag(TIF_SIGPENDING);
+ else if (!recalc_sigpending_tsk(current) && !freezing(current))
+ clear_thread_flag(TIF_SIGPENDING);
+
+}
+
+/* Given the mask, find the first available signal that should be serviced. */
+
+int next_signal(struct sigpending *pending, sigset_t *mask)
+{
+ unsigned long i, *s, *m, x;
+ int sig = 0;
+
+ s = pending->signal.sig;
+ m = mask->sig;
+ switch (_NSIG_WORDS) {
+ default:
+ for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m)
+ if ((x = *s &~ *m) != 0) {
+ sig = ffz(~x) + i*_NSIG_BPW + 1;
+ break;
+ }
+ break;
+
+ case 2: if ((x = s[0] &~ m[0]) != 0)
+ sig = 1;
+ else if ((x = s[1] &~ m[1]) != 0)
+ sig = _NSIG_BPW + 1;
+ else
+ break;
+ sig += ffz(~x);
+ break;
+
+ case 1: if ((x = *s &~ *m) != 0)
+ sig = ffz(~x) + 1;
+ break;
+ }
+
+ return sig;
+}
+
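+/*
+ * Allocate a sigqueue entry, charging it against the sending user's
+ * RLIMIT_SIGPENDING unless override_rlimit is set.
+ */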
+static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
+ int override_rlimit)
+{
+ struct sigqueue *q = NULL;
+ struct user_struct *user;
+
+ /*
+ * In order to avoid problems with "switch_user()", we want to make
+ * sure that the compiler doesn't re-load "t->user"
+ */
+ user = t->user;
+ barrier();
+ atomic_inc(&user->sigpending);
+ if (override_rlimit ||
+ atomic_read(&user->sigpending) <=
+ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
+ q = kmem_cache_alloc(sigqueue_cachep, flags);
+ if (unlikely(q == NULL)) {
+ atomic_dec(&user->sigpending);
+ } else {
+ INIT_LIST_HEAD(&q->list);
+ q->flags = 0;
+ q->user = get_uid(user);
+ }
+ return(q);
+}
+
+static void __sigqueue_free(struct sigqueue *q)
+{
+ if (q->flags & SIGQUEUE_PREALLOC)
+ return;
+ atomic_dec(&q->user->sigpending);
+ free_uid(q->user);
+ kmem_cache_free(sigqueue_cachep, q);
+}
+
+void flush_sigqueue(struct sigpending *queue)
+{
+ struct sigqueue *q;
+
+ sigemptyset(&queue->signal);
+ while (!list_empty(&queue->list)) {
+ q = list_entry(queue->list.next, struct sigqueue , list);
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+}
+
+/*
+ * Flush all pending signals for a task.
+ */
+void flush_signals(struct task_struct *t)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&t->sighand->siglock, flags);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ flush_sigqueue(&t->pending);
+ flush_sigqueue(&t->signal->shared_pending);
+ spin_unlock_irqrestore(&t->sighand->siglock, flags);
+}
+
+static void __flush_itimer_signals(struct sigpending *pending)
+{
+ sigset_t signal, retain;
+ struct sigqueue *q, *n;
+
+ signal = pending->signal;
+ sigemptyset(&retain);
+
+ list_for_each_entry_safe(q, n, &pending->list, list) {
+ int sig = q->info.si_signo;
+
+ if (likely(q->info.si_code != SI_TIMER)) {
+ sigaddset(&retain, sig);
+ } else {
+ sigdelset(&signal, sig);
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+
+ sigorsets(&pending->signal, &signal, &retain);
+}
+
+void flush_itimer_signals(void)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ __flush_itimer_signals(&tsk->pending);
+ __flush_itimer_signals(&tsk->signal->shared_pending);
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+}
+
+void ignore_signals(struct task_struct *t)
+{
+ int i;
+
+ for (i = 0; i < _NSIG; ++i)
+ t->sighand->action[i].sa.sa_handler = SIG_IGN;
+
+ flush_signals(t);
+}
+
+/*
+ * Flush all handlers for a task.
+ */
+
+void
+flush_signal_handlers(struct task_struct *t, int force_default)
+{
+ int i;
+ struct k_sigaction *ka = &t->sighand->action[0];
+ for (i = _NSIG ; i != 0 ; i--) {
+ if (force_default || ka->sa.sa_handler != SIG_IGN)
+ ka->sa.sa_handler = SIG_DFL;
+ ka->sa.sa_flags = 0;
+ sigemptyset(&ka->sa.sa_mask);
+ ka++;
+ }
+}
+
+int unhandled_signal(struct task_struct *tsk, int sig)
+{
+ void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
+ if (is_global_init(tsk))
+ return 1;
+ if (handler != SIG_IGN && handler != SIG_DFL)
+ return 0;
+ return !tracehook_consider_fatal_signal(tsk, sig, handler);
+}
+
+
+/* Notify the system that a driver wants to block all signals for this
+ * process, and wants to be notified if any signals at all were to be
+ * sent/acted upon. If the notifier routine returns non-zero, then the
+ * signal will be acted upon after all. If the notifier routine returns 0,
+ * then the signal will be blocked. Only one block per process is
+ * allowed. priv is a pointer to private data that the notifier routine
+ * can use to determine if the signal should be blocked or not. */
+
+void
+block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->notifier_mask = mask;
+ current->notifier_data = priv;
+ current->notifier = notifier;
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+/* Notify the system that blocking has ended. */
+
+void
+unblock_all_signals(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->notifier = NULL;
+ current->notifier_data = NULL;
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+{
+ struct sigqueue *q, *first = NULL;
+
+ /*
+ * Collect the siginfo appropriate to this signal. Check if
+ * there is another siginfo for the same signal.
+ */
+ list_for_each_entry(q, &list->list, list) {
+ if (q->info.si_signo == sig) {
+ if (first)
+ goto still_pending;
+ first = q;
+ }
+ }
+
+ sigdelset(&list->signal, sig);
+
+ if (first) {
+still_pending:
+ list_del_init(&first->list);
+ copy_siginfo(info, &first->info);
+ __sigqueue_free(first);
+ } else {
+ /* Ok, it wasn't in the queue. This must be
+ a fast-pathed signal or we must have been
+ out of queue space. So zero out the info.
+ */
+ info->si_signo = sig;
+ info->si_errno = 0;
+ info->si_code = 0;
+ info->si_pid = 0;
+ info->si_uid = 0;
+ }
+}
+
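+/*
+ * Take the next signal from @pending that is not blocked by @mask,
+ * honouring any block_all_signals() notifier the caller installed.
+ * Returns the signal number, or 0 if nothing can be delivered.
+ */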
+static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
+ siginfo_t *info)
+{
+ int sig = next_signal(pending, mask);
+
+ if (sig) {
+ if (current->notifier) {
+ if (sigismember(current->notifier_mask, sig)) {
+ if (!(current->notifier)(current->notifier_data)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ return 0;
+ }
+ }
+ }
+
+ collect_signal(sig, pending, info);
+ }
+
+ return sig;
+}
+
+/*
+ * Dequeue a signal and return the element to the caller, which is
+ * expected to free it.
+ *
+ * All callers have to hold the siglock.
+ */
+int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+{
+ int signr;
+
+ /* We only dequeue private signals from ourselves, we don't let
+ * signalfd steal them
+ */
+ signr = __dequeue_signal(&tsk->pending, mask, info);
+ if (!signr) {
+ signr = __dequeue_signal(&tsk->signal->shared_pending,
+ mask, info);
+ /*
+ * itimer signal ?
+ *
+ * itimers are process shared and we restart periodic
+ * itimers in the signal delivery path to prevent DoS
+ * attacks in the high resolution timer case. This is
+ * compliant with the old way of self restarting
+ * itimers, as the SIGALRM is a legacy signal and only
+ * queued once. Restarting the timer in the signal
+ * dequeue path also reduces timer noise on heavily
+ * loaded !highres systems.
+ */
+ if (unlikely(signr == SIGALRM)) {
+ struct hrtimer *tmr = &tsk->signal->real_timer;
+
+ if (!hrtimer_is_queued(tmr) &&
+ tsk->signal->it_real_incr.tv64 != 0) {
+ hrtimer_forward(tmr, tmr->base->get_time(),
+ tsk->signal->it_real_incr);
+ hrtimer_restart(tmr);
+ }
+ }
+ }
+
+ recalc_sigpending();
+ if (!signr)
+ return 0;
+
+ if (unlikely(sig_kernel_stop(signr))) {
+ /*
+ * Set a marker that we have dequeued a stop signal. Our
+ * caller might release the siglock and then the pending
+ * stop signal it is about to process is no longer in the
+ * pending bitmasks, but must still be cleared by a SIGCONT
+ * (and overruled by a SIGKILL). So those cases clear this
+ * shared flag after we've set it. Note that this flag may
+ * remain set after the signal we return is ignored or
+ * handled. That doesn't matter because its only purpose
+ * is to alert stop-signal processing code when another
+ * processor has come along and cleared the flag.
+ */
+ tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ }
+ if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ /*
+ * Release the siglock to ensure proper locking order
+ * of timer locks outside of siglocks. Note, we leave
+ * irqs disabled here, since the posix-timers code is
+ * about to disable them again anyway.
+ */
+ spin_unlock(&tsk->sighand->siglock);
+ do_schedule_next_timer(info);
+ spin_lock(&tsk->sighand->siglock);
+ }
+ return signr;
+}
+
+/*
+ * Tell a process that it has a new active signal.
+ *
+ * NOTE! we rely on the previous spin_lock to
+ * lock interrupts for us! We can only be called with
+ * "siglock" held, and the local interrupt must
+ * have been disabled when that got acquired!
+ *
+ * No need to set need_resched since signal event passing
+ * goes through ->blocked
+ */
+void signal_wake_up(struct task_struct *t, int resume)
+{
+ unsigned int mask;
+
+ set_tsk_thread_flag(t, TIF_SIGPENDING);
+
+ /*
+ * For SIGKILL, we want to wake it up in the stopped/traced/killable
+ * case. We don't check t->state here because there is a race with it
+ * executing on another processor and just now entering stopped state.
+ * By using wake_up_state, we ensure the process will wake up and
+ * handle its death signal.
+ */
+ mask = TASK_INTERRUPTIBLE;
+ if (resume)
+ mask |= TASK_WAKEKILL;
+ if (!wake_up_state(t, mask))
+ kick_process(t);
+}
+
+/*
+ * Remove signals in mask from the pending set and queue.
+ * Returns 1 if any signals were found.
+ *
+ * All callers must be holding the siglock.
+ *
+ * This version takes a sigset mask and looks at all signals,
+ * not just those in the first mask word.
+ */
+static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
+{
+ struct sigqueue *q, *n;
+ sigset_t m;
+
+ sigandsets(&m, mask, &s->signal);
+ if (sigisemptyset(&m))
+ return 0;
+
+ signandsets(&s->signal, &s->signal, mask);
+ list_for_each_entry_safe(q, n, &s->list, list) {
+ if (sigismember(mask, q->info.si_signo)) {
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+ return 1;
+}
+/*
+ * Remove signals in mask from the pending set and queue.
+ * Returns 1 if any signals were found.
+ *
+ * All callers must be holding the siglock.
+ */
+static int rm_from_queue(unsigned long mask, struct sigpending *s)
+{
+ struct sigqueue *q, *n;
+
+ if (!sigtestsetmask(&s->signal, mask))
+ return 0;
+
+ sigdelsetmask(&s->signal, mask);
+ list_for_each_entry_safe(q, n, &s->list, list) {
+ if (q->info.si_signo < SIGRTMIN &&
+ (mask & sigmask(q->info.si_signo))) {
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+ return 1;
+}
+
+/*
+ * Check whether the current task has permission to send @sig to @t.
+ * Returns 0 if sending is allowed.
+ */
+static int check_kill_permission(int sig, struct siginfo *info,
+ struct task_struct *t)
+{
+ struct pid *sid;
+ int error;
+
+ if (!valid_signal(sig))
+ return -EINVAL;
+
+ if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
+ return 0;
+
+ error = audit_signal_info(sig, t); /* Let audit system see the signal */
+ if (error)
+ return error;
+
+ if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
+ (current->uid ^ t->suid) && (current->uid ^ t->uid) &&
+ !capable(CAP_KILL)) {
+ switch (sig) {
+ case SIGCONT:
+ sid = task_session(t);
+ /*
+ * We don't return the error if sid == NULL. The
+ * task was unhashed; the caller must notice this.
+ */
+ if (!sid || sid == task_session(current))
+ break;
+ default:
+ return -EPERM;
+ }
+ }
+
+ return security_task_kill(t, info, sig, 0);
+}
+
+/*
+ * Handle magic process-wide effects of stop/continue signals. Unlike
+ * the signal actions, these happen immediately at signal-generation
+ * time regardless of blocking, ignoring, or handling. This does the
+ * actual continuing for SIGCONT, but not the actual stopping for stop
+ * signals. The process stop is done as a signal action for SIG_DFL.
+ *
+ * Returns true if the signal should be actually delivered, otherwise
+ * it should be dropped.
+ */
+static int prepare_signal(int sig, struct task_struct *p)
+{
+ struct signal_struct *signal = p->signal;
+ struct task_struct *t;
+
+ if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
+ /*
+ * The process is in the middle of dying, nothing to do.
+ */
+ } else if (sig_kernel_stop(sig)) {
+ /*
+ * This is a stop signal. Remove SIGCONT from all queues.
+ */
+ rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
+ t = p;
+ do {
+ rm_from_queue(sigmask(SIGCONT), &t->pending);
+ } while_each_thread(p, t);
+ } else if (sig == SIGCONT) {
+ unsigned int why;
+ /*
+ * Remove all stop signals from all queues,
+ * and wake all threads.
+ */
+ rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
+ t = p;
+ do {
+ unsigned int state;
+ rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
+ /*
+ * If there is a handler for SIGCONT, we must make
+ * sure that no thread returns to user mode before
+ * we post the signal, in case it was the only
+ * thread eligible to run the signal handler--then
+ * it must not do anything between resuming and
+ * running the handler. With the TIF_SIGPENDING
+ * flag set, the thread will pause and acquire the
+ * siglock that we hold now and until we've queued
+ * the pending signal.
+ *
+ * Wake up the stopped thread _after_ setting
+ * TIF_SIGPENDING
+ */
+ state = __TASK_STOPPED;
+ if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
+ set_tsk_thread_flag(t, TIF_SIGPENDING);
+ state |= TASK_INTERRUPTIBLE;
+ }
+ wake_up_state(t, state);
+ } while_each_thread(p, t);
+
+ /*
+ * Notify the parent with CLD_CONTINUED if we were stopped.
+ *
+ * If we were in the middle of a group stop, we pretend it
+ * was already finished, and then continued. Since SIGCHLD
+ * doesn't queue we report only CLD_STOPPED, as if the next
+ * CLD_CONTINUED was dropped.
+ */
+ why = 0;
+ if (signal->flags & SIGNAL_STOP_STOPPED)
+ why |= SIGNAL_CLD_CONTINUED;
+ else if (signal->group_stop_count)
+ why |= SIGNAL_CLD_STOPPED;
+
+ if (why) {
+ /*
+ * The first thread which returns from finish_stop()
+ * will take ->siglock, notice SIGNAL_CLD_MASK, and
+ * notify its parent. See get_signal_to_deliver().
+ */
+ signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal->group_stop_count = 0;
+ signal->group_exit_code = 0;
+ } else {
+ /*
+ * We are not stopped, but there could be a stop
+ * signal in the middle of being processed after
+ * being removed from the queue. Clear that too.
+ */
+ signal->flags &= ~SIGNAL_STOP_DEQUEUED;
+ }
+ }
+
+ return !sig_ignored(p, sig);
+}
+
+/*
+ * Test if P wants to take SIG. After we've checked all threads with this,
+ * it's equivalent to finding no threads not blocking SIG. Any threads not
+ * blocking SIG were ruled out because they are not running and already
+ * have pending signals. Such threads will dequeue from the shared queue
+ * as soon as they're available, so putting the signal on the shared queue
+ * will be equivalent to sending it to one such thread.
+ */
+static inline int wants_signal(int sig, struct task_struct *p)
+{
+ if (sigismember(&p->blocked, sig))
+ return 0;
+ if (p->flags & PF_EXITING)
+ return 0;
+ if (sig == SIGKILL)
+ return 1;
+ if (task_is_stopped_or_traced(p))
+ return 0;
+ return task_curr(p) || !signal_pending(p);
+}
+
+static void complete_signal(int sig, struct task_struct *p, int group)
+{
+ struct signal_struct *signal = p->signal;
+ struct task_struct *t;
+
+ /*
+ * Now find a thread we can wake up to take the signal off the queue.
+ *
+ * If the main thread wants the signal, it gets first crack.
+ * Probably the least surprising to the average bear.
+ */
+ if (wants_signal(sig, p))
+ t = p;
+ else if (!group || thread_group_empty(p))
+ /*
+ * There is just one thread and it does not need to be woken.
+ * It will dequeue unblocked signals before it runs again.
+ */
+ return;
+ else {
+ /*
+ * Otherwise try to find a suitable thread.
+ */
+ t = signal->curr_target;
+ while (!wants_signal(sig, t)) {
+ t = next_thread(t);
+ if (t == signal->curr_target)
+ /*
+ * No thread needs to be woken.
+ * Any eligible threads will see
+ * the signal in the queue soon.
+ */
+ return;
+ }
+ signal->curr_target = t;
+ }
+
+ /*
+ * Found a killable thread. If the signal will be fatal,
+ * then start taking the whole group down immediately.
+ */
+ if (sig_fatal(p, sig) &&
+ !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !sigismember(&t->real_blocked, sig) &&
+ (sig == SIGKILL ||
+ !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
+ /*
+ * This signal will be fatal to the whole group.
+ */
+ if (!sig_kernel_coredump(sig)) {
+ /*
+ * Start a group exit and wake everybody up.
+ * This way we don't have other threads
+ * running and doing things after a slower
+ * thread has the fatal signal pending.
+ */
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = sig;
+ signal->group_stop_count = 0;
+ t = p;
+ do {
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+ } while_each_thread(p, t);
+ return;
+ }
+ }
+
+ /*
+ * The signal is already in the shared-pending queue.
+ * Tell the chosen thread to wake up and dequeue it.
+ */
+ signal_wake_up(t, sig == SIGKILL);
+ return;
+}
+
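+/*
+ * Non-RT ("legacy") signals are queued at most once: if @sig is already
+ * pending, a second instance is simply dropped by the caller.
+ */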
+static inline int legacy_queue(struct sigpending *signals, int sig)
+{
+ return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
+}
+
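+/*
+ * Queue @sig on @t's private pending list or on the group's shared list
+ * and pick a thread to handle it.  Returns 0 when the signal was queued
+ * or intentionally dropped, -EAGAIN when an RT signal could not be
+ * queued for lack of memory.
+ */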
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+ int group)
+{
+ struct sigpending *pending;
+ struct sigqueue *q;
+
+ trace_sched_signal_send(sig, t);
+
+ assert_spin_locked(&t->sighand->siglock);
+ if (!prepare_signal(sig, t))
+ return 0;
+
+ pending = group ? &t->signal->shared_pending : &t->pending;
+ /*
+ * Short-circuit ignored signals and support queuing
+ * exactly one non-rt signal, so that we can get more
+ * detailed information about the cause of the signal.
+ */
+ if (legacy_queue(pending, sig))
+ return 0;
+ /*
+ * fast-pathed signals for kernel-internal things like SIGSTOP
+ * or SIGKILL.
+ */
+ if (info == SEND_SIG_FORCED)
+ goto out_set;
+
+ /* Real-time signals must be queued if sent by sigqueue, or
+ some other real-time mechanism. It is implementation
+ defined whether kill() does so. We attempt to do so, on
+ the principle of least surprise, but since kill is not
+ allowed to fail with EAGAIN when low on memory we just
+ make sure at least one signal gets delivered and don't
+ pass on the info struct. */
+
+ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
+ (is_si_special(info) ||
+ info->si_code >= 0)));
+ if (q) {
+ list_add_tail(&q->list, &pending->list);
+ switch ((unsigned long) info) {
+ case (unsigned long) SEND_SIG_NOINFO:
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_USER;
+ q->info.si_pid = task_pid_vnr(current);
+ q->info.si_uid = current->uid;
+ break;
+ case (unsigned long) SEND_SIG_PRIV:
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_KERNEL;
+ q->info.si_pid = 0;
+ q->info.si_uid = 0;
+ break;
+ default:
+ copy_siginfo(&q->info, info);
+ break;
+ }
+ } else if (!is_si_special(info)) {
+ if (sig >= SIGRTMIN && info->si_code != SI_USER)
+ /*
+ * Queue overflow, abort. We may abort if the signal was rt
+ * and sent by user using something other than kill().
+ */
+ return -EAGAIN;
+ }
+
+out_set:
+ signalfd_notify(t, sig);
+ sigaddset(&pending->signal, sig);
+ complete_signal(sig, t, group);
+ return 0;
+}
+
+int print_fatal_signals;
+
+static void print_fatal_signal(struct pt_regs *regs, int signr)
+{
+ printk("%s/%d: potentially unexpected fatal signal %d.\n",
+ current->comm, task_pid_nr(current), signr);
+
+#if defined(__i386__) && !defined(__arch_um__)
+ printk("code at %08lx: ", regs->ip);
+ {
+ int i;
+ for (i = 0; i < 16; i++) {
+ unsigned char insn;
+
+ __get_user(insn, (unsigned char *)(regs->ip + i));
+ printk("%02x ", insn);
+ }
+ }
+#endif
+ printk("\n");
+ show_regs(regs);
+}
+
+static int __init setup_print_fatal_signals(char *str)
+{
+ get_option (&str, &print_fatal_signals);
+
+ return 1;
+}
+
+__setup("print-fatal-signals=", setup_print_fatal_signals);
+
+int
+__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+ return send_signal(sig, info, p, 1);
+}
+
+static int
+specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+ return send_signal(sig, info, t, 0);
+}
+
+/*
+ * Force a signal that the process can't ignore: if necessary
+ * we unblock the signal and change any SIG_IGN to SIG_DFL.
+ *
+ * Note: If we unblock the signal, we always reset it to SIG_DFL,
+ * since we do not want to have a signal handler that was blocked
+ * be invoked when user space had explicitly blocked it.
+ *
+ * We don't want to have recursive SIGSEGV's etc, for example,
+ * that is why we also clear SIGNAL_UNKILLABLE.
+ */
+int
+force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+ unsigned long int flags;
+ int ret, blocked, ignored;
+ struct k_sigaction *action;
+
+ spin_lock_irqsave(&t->sighand->siglock, flags);
+ action = &t->sighand->action[sig-1];
+ ignored = action->sa.sa_handler == SIG_IGN;
+ blocked = sigismember(&t->blocked, sig);
+ if (blocked || ignored) {
+ action->sa.sa_handler = SIG_DFL;
+ if (blocked) {
+ sigdelset(&t->blocked, sig);
+ recalc_sigpending_and_wake(t);
+ }
+ }
+ if (action->sa.sa_handler == SIG_DFL)
+ t->signal->flags &= ~SIGNAL_UNKILLABLE;
+ ret = specific_send_sig_info(sig, info, t);
+ spin_unlock_irqrestore(&t->sighand->siglock, flags);
+
+ return ret;
+}
+
+void
+force_sig_specific(int sig, struct task_struct *t)
+{
+ force_sig_info(sig, SEND_SIG_FORCED, t);
+}
+
+/*
+ * Nuke all other threads in the group.
+ */
+void zap_other_threads(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ p->signal->group_stop_count = 0;
+
+ for (t = next_thread(p); t != p; t = next_thread(t)) {
+ /*
+ * Don't bother with already dead threads
+ */
+ if (t->exit_state)
+ continue;
+
+ /* SIGKILL will be handled before any pending SIGSTOP */
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+ }
+}
+
+int __fatal_signal_pending(struct task_struct *tsk)
+{
+ return sigismember(&tsk->pending.signal, SIGKILL);
+}
+EXPORT_SYMBOL(__fatal_signal_pending);
+
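+/*
+ * Pin @tsk->sighand and take its siglock.  The RCU loop guards against
+ * the sighand being freed and reused between the dereference and the
+ * lock.  Returns NULL if the task has already detached its sighand
+ * (it is exiting); on success the siglock is held with interrupts
+ * disabled and the old flags saved in *flags.
+ */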
+struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
+{
+ struct sighand_struct *sighand;
+
+ rcu_read_lock();
+ for (;;) {
+ sighand = rcu_dereference(tsk->sighand);
+ if (unlikely(sighand == NULL))
+ break;
+
+ spin_lock_irqsave(&sighand->siglock, *flags);
+ if (likely(sighand == tsk->sighand))
+ break;
+ spin_unlock_irqrestore(&sighand->siglock, *flags);
+ }
+ rcu_read_unlock();
+
+ return sighand;
+}
+
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = check_kill_permission(sig, info, p);
+
+ if (!ret && sig) {
+ ret = -ESRCH;
+ if (lock_task_sighand(p, &flags)) {
+ ret = __group_send_sig_info(sig, info, p);
+ unlock_task_sighand(p, &flags);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * __kill_pgrp_info() sends a signal to a process group: this is what the tty
+ * control characters do (^C, ^Z etc)
+ */
+
+int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
+{
+ struct task_struct *p = NULL;
+ int retval, success;
+
+ success = 0;
+ retval = -ESRCH;
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ int err = group_send_sig_info(sig, info, p);
+ success |= !err;
+ retval = err;
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ return success ? 0 : retval;
+}
+
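+/*
+ * Send @sig to the thread group identified by @pid.  Retries when the
+ * target was unhashed underneath us, e.g. because another thread is
+ * exec'ing and the group leader changes.
+ */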
+int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
+{
+ int error = -ESRCH;
+ struct task_struct *p;
+
+ rcu_read_lock();
+retry:
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p) {
+ error = group_send_sig_info(sig, info, p);
+ if (unlikely(error == -ESRCH))
+ /*
+ * The task was unhashed in between; try again.
+ * If it is dead, pid_task() will return NULL;
+ * if we race with de_thread() it will find the
+ * new leader.
+ */
+ goto retry;
+ }
+ rcu_read_unlock();
+
+ return error;
+}
+
+int
+kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int error;
+ rcu_read_lock();
+ error = kill_pid_info(sig, info, find_vpid(pid));
+ rcu_read_unlock();
+ return error;
+}
+
+/* like kill_pid_info(), but doesn't use uid/euid of "current" */
+int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
+ uid_t uid, uid_t euid, u32 secid)
+{
+ int ret = -EINVAL;
+ struct task_struct *p;
+
+ if (!valid_signal(sig))
+ return ret;
+
+ read_lock(&tasklist_lock);
+ p = pid_task(pid, PIDTYPE_PID);
+ if (!p) {
+ ret = -ESRCH;
+ goto out_unlock;
+ }
+ if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
+ && (euid != p->suid) && (euid != p->uid)
+ && (uid != p->suid) && (uid != p->uid)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = security_task_kill(p, info, sig, secid);
+ if (ret)
+ goto out_unlock;
+ if (sig && p->sighand) {
+ unsigned long flags;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ ret = __group_send_sig_info(sig, info, p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
+
+/*
+ * kill_something_info() interprets pid in interesting ways just like kill(2).
+ *
+ * POSIX specifies that kill(-1,sig) is unspecified, but what we have
+ * is probably wrong. Should make it like BSD or SYSV.
+ */
+
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int ret;
+
+ if (pid > 0) {
+ rcu_read_lock();
+ ret = kill_pid_info(sig, info, find_vpid(pid));
+ rcu_read_unlock();
+ return ret;
+ }
+
+ read_lock(&tasklist_lock);
+ if (pid != -1) {
+ ret = __kill_pgrp_info(sig, info,
+ pid ? find_vpid(-pid) : task_pgrp(current));
+ } else {
+ int retval = 0, count = 0;
+ struct task_struct * p;
+
+ for_each_process(p) {
+ if (task_pid_vnr(p) > 1 &&
+ !same_thread_group(p, current)) {
+ int err = group_send_sig_info(sig, info, p);
+ ++count;
+ if (err != -EPERM)
+ retval = err;
+ }
+ }
+ ret = count ? retval : -ESRCH;
+ }
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * These are for backward compatibility with the rest of the kernel source.
+ */
+
+/*
+ * The caller must ensure the task can't exit.
+ */
+int
+send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+ int ret;
+ unsigned long flags;
+
+ /*
+ * Make sure legacy kernel users don't send in bad values
+ * (normal paths check this in check_kill_permission).
+ */
+ if (!valid_signal(sig))
+ return -EINVAL;
+
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ ret = specific_send_sig_info(sig, info, p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ return ret;
+}
+
+#define __si_special(priv) \
+ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
+
+int
+send_sig(int sig, struct task_struct *p, int priv)
+{
+ return send_sig_info(sig, __si_special(priv), p);
+}
+
+void
+force_sig(int sig, struct task_struct *p)
+{
+ force_sig_info(sig, SEND_SIG_PRIV, p);
+}
+
+/*
+ * When things go south during signal handling, we
+ * will force a SIGSEGV. And if the signal that caused
+ * the problem was already a SIGSEGV, we'll want to
+ * make sure we don't even try to deliver the signal.
+ */
+int
+force_sigsegv(int sig, struct task_struct *p)
+{
+ if (sig == SIGSEGV) {
+ unsigned long flags;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+ force_sig(SIGSEGV, p);
+ return 0;
+}
+
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ int ret;
+
+ read_lock(&tasklist_lock);
+ ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(kill_pgrp);
+
+int kill_pid(struct pid *pid, int sig, int priv)
+{
+ return kill_pid_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pid);
+
+/*
+ * These functions support sending signals using preallocated sigqueue
+ * structures. This is needed "because realtime applications cannot
+ * afford to lose notifications of asynchronous events, like timer
+ * expirations or I/O completions". In the case of Posix Timers
+ * we allocate the sigqueue structure from the timer_create. If this
+ * allocation fails we are able to report the failure to the application
+ * with an EAGAIN error.
+ */
+
+struct sigqueue *sigqueue_alloc(void)
+{
+ struct sigqueue *q;
+
+ if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0)))
+ q->flags |= SIGQUEUE_PREALLOC;
+ return(q);
+}
+
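+/*
+ * Drop a preallocated sigqueue.  If it is still queued we only clear
+ * SIGQUEUE_PREALLOC so the normal dequeue path frees it later;
+ * otherwise it is freed here.
+ */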
+void sigqueue_free(struct sigqueue *q)
+{
+ unsigned long flags;
+ spinlock_t *lock = &current->sighand->siglock;
+
+ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ /*
+ * We must hold ->siglock while testing q->list
+ * to serialize with collect_signal() or with
+ * __exit_signal()->flush_sigqueue().
+ */
+ spin_lock_irqsave(lock, flags);
+ q->flags &= ~SIGQUEUE_PREALLOC;
+ /*
+ * If it is queued it will be freed when dequeued,
+ * like the "regular" sigqueue.
+ */
+ if (!list_empty(&q->list))
+ q = NULL;
+ spin_unlock_irqrestore(lock, flags);
+
+ if (q)
+ __sigqueue_free(q);
+}
+
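+/*
+ * Deliver a preallocated sigqueue (e.g. a POSIX timer expiry) to @t,
+ * either group-wide or to that single thread.  Returns 0 if queued,
+ * 1 if the signal is being ignored, -1 if the task is already dying.
+ * A timer entry that is still queued only gets its overrun count bumped.
+ */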
+int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+{
+ int sig = q->info.si_signo;
+ struct sigpending *pending;
+ unsigned long flags;
+ int ret;
+
+ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
+ ret = -1;
+ if (!likely(lock_task_sighand(t, &flags)))
+ goto ret;
+
+ ret = 1; /* the signal is ignored */
+ if (!prepare_signal(sig, t))
+ goto out;
+
+ ret = 0;
+ if (unlikely(!list_empty(&q->list))) {
+ /*
+ * If an SI_TIMER entry is already queued, just increment
+ * the overrun count.
+ */
+ BUG_ON(q->info.si_code != SI_TIMER);
+ q->info.si_overrun++;
+ goto out;
+ }
+ q->info.si_overrun = 0;
+
+ signalfd_notify(t, sig);
+ pending = group ? &t->signal->shared_pending : &t->pending;
+ list_add_tail(&q->list, &pending->list);
+ sigaddset(&pending->signal, sig);
+ complete_signal(sig, t, group);
+out:
+ unlock_task_sighand(t, &flags);
+ret:
+ return ret;
+}
+
+/*
+ * Wake up any threads in the parent blocked in wait* syscalls.
+ */
+static inline void __wake_up_parent(struct task_struct *p,
+ struct task_struct *parent)
+{
+ wake_up_interruptible_sync(&parent->signal->wait_chldexit);
+}
+
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+ struct siginfo info;
+ unsigned long flags;
+ struct sighand_struct *psig;
+ struct task_cputime cputime;
+ int ret = sig;
+
+ BUG_ON(sig == -1);
+
+ /* do_notify_parent_cldstop should have been called instead. */
+ BUG_ON(task_is_stopped_or_traced(tsk));
+
+ BUG_ON(!tsk->ptrace &&
+ (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ /*
+ * we are under tasklist_lock here so our parent is tied to
+ * us and cannot exit and release its namespace.
+ *
+ * the only thing it can do is switch its nsproxy with sys_unshare(),
+ * but unsharing pid namespaces is not allowed, so we'll always
+ * see the relevant namespace.
+ *
+ * write_lock() currently calls preempt_disable() which is the
+ * same as rcu_read_lock(), but according to Oleg it is not
+ * correct to rely on this.
+ */
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ rcu_read_unlock();
+
+ info.si_uid = tsk->uid;
+
+ thread_group_cputime(tsk, &cputime);
+ info.si_utime = cputime_to_jiffies(cputime.utime);
+ info.si_stime = cputime_to_jiffies(cputime.stime);
+
+ info.si_status = tsk->exit_code & 0x7f;
+ if (tsk->exit_code & 0x80)
+ info.si_code = CLD_DUMPED;
+ else if (tsk->exit_code & 0x7f)
+ info.si_code = CLD_KILLED;
+ else {
+ info.si_code = CLD_EXITED;
+ info.si_status = tsk->exit_code >> 8;
+ }
+
+ psig = tsk->parent->sighand;
+ spin_lock_irqsave(&psig->siglock, flags);
+ if (!tsk->ptrace && sig == SIGCHLD &&
+ (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+ (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
+ /*
+ * We are exiting and our parent doesn't care. POSIX.1
+ * defines special semantics for setting SIGCHLD to SIG_IGN
+ * or setting the SA_NOCLDWAIT flag: we should be reaped
+ * automatically and not left for our parent's wait4 call.
+ * Rather than having the parent do it as a magic kind of
+ * signal handler, we just set this to tell do_exit that we
+ * can be cleaned up without becoming a zombie. Note that
+ * we still call __wake_up_parent in this case, because a
+ * blocked sys_wait4 might now return -ECHILD.
+ *
+ * Whether we send SIGCHLD or not for SA_NOCLDWAIT
+ * is implementation-defined: we do (if you don't want
+ * it, just use SIG_IGN instead).
+ */
+ ret = tsk->exit_signal = -1;
+ if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
+ sig = -1;
+ }
+ if (valid_signal(sig) && sig > 0)
+ __group_send_sig_info(sig, &info, tsk->parent);
+ __wake_up_parent(tsk, tsk->parent);
+ spin_unlock_irqrestore(&psig->siglock, flags);
+
+ return ret;
+}
+
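+/*
+ * Tell the parent (the ptracer, or otherwise the real parent of the
+ * group leader) that @tsk stopped, continued or was trapped, using
+ * SIGCHLD with the CLD_* code in @why.  The SIGCHLD is suppressed if
+ * the parent set SIG_IGN or SA_NOCLDSTOP, but wait() callers are
+ * still woken up.
+ */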
+static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
+{
+ struct siginfo info;
+ unsigned long flags;
+ struct task_struct *parent;
+ struct sighand_struct *sighand;
+
+ if (tsk->ptrace & PT_PTRACED)
+ parent = tsk->parent;
+ else {
+ tsk = tsk->group_leader;
+ parent = tsk->real_parent;
+ }
+
+ info.si_signo = SIGCHLD;
+ info.si_errno = 0;
+ /*
+ * see the comment in do_notify_parent() about the following 3 lines
+ */
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ rcu_read_unlock();
+
+ info.si_uid = tsk->uid;
+
+ info.si_utime = cputime_to_clock_t(tsk->utime);
+ info.si_stime = cputime_to_clock_t(tsk->stime);
+
+ info.si_code = why;
+ switch (why) {
+ case CLD_CONTINUED:
+ info.si_status = SIGCONT;
+ break;
+ case CLD_STOPPED:
+ info.si_status = tsk->signal->group_exit_code & 0x7f;
+ break;
+ case CLD_TRAPPED:
+ info.si_status = tsk->exit_code & 0x7f;
+ break;
+ default:
+ BUG();
+ }
+
+ sighand = parent->sighand;
+ spin_lock_irqsave(&sighand->siglock, flags);
+ if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
+ !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
+ __group_send_sig_info(SIGCHLD, &info, parent);
+ /*
+ * Even if SIGCHLD is not generated, we must wake up wait4 calls.
+ */
+ __wake_up_parent(tsk, parent);
+ spin_unlock_irqrestore(&sighand->siglock, flags);
+}
+
+static inline int may_ptrace_stop(void)
+{
+ if (!likely(current->ptrace & PT_PTRACED))
+ return 0;
+ /*
+ * Are we in the middle of do_coredump?
+ * If so, and our tracer is also part of the coredump, stopping
+ * would deadlock and is pointless because our tracer is dead,
+ * so don't allow us to stop.
+ * If SIGKILL was already sent before the caller unlocked
+ * ->siglock we must see ->core_state != NULL. Otherwise it
+ * is safe to enter schedule().
+ */
+ if (unlikely(current->mm->core_state) &&
+ unlikely(current->mm == current->parent->mm))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Return nonzero if there is a SIGKILL that should be waking us up.
+ * Called with the siglock held.
+ */
+static int sigkill_pending(struct task_struct *tsk)
+{
+ return sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+}
+
+/*
+ * This must be called with current->sighand->siglock held.
+ *
+ * This should be the path for all ptrace stops.
+ * We always set current->last_siginfo while stopped here.
+ * That makes it a way to test a stopped process for
+ * being ptrace-stopped vs being job-control-stopped.
+ *
+ * If we actually decide not to stop at all because the tracer
+ * is gone, we keep current->exit_code unless clear_code.
+ */
+static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
+{
+ if (arch_ptrace_stop_needed(exit_code, info)) {
+ /*
+ * The arch code has something special to do before a
+ * ptrace stop. This is allowed to block, e.g. for faults
+ * on user stack pages. We can't keep the siglock while
+ * calling arch_ptrace_stop, so we must release it now.
+ * To preserve proper semantics, we must do this before
+ * any signal bookkeeping like checking group_stop_count.
+ * Meanwhile, a SIGKILL could come in before we retake the
+ * siglock. That must prevent us from sleeping in TASK_TRACED.
+ * So after regaining the lock, we must check for SIGKILL.
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+ arch_ptrace_stop(exit_code, info);
+ spin_lock_irq(&current->sighand->siglock);
+ if (sigkill_pending(current))
+ return;
+ }
+
+ /*
+ * If there is a group stop in progress,
+ * we must participate in the bookkeeping.
+ */
+ if (current->signal->group_stop_count > 0)
+ --current->signal->group_stop_count;
+
+ current->last_siginfo = info;
+ current->exit_code = exit_code;
+
+ /* Let the debugger run. */
+ __set_current_state(TASK_TRACED);
+ spin_unlock_irq(&current->sighand->siglock);
+ read_lock(&tasklist_lock);
+ if (may_ptrace_stop()) {
+ do_notify_parent_cldstop(current, CLD_TRAPPED);
+ read_unlock(&tasklist_lock);
+ schedule();
+ } else {
+ /*
+ * By the time we got the lock, our tracer went away.
+ * Don't drop the lock yet, another tracer may come.
+ */
+ __set_current_state(TASK_RUNNING);
+ if (clear_code)
+ current->exit_code = 0;
+ read_unlock(&tasklist_lock);
+ }
+
+ /*
+ * While in TASK_TRACED, we were considered "frozen enough".
+ * Now that we woke up, it's crucial that, if we're supposed to
+ * be frozen, we freeze now before running anything substantial.
+ */
+ try_to_freeze();
+
+ /*
+ * We are back. Now reacquire the siglock before touching
+ * last_siginfo, so that we are sure to have synchronized with
+ * any signal-sending on another CPU that wants to examine it.
+ */
+ spin_lock_irq(&current->sighand->siglock);
+ current->last_siginfo = NULL;
+
+ /*
+ * Queued signals ignored us while we were stopped for tracing.
+ * So check for any that we should take before resuming user mode.
+ * This sets TIF_SIGPENDING, but never clears it.
+ */
+ recalc_sigpending_tsk(current);
+}
+
+void ptrace_notify(int exit_code)
+{
+ siginfo_t info;
+
+ BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
+ memset(&info, 0, sizeof info);
+ info.si_signo = SIGTRAP;
+ info.si_code = exit_code;
+ info.si_pid = task_pid_vnr(current);
+ info.si_uid = current->uid;
+
+ /* Let the debugger run. */
+ spin_lock_irq(&current->sighand->siglock);
+ ptrace_stop(exit_code, 1, &info);
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void
+finish_stop(int stop_count)
+{
+ /*
+ * If there are no other threads in the group, or if there is
+ * a group stop in progress and we are the last to stop,
+ * report to the parent. When ptraced, every thread reports itself.
+ */
+ if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, CLD_STOPPED);
+ read_unlock(&tasklist_lock);
+ }
+
+ do {
+ schedule();
+ } while (try_to_freeze());
+ /*
+ * Now we don't run again until continued.
+ */
+ current->exit_code = 0;
+}
+
+/*
+ * This performs the stopping for SIGSTOP and other stop signals.
+ * We have to stop all threads in the thread group.
+ * Returns nonzero if we've actually stopped and released the siglock.
+ * Returns zero if we didn't stop and still hold the siglock.
+ */
+static int do_signal_stop(int signr)
+{
+ struct signal_struct *sig = current->signal;
+ int stop_count;
+
+ if (sig->group_stop_count > 0) {
+ /*
+ * There is a group stop in progress. We don't need to
+ * start another one.
+ */
+ stop_count = --sig->group_stop_count;
+ } else {
+ struct task_struct *t;
+
+ if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+ unlikely(signal_group_exit(sig)))
+ return 0;
+ /*
+ * There is no group stop already in progress.
+ * We must initiate one now.
+ */
+ sig->group_exit_code = signr;
+
+ stop_count = 0;
+ for (t = next_thread(current); t != current; t = next_thread(t))
+ /*
+ * Setting state to TASK_STOPPED for a group
+ * stop is always done with the siglock held,
+ * so this check has no races.
+ */
+ if (!(t->flags & PF_EXITING) &&
+ !task_is_stopped_or_traced(t)) {
+ stop_count++;
+ signal_wake_up(t, 0);
+ }
+ sig->group_stop_count = stop_count;
+ }
+
+ if (stop_count == 0)
+ sig->flags = SIGNAL_STOP_STOPPED;
+ current->exit_code = sig->group_exit_code;
+ __set_current_state(TASK_STOPPED);
+
+ spin_unlock_irq(&current->sighand->siglock);
+ finish_stop(stop_count);
+ return 1;
+}
+
+static int ptrace_signal(int signr, siginfo_t *info,
+ struct pt_regs *regs, void *cookie)
+{
+ if (!(current->ptrace & PT_PTRACED))
+ return signr;
+
+ ptrace_signal_deliver(regs, cookie);
+
+ /* Let the debugger run. */
+ ptrace_stop(signr, 0, info);
+
+ /* We're back. Did the debugger cancel the sig? */
+ signr = current->exit_code;
+ if (signr == 0)
+ return signr;
+
+ current->exit_code = 0;
+
+ /* Update the siginfo structure if the signal has
+ changed. If the debugger wanted something
+ specific in the siginfo structure then it should
+ have updated *info via PTRACE_SETSIGINFO. */
+ if (signr != info->si_signo) {
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ info->si_pid = task_pid_vnr(current->parent);
+ info->si_uid = current->parent->uid;
+ }
+
+ /* If the (new) signal is now blocked, requeue it. */
+ if (sigismember(&current->blocked, signr)) {
+ specific_send_sig_info(signr, info, current);
+ signr = 0;
+ }
+
+ return signr;
+}
+
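+/*
+ * Main signal-delivery loop, run by arch code on the way back to user
+ * mode.  It handles group-stop bookkeeping, ptrace interception and the
+ * default actions (ignore, stop, coredump, group exit), and returns a
+ * nonzero signal number only when a user-space handler has to run, with
+ * the chosen sigaction copied into *return_ka.
+ */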
+int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
+ struct pt_regs *regs, void *cookie)
+{
+ struct sighand_struct *sighand = current->sighand;
+ struct signal_struct *signal = current->signal;
+ int signr;
+
+relock:
+ /*
+ * We'll jump back here after any time we were stopped in TASK_STOPPED.
+ * While in TASK_STOPPED, we were considered "frozen enough".
+ * Now that we woke up, it's crucial that, if we're supposed to
+ * be frozen, we freeze now before running anything substantial.
+ */
+ try_to_freeze();
+
+ spin_lock_irq(&sighand->siglock);
+ /*
+ * Every stopped thread goes here after wakeup. Check to see if
+ * we should notify the parent; prepare_signal(SIGCONT) encodes
+ * the CLD_ si_code into SIGNAL_CLD_MASK bits.
+ */
+ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
+ int why = (signal->flags & SIGNAL_STOP_CONTINUED)
+ ? CLD_CONTINUED : CLD_STOPPED;
+ signal->flags &= ~SIGNAL_CLD_MASK;
+ spin_unlock_irq(&sighand->siglock);
+
+ if (unlikely(!tracehook_notify_jctl(1, why)))
+ goto relock;
+
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current->group_leader, why);
+ read_unlock(&tasklist_lock);
+ goto relock;
+ }
+
+ for (;;) {
+ struct k_sigaction *ka;
+
+ if (unlikely(signal->group_stop_count > 0) &&
+ do_signal_stop(0))
+ goto relock;
+
+ /*
+ * Tracing can induce an artificial signal and choose sigaction.
+ * The return value in @signr determines the default action,
+ * but @info->si_signo is the signal number we will report.
+ */
+ signr = tracehook_get_signal(current, regs, info, return_ka);
+ if (unlikely(signr < 0))
+ goto relock;
+ if (unlikely(signr != 0))
+ ka = return_ka;
+ else {
+ signr = dequeue_signal(current, &current->blocked,
+ info);
+
+ if (!signr)
+ break; /* will return 0 */
+
+ if (signr != SIGKILL) {
+ signr = ptrace_signal(signr, info,
+ regs, cookie);
+ if (!signr)
+ continue;
+ }
+
+ ka = &sighand->action[signr-1];
+ }
+
+ if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
+ continue;
+ if (ka->sa.sa_handler != SIG_DFL) {
+ /* Run the handler. */
+ *return_ka = *ka;
+
+ if (ka->sa.sa_flags & SA_ONESHOT)
+ ka->sa.sa_handler = SIG_DFL;
+
+ break; /* will return non-zero "signr" value */
+ }
+
+ /*
+ * Now we are doing the default action for this signal.
+ */
+ if (sig_kernel_ignore(signr)) /* Default is nothing. */
+ continue;
+
+ /*
+ * Global init gets no signals it doesn't want.
+ */
+ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
+ !signal_group_exit(signal))
+ continue;
+
+ if (sig_kernel_stop(signr)) {
+ /*
+ * The default action is to stop all threads in
+ * the thread group. The job control signals
+ * do nothing in an orphaned pgrp, but SIGSTOP
+ * always works. Note that siglock needs to be
+ * dropped during the call to is_orphaned_pgrp()
+ * because of lock ordering with tasklist_lock.
+ * This allows an intervening SIGCONT to be posted.
+ * We need to check for that and bail out if necessary.
+ */
+ if (signr != SIGSTOP) {
+ spin_unlock_irq(&sighand->siglock);
+
+ /* signals can be posted during this window */
+
+ if (is_current_pgrp_orphaned())
+ goto relock;
+
+ spin_lock_irq(&sighand->siglock);
+ }
+
+ if (likely(do_signal_stop(info->si_signo))) {
+ /* It released the siglock. */
+ goto relock;
+ }
+
+ /*
+ * We didn't actually stop, due to a race
+ * with SIGCONT or something like that.
+ */
+ continue;
+ }
+
+ spin_unlock_irq(&sighand->siglock);
+
+ /*
+ * Anything else is fatal, maybe with a core dump.
+ */
+ current->flags |= PF_SIGNALED;
+
+ if (sig_kernel_coredump(signr)) {
+ if (print_fatal_signals)
+ print_fatal_signal(regs, info->si_signo);
+ /*
+ * If it was able to dump core, this kills all
+ * other threads in the group and synchronizes with
+ * their demise. If we lost the race with another
+ * thread getting here, it set group_exit_code
+ * first and our do_group_exit call below will use
+ * that value and ignore the one we pass it.
+ */
+ do_coredump(info->si_signo, info->si_signo, regs);
+ }
+
+ /*
+ * Death signals, no core dump.
+ */
+ do_group_exit(info->si_signo);
+ /* NOTREACHED */
+ }
+ spin_unlock_irq(&sighand->siglock);
+ return signr;
+}
+
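+/*
+ * Called on the exit path.  Marks the task PF_EXITING under siglock so
+ * group-wide signal delivery skips it, wakes other live threads so they
+ * can pick up shared pending signals this task will no longer handle,
+ * and completes its part of any group stop in progress.
+ */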
+void exit_signals(struct task_struct *tsk)
+{
+ int group_stop = 0;
+ struct task_struct *t;
+
+ if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
+ tsk->flags |= PF_EXITING;
+ return;
+ }
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ /*
+ * From now this task is not visible for group-wide signals,
+ * see wants_signal(), do_signal_stop().
+ */
+ tsk->flags |= PF_EXITING;
+ if (!signal_pending(tsk))
+ goto out;
+
+ /* It could be that __group_complete_signal() chose us to
+ * notify about a group-wide signal. Another thread should be
+ * woken now to take the signal since we will not.
+ */
+ for (t = tsk; (t = next_thread(t)) != tsk; )
+ if (!signal_pending(t) && !(t->flags & PF_EXITING))
+ recalc_sigpending_and_wake(t);
+
+ if (unlikely(tsk->signal->group_stop_count) &&
+ !--tsk->signal->group_stop_count) {
+ tsk->signal->flags = SIGNAL_STOP_STOPPED;
+ group_stop = 1;
+ }
+out:
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(tsk, CLD_STOPPED);
+ read_unlock(&tasklist_lock);
+ }
+}
+
+EXPORT_SYMBOL(recalc_sigpending);
+EXPORT_SYMBOL_GPL(dequeue_signal);
+EXPORT_SYMBOL(flush_signals);
+EXPORT_SYMBOL(force_sig);
+EXPORT_SYMBOL(send_sig);
+EXPORT_SYMBOL(send_sig_info);
+EXPORT_SYMBOL(sigprocmask);
+EXPORT_SYMBOL(block_all_signals);
+EXPORT_SYMBOL(unblock_all_signals);
+
+
+/*
+ * System call entry points.
+ */
+
+SYSCALL_DEFINE0(restart_syscall)
+{
+ struct restart_block *restart = &current_thread_info()->restart_block;
+ return restart->fn(restart);
+}
+
+long do_no_restart_syscall(struct restart_block *param)
+{
+ return -EINTR;
+}
+
+/*
+ * We don't need to get the kernel lock - this is all local to this
+ * particular thread. (and that's good, because this is _heavily_
+ * used by various programs)
+ */
+
+/*
+ * This is also useful for kernel threads that want to temporarily
+ * (or permanently) block certain signals.
+ *
+ * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
+ * interface happily blocks "unblockable" signals like SIGKILL
+ * and friends.
+ */
+int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
+{
+ int error;
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (oldset)
+ *oldset = current->blocked;
+
+ error = 0;
+ switch (how) {
+ case SIG_BLOCK:
+ sigorsets(&current->blocked, &current->blocked, set);
+ break;
+ case SIG_UNBLOCK:
+ signandsets(&current->blocked, &current->blocked, set);
+ break;
+ case SIG_SETMASK:
+ current->blocked = *set;
+ break;
+ default:
+ error = -EINVAL;
+ }
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return error;
+}
+
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
+ sigset_t __user *, oset, size_t, sigsetsize)
+{
+ int error = -EINVAL;
+ sigset_t old_set, new_set;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ goto out;
+
+ if (set) {
+ error = -EFAULT;
+ if (copy_from_user(&new_set, set, sizeof(*set)))
+ goto out;
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ error = sigprocmask(how, &new_set, &old_set);
+ if (error)
+ goto out;
+ if (oset)
+ goto set_old;
+ } else if (oset) {
+ spin_lock_irq(&current->sighand->siglock);
+ old_set = current->blocked;
+ spin_unlock_irq(&current->sighand->siglock);
+
+ set_old:
+ error = -EFAULT;
+ if (copy_to_user(oset, &old_set, sizeof(*oset)))
+ goto out;
+ }
+ error = 0;
+out:
+ return error;
+}
+
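+/*
+ * Report the set of signals that are pending (privately or group-wide)
+ * and currently blocked by this thread, copying it to user space.
+ */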
+long do_sigpending(void __user *set, unsigned long sigsetsize)
+{
+ long error = -EINVAL;
+ sigset_t pending;
+
+ if (sigsetsize > sizeof(sigset_t))
+ goto out;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(&pending, &current->pending.signal,
+ &current->signal->shared_pending.signal);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* Outside the lock because only this thread touches it. */
+ sigandsets(&pending, &current->blocked, &pending);
+
+ error = -EFAULT;
+ if (!copy_to_user(set, &pending, sigsetsize))
+ error = 0;
+
+out:
+ return error;
+}
+
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
+{
+ return do_sigpending(set, sigsetsize);
+}
+
+#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
+
+int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
+{
+ int err;
+
+ if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+ return -EFAULT;
+ if (from->si_code < 0)
+ return __copy_to_user(to, from, sizeof(siginfo_t))
+ ? -EFAULT : 0;
+ /*
+ * If you change siginfo_t structure, please be sure
+ * this code is fixed accordingly.
+ * Please remember to update the signalfd_copyinfo() function
+ * inside fs/signalfd.c too, in case siginfo_t changes.
+ * It should never copy any pad contained in the structure
+ * to avoid security leaks, but must copy the generic
+ * 3 ints plus the relevant union member.
+ */
+ err = __put_user(from->si_signo, &to->si_signo);
+ err |= __put_user(from->si_errno, &to->si_errno);
+ err |= __put_user((short)from->si_code, &to->si_code);
+ switch (from->si_code & __SI_MASK) {
+ case __SI_KILL:
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ break;
+ case __SI_TIMER:
+ err |= __put_user(from->si_tid, &to->si_tid);
+ err |= __put_user(from->si_overrun, &to->si_overrun);
+ err |= __put_user(from->si_ptr, &to->si_ptr);
+ break;
+ case __SI_POLL:
+ err |= __put_user(from->si_band, &to->si_band);
+ err |= __put_user(from->si_fd, &to->si_fd);
+ break;
+ case __SI_FAULT:
+ err |= __put_user(from->si_addr, &to->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ err |= __put_user(from->si_trapno, &to->si_trapno);
+#endif
+ break;
+ case __SI_CHLD:
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(from->si_status, &to->si_status);
+ err |= __put_user(from->si_utime, &to->si_utime);
+ err |= __put_user(from->si_stime, &to->si_stime);
+ break;
+ case __SI_RT: /* This is not generated by the kernel as of now. */
+ case __SI_MESGQ: /* But this is */
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(from->si_ptr, &to->si_ptr);
+ break;
+ default: /* this is just in case for now ... */
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ break;
+ }
+ return err;
+}
+
+#endif
+
+SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
+ siginfo_t __user *, uinfo, const struct timespec __user *, uts,
+ size_t, sigsetsize)
+{
+ int ret, sig;
+ sigset_t these;
+ struct timespec ts;
+ siginfo_t info;
+ long timeout = 0;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&these, uthese, sizeof(these)))
+ return -EFAULT;
+
+ /*
+ * Invert the set of allowed signals to get those we
+ * want to block.
+ */
+ sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ signotset(&these);
+
+ if (uts) {
+ if (copy_from_user(&ts, uts, sizeof(ts)))
+ return -EFAULT;
+ if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
+ || ts.tv_sec < 0)
+ return -EINVAL;
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &these, &info);
+ if (!sig) {
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ if (uts)
+ timeout = (timespec_to_jiffies(&ts)
+ + (ts.tv_sec || ts.tv_nsec));
+
+ if (timeout) {
+ /* None ready -- temporarily unblock those we're
+ * interested in while we are sleeping, so that we'll
+ * be awakened when they arrive. */
+ current->real_blocked = current->blocked;
+ sigandsets(&current->blocked, &current->blocked, &these);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ timeout = schedule_timeout_interruptible(timeout);
+
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &these, &info);
+ current->blocked = current->real_blocked;
+ siginitset(&current->real_blocked, 0);
+ recalc_sigpending();
+ }
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (sig) {
+ ret = sig;
+ if (uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
+ }
+ } else {
+ ret = -EAGAIN;
+ if (timeout)
+ ret = -EINTR;
+ }
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
+{
+ struct siginfo info;
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = task_tgid_vnr(current);
+ info.si_uid = current->uid;
+
+ return kill_something_info(sig, &info, pid);
+}
+
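+/*
+ * Common backend of sys_tkill() and sys_tgkill(): send @sig to the
+ * single thread @pid, optionally checking that it still belongs to
+ * thread group @tgid (a tgid <= 0 skips that check).
+ */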
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+ int error;
+ struct siginfo info;
+ struct task_struct *p;
+ unsigned long flags;
+
+ error = -ESRCH;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_TKILL;
+ info.si_pid = task_tgid_vnr(current);
+ info.si_uid = current->uid;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
+ error = check_kill_permission(sig, &info, p);
+ /*
+ * The null signal is a permissions and process existence
+ * probe. No signal is actually delivered.
+ *
+ * If lock_task_sighand() fails we pretend the task dies
+ * after receiving the signal. The window is tiny, and the
+ * signal is private anyway.
+ */
+ if (!error && sig && lock_task_sighand(p, &flags)) {
+ error = specific_send_sig_info(sig, &info, p);
+ unlock_task_sighand(p, &flags);
+ }
+ }
+ rcu_read_unlock();
+
+ return error;
+}
+
+/**
+ * sys_tgkill - send signal to one specific thread
+ * @tgid: the thread group ID of the thread
+ * @pid: the PID of the thread
+ * @sig: signal to be sent
+ *
+ * This syscall also checks the @tgid and returns -ESRCH even if the PID
+ * exists but no longer belongs to the target process. This
+ * method solves the problem of threads exiting and PIDs getting reused.
+ */
+SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0 || tgid <= 0)
+ return -EINVAL;
+
+ return do_tkill(tgid, pid, sig);
+}
+
+/*
+ * Send a signal to only one task, even if it's a CLONE_THREAD task.
+ */
+SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0)
+ return -EINVAL;
+
+ return do_tkill(0, pid, sig);
+}
+
+SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+ return -EFAULT;
+
+ /* Not even root can pretend to send signals from the kernel.
+ Nor can they impersonate a kill(), which adds source info. */
+ if (info.si_code >= 0)
+ return -EPERM;
+ info.si_signo = sig;
+
+ /* POSIX.1b doesn't mention process groups. */
+ return kill_proc_info(sig, &info, pid);
+}
+
+int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
+{
+ struct task_struct *t = current;
+ struct k_sigaction *k;
+ sigset_t mask;
+
+ if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
+ return -EINVAL;
+
+ k = &t->sighand->action[sig-1];
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (oact)
+ *oact = *k;
+
+ if (act) {
+ sigdelsetmask(&act->sa.sa_mask,
+ sigmask(SIGKILL) | sigmask(SIGSTOP));
+ *k = *act;
+ /*
+ * POSIX 3.3.1.3:
+ * "Setting a signal action to SIG_IGN for a signal that is
+ * pending shall cause the pending signal to be discarded,
+ * whether or not it is blocked."
+ *
+ * "Setting a signal action to SIG_DFL for a signal that is
+ * pending and whose default action is to ignore the signal
+ * (for example, SIGCHLD), shall cause the pending signal to
+ * be discarded, whether or not it is blocked"
+ */
+ if (sig_handler_ignored(sig_handler(t, sig), sig)) {
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+ rm_from_queue_full(&mask, &t->signal->shared_pending);
+ do {
+ rm_from_queue_full(&mask, &t->pending);
+ t = next_thread(t);
+ } while (t != current);
+ }
+ }
+
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
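+/*
+ * sigaltstack() backend: report the current alternate signal stack in
+ * *uoss and/or install the one described by *uss.  @sp is the caller's
+ * current stack pointer, used to refuse changes while we are already
+ * executing on the alternate stack.
+ */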
+int
+do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+{
+ stack_t oss;
+ int error;
+
+ if (uoss) {
+ oss.ss_sp = (void __user *) current->sas_ss_sp;
+ oss.ss_size = current->sas_ss_size;
+ oss.ss_flags = sas_ss_flags(sp);
+ }
+
+ if (uss) {
+ void __user *ss_sp;
+ size_t ss_size;
+ int ss_flags;
+
+ error = -EFAULT;
+ if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
+ || __get_user(ss_sp, &uss->ss_sp)
+ || __get_user(ss_flags, &uss->ss_flags)
+ || __get_user(ss_size, &uss->ss_size))
+ goto out;
+
+ error = -EPERM;
+ if (on_sig_stack(sp))
+ goto out;
+
+ error = -EINVAL;
+ /*
+ * Note: this code used to test ss_flags incorrectly.
+ * Old code may have been written using ss_flags==0
+ * to mean ss_flags==SS_ONSTACK (as this was the only
+ * way that worked). This fix preserves that older
+ * mechanism.
+ */
+ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+ goto out;
+
+ if (ss_flags == SS_DISABLE) {
+ ss_size = 0;
+ ss_sp = NULL;
+ } else {
+ error = -ENOMEM;
+ if (ss_size < MINSIGSTKSZ)
+ goto out;
+ }
+
+ current->sas_ss_sp = (unsigned long) ss_sp;
+ current->sas_ss_size = ss_size;
+ }
+
+ if (uoss) {
+ error = -EFAULT;
+ if (copy_to_user(uoss, &oss, sizeof(oss)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
+SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
+{
+ return do_sigpending(set, sizeof(*set));
+}
+
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+/* Some platforms have their own version with special arguments;
+ others support only sys_rt_sigprocmask. */
+
+SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
+ old_sigset_t __user *, oset)
+{
+ int error;
+ old_sigset_t old_set, new_set;
+
+ if (set) {
+ error = -EFAULT;
+ if (copy_from_user(&new_set, set, sizeof(*set)))
+ goto out;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ spin_lock_irq(&current->sighand->siglock);
+ old_set = current->blocked.sig[0];
+
+ error = 0;
+ switch (how) {
+ default:
+ error = -EINVAL;
+ break;
+ case SIG_BLOCK:
+ sigaddsetmask(&current->blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&current->blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ current->blocked.sig[0] = new_set;
+ break;
+ }
+
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ if (error)
+ goto out;
+ if (oset)
+ goto set_old;
+ } else if (oset) {
+ old_set = current->blocked.sig[0];
+ set_old:
+ error = -EFAULT;
+ if (copy_to_user(oset, &old_set, sizeof(*oset)))
+ goto out;
+ }
+ error = 0;
+out:
+ return error;
+}
+#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
+
+#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+SYSCALL_DEFINE4(rt_sigaction, int, sig,
+ const struct sigaction __user *, act,
+ struct sigaction __user *, oact,
+ size_t, sigsetsize)
+{
+ struct k_sigaction new_sa, old_sa;
+ int ret = -EINVAL;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ goto out;
+
+ if (act) {
+ if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+ return -EFAULT;
+ }
+
+ ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+
+ if (!ret && oact) {
+ if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+ return -EFAULT;
+ }
+out:
+ return ret;
+}
+#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
+
+#ifdef __ARCH_WANT_SYS_SGETMASK
+
+/*
+ * For backwards compatibility. Functionality superseded by sigprocmask.
+ */
+SYSCALL_DEFINE0(sgetmask)
+{
+ /* SMP safe */
+ return current->blocked.sig[0];
+}
+
+SYSCALL_DEFINE1(ssetmask, int, newmask)
+{
+ int old;
+
+ spin_lock_irq(&current->sighand->siglock);
+ old = current->blocked.sig[0];
+
+ siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
+ sigmask(SIGSTOP)));
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return old;
+}
+#endif /* __ARCH_WANT_SYS_SGETMASK */
+
+#ifdef __ARCH_WANT_SYS_SIGNAL
+/*
+ * For backwards compatibility. Functionality superseded by sigaction.
+ */
+SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
+{
+ struct k_sigaction new_sa, old_sa;
+ int ret;
+
+ new_sa.sa.sa_handler = handler;
+ new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
+ sigemptyset(&new_sa.sa.sa_mask);
+
+ ret = do_sigaction(sig, &new_sa, &old_sa);
+
+ return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
+}
+#endif /* __ARCH_WANT_SYS_SIGNAL */
+
+#ifdef __ARCH_WANT_SYS_PAUSE
+
+SYSCALL_DEFINE0(pause)
+{
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ return -ERESTARTNOHAND;
+}
+
+#endif
+
+#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
+SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
+{
+ sigset_t newset;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset, unewset, sizeof(newset)))
+ return -EFAULT;
+ sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ spin_lock_irq(&current->sighand->siglock);
+ current->saved_sigmask = current->blocked;
+ current->blocked = newset;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_restore_sigmask();
+ return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
+void __init signals_init(void)
+{
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+}
diff --git a/kernel/smp.c b/kernel/smp.c
new file mode 100644
index 0000000..75c8dde
--- /dev/null
+++ b/kernel/smp.c
@@ -0,0 +1,437 @@
+/*
+ * Generic helpers for smp ipi calls
+ *
+ * (C) Jens Axboe <jens.axboe@oracle.com> 2008
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/smp.h>
+
+static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
+static LIST_HEAD(call_function_queue);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+
+enum {
+ CSD_FLAG_WAIT = 0x01,
+ CSD_FLAG_ALLOC = 0x02,
+};
+
+struct call_function_data {
+ struct call_single_data csd;
+ spinlock_t lock;
+ unsigned int refs;
+ cpumask_t cpumask;
+ struct rcu_head rcu_head;
+};
+
+struct call_single_queue {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+static int __cpuinit init_call_single_data(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct call_single_queue *q = &per_cpu(call_single_queue, i);
+
+ spin_lock_init(&q->lock);
+ INIT_LIST_HEAD(&q->list);
+ }
+ return 0;
+}
+early_initcall(init_call_single_data);
+
+static void csd_flag_wait(struct call_single_data *data)
+{
+ /* Wait for response */
+ do {
+ if (!(data->flags & CSD_FLAG_WAIT))
+ break;
+ cpu_relax();
+ } while (1);
+}
+
+/*
+ * Insert a previously allocated call_single_data element for execution
+ * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ */
+static void generic_exec_single(int cpu, struct call_single_data *data)
+{
+ struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
+ int wait = data->flags & CSD_FLAG_WAIT, ipi;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dst->lock, flags);
+ ipi = list_empty(&dst->list);
+ list_add_tail(&data->list, &dst->list);
+ spin_unlock_irqrestore(&dst->lock, flags);
+
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
+ if (ipi)
+ arch_send_call_function_single_ipi(cpu);
+
+ if (wait)
+ csd_flag_wait(data);
+}
+
+static void rcu_free_call_data(struct rcu_head *head)
+{
+ struct call_function_data *data;
+
+ data = container_of(head, struct call_function_data, rcu_head);
+
+ kfree(data);
+}
+
+/*
+ * Invoked by arch to handle an IPI for call function. Must be called with
+ * interrupts disabled.
+ */
+void generic_smp_call_function_interrupt(void)
+{
+ struct call_function_data *data;
+ int cpu = get_cpu();
+
+ /*
+ * It's ok to use list_for_each_entry_rcu() here even though we may
+ * delete the current entry, since list_del_rcu() doesn't clear ->next.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+ int refs;
+
+ if (!cpu_isset(cpu, data->cpumask))
+ continue;
+
+ data->csd.func(data->csd.info);
+
+ spin_lock(&data->lock);
+ cpu_clear(cpu, data->cpumask);
+ WARN_ON(data->refs == 0);
+ data->refs--;
+ refs = data->refs;
+ spin_unlock(&data->lock);
+
+ if (refs)
+ continue;
+
+ spin_lock(&call_function_lock);
+ list_del_rcu(&data->csd.list);
+ spin_unlock(&call_function_lock);
+
+ if (data->csd.flags & CSD_FLAG_WAIT) {
+ /*
+ * serialize stores to data with the flag clear
+ * and wakeup
+ */
+ smp_wmb();
+ data->csd.flags &= ~CSD_FLAG_WAIT;
+ }
+ if (data->csd.flags & CSD_FLAG_ALLOC)
+ call_rcu(&data->rcu_head, rcu_free_call_data);
+ }
+ rcu_read_unlock();
+
+ put_cpu();
+}
+
+/*
+ * Invoked by arch to handle an IPI for call function single. Must be called
+ * from the arch with interrupts disabled.
+ */
+void generic_smp_call_function_single_interrupt(void)
+{
+ struct call_single_queue *q = &__get_cpu_var(call_single_queue);
+ LIST_HEAD(list);
+
+ /*
+ * We need to see other CPUs' stores to the list head in order to
+ * check whether the list is empty without holding q->lock.
+ */
+ smp_read_barrier_depends();
+ while (!list_empty(&q->list)) {
+ unsigned int data_flags;
+
+ spin_lock(&q->lock);
+ list_replace_init(&q->list, &list);
+ spin_unlock(&q->lock);
+
+ while (!list_empty(&list)) {
+ struct call_single_data *data;
+
+ data = list_entry(list.next, struct call_single_data,
+ list);
+ list_del(&data->list);
+
+ /*
+ * 'data' can be invalid after this call if flags == 0
+ * (when called through generic_exec_single()), so save
+ * the flags away before making the call.
+ */
+ data_flags = data->flags;
+
+ data->func(data->info);
+
+ if (data_flags & CSD_FLAG_WAIT) {
+ smp_wmb();
+ data->flags &= ~CSD_FLAG_WAIT;
+ } else if (data_flags & CSD_FLAG_ALLOC)
+ kfree(data);
+ }
+ /*
+ * See comment on outer loop
+ */
+ smp_read_barrier_depends();
+ }
+}
+
+/**
+ * smp_call_function_single - Run a function on a specific CPU
+ * @cpu: The CPU to run @func on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until the function has completed on the target CPU.
+ *
+ * Returns 0 on success, else a negative status code. Note that @wait
+ * will be implicitly turned on in case of allocation failures, since
+ * we fall back to on-stack allocation.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int wait)
+{
+ struct call_single_data d;
+ unsigned long flags;
+ /* Prevent preemption and rescheduling onto another processor,
+ as well as CPU removal. */
+ int me = get_cpu();
+ int err = 0;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ if (cpu == me) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
+ struct call_single_data *data = NULL;
+
+ if (!wait) {
+ data = kmalloc(sizeof(*data), GFP_ATOMIC);
+ if (data)
+ data->flags = CSD_FLAG_ALLOC;
+ }
+ if (!data) {
+ data = &d;
+ data->flags = CSD_FLAG_WAIT;
+ }
+
+ data->func = func;
+ data->info = info;
+ generic_exec_single(cpu, data);
+ } else {
+ err = -ENXIO; /* CPU not online */
+ }
+
+ put_cpu();
+ return err;
+}
+EXPORT_SYMBOL(smp_call_function_single);
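+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): a typical caller of smp_call_function_single(). The helpers
+ * example_remote_read() and example_read_on_cpu() are hypothetical.
+ */
+#if 0
+static void example_remote_read(void *info)
+{
+ unsigned long *val = info;
+
+ /* Runs on the target CPU, in IPI context with interrupts disabled. */
+ *val = smp_processor_id();
+}
+
+static int example_read_on_cpu(int cpu, unsigned long *val)
+{
+ /* wait=1: do not return until the target CPU has run the function. */
+ return smp_call_function_single(cpu, example_remote_read, val, 1);
+}
+#endif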
+
+/**
+ * __smp_call_function_single(): Run a function on another CPU
+ * @cpu: The CPU to run on.
+ * @data: Pre-allocated and setup data structure
+ *
+ * Like smp_call_function_single(), but allows the caller to pass in a
+ * pre-allocated data structure. Useful for embedding @data inside other
+ * structures, for instance.
+ *
+ */
+void __smp_call_function_single(int cpu, struct call_single_data *data)
+{
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+
+ generic_exec_single(cpu, data);
+}
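+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): embedding a call_single_data in a caller-owned structure and
+ * firing it with __smp_call_function_single(). struct example_work,
+ * example_complete() and example_fire() are hypothetical.
+ */
+#if 0
+struct example_work {
+ struct call_single_data csd;
+ int result;
+};
+
+static void example_complete(void *info)
+{
+ struct example_work *w = info;
+
+ w->result = smp_processor_id();
+}
+
+static void example_fire(struct example_work *w, int cpu)
+{
+ w->csd.func = example_complete;
+ w->csd.info = w;
+ w->csd.flags = 0; /* caller owns the storage: no CSD_FLAG_ALLOC/WAIT */
+ __smp_call_function_single(cpu, &w->csd);
+}
+#endif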
+
+/* Dummy function */
+static void quiesce_dummy(void *unused)
+{
+}
+
+/*
+ * Ensure stack based data used in call function mask is safe to free.
+ *
+ * This is needed by smp_call_function_mask when using on-stack data, because
+ * a single call function queue is shared by all CPUs, and any CPU may pick up
+ * the data item on the queue at any time before it is deleted. So we need to
+ * ensure that all CPUs have transitioned through a quiescent state after
+ * this call.
+ *
+ * This is a very slow function, implemented by sending synchronous IPIs to
+ * all possible CPUs. For this reason, we have to alloc data rather than use
+ * stack based data even in the case of synchronous calls. The stack based
+ * data is then just used for deadlock/oom fallback which will be very rare.
+ *
+ * If a faster scheme can be made, we could go back to preferring stack based
+ * data -- the data allocation/free is non-zero cost.
+ */
+static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
+{
+ struct call_single_data data;
+ int cpu;
+
+ data.func = quiesce_dummy;
+ data.info = NULL;
+
+ for_each_cpu_mask(cpu, mask) {
+ data.flags = CSD_FLAG_WAIT;
+ generic_exec_single(cpu, &data);
+ }
+}
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned. Note that @wait
+ * will be implicitly turned on in case of allocation failures, since
+ * we fall back to on-stack allocation.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
+ int wait)
+{
+ struct call_function_data d;
+ struct call_function_data *data = NULL;
+ cpumask_t allbutself;
+ unsigned long flags;
+ int cpu, num_cpus;
+ int slowpath = 0;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ cpu = smp_processor_id();
+ allbutself = cpu_online_map;
+ cpu_clear(cpu, allbutself);
+ cpus_and(mask, mask, allbutself);
+ num_cpus = cpus_weight(mask);
+
+ /*
+ * If zero CPUs, return. If just a single CPU, turn this request
+ * into a targeted single call instead since it's faster.
+ */
+ if (!num_cpus)
+ return 0;
+ else if (num_cpus == 1) {
+ cpu = first_cpu(mask);
+ return smp_call_function_single(cpu, func, info, wait);
+ }
+
+ data = kmalloc(sizeof(*data), GFP_ATOMIC);
+ if (data) {
+ data->csd.flags = CSD_FLAG_ALLOC;
+ if (wait)
+ data->csd.flags |= CSD_FLAG_WAIT;
+ } else {
+ data = &d;
+ data->csd.flags = CSD_FLAG_WAIT;
+ wait = 1;
+ slowpath = 1;
+ }
+
+ spin_lock_init(&data->lock);
+ data->csd.func = func;
+ data->csd.info = info;
+ data->refs = num_cpus;
+ data->cpumask = mask;
+
+ spin_lock_irqsave(&call_function_lock, flags);
+ list_add_tail_rcu(&data->csd.list, &call_function_queue);
+ spin_unlock_irqrestore(&call_function_lock, flags);
+
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
+ /* Send a message to all CPUs in the map */
+ arch_send_call_function_ipi(mask);
+
+ /* optionally wait for the CPUs to complete */
+ if (wait) {
+ csd_flag_wait(&data->csd);
+ if (unlikely(slowpath))
+ smp_call_function_mask_quiesce_stack(mask);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function_mask);
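+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): running a function on an explicit set of CPUs with
+ * smp_call_function_mask(). example_flush() and example_flush_two_cpus()
+ * are hypothetical.
+ */
+#if 0
+static void example_flush(void *info)
+{
+ /* Per-CPU work; must be fast and non-blocking. */
+}
+
+static int example_flush_two_cpus(int cpu_a, int cpu_b)
+{
+ cpumask_t mask;
+ int ret;
+
+ cpus_clear(mask);
+ cpu_set(cpu_a, mask);
+ cpu_set(cpu_b, mask);
+
+ /* Preemption must be disabled around the call, per the comment above. */
+ preempt_disable();
+ ret = smp_call_function_mask(mask, example_flush, NULL, 1);
+ preempt_enable();
+ return ret;
+}
+#endif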
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func. In case of allocation
+ * failure, @wait will be implicitly turned on.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func)(void *), void *info, int wait)
+{
+ int ret;
+
+ preempt_disable();
+ ret = smp_call_function_mask(cpu_online_map, func, info, wait);
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(smp_call_function);
+
+void ipi_call_lock(void)
+{
+ spin_lock(&call_function_lock);
+}
+
+void ipi_call_unlock(void)
+{
+ spin_unlock(&call_function_lock);
+}
+
+void ipi_call_lock_irq(void)
+{
+ spin_lock_irq(&call_function_lock);
+}
+
+void ipi_call_unlock_irq(void)
+{
+ spin_unlock_irq(&call_function_lock);
+}
diff --git a/kernel/softirq.c b/kernel/softirq.c
new file mode 100644
index 0000000..e7c69a7
--- /dev/null
+++ b/kernel/softirq.c
@@ -0,0 +1,799 @@
+/*
+ * linux/kernel/softirq.c
+ *
+ * Copyright (C) 1992 Linus Torvalds
+ *
+ * Distribute under GPLv2.
+ *
+ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *
+ * Remote softirq infrastructure is by Jens Axboe.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/tick.h>
+
+#include <asm/irq.h>
+/*
+ - No shared variables, all the data are CPU local.
+ - If a softirq needs serialization, let it serialize itself
+ by its own spinlocks.
+ - Even if a softirq is serialized, only the local CPU is marked for
+ execution. Hence, we get a sort of weak CPU binding. It is still
+ not clear whether this results in better locality or not.
+
+ Examples:
+ - NET RX softirq. It is multithreaded and does not require
+ any global serialization.
+ - NET TX softirq. It kicks software netdevice queues, hence
+ it is logically serialized per device, but this serialization
+ is invisible to common code.
+ - Tasklets: serialized with respect to themselves.
+ */
+
+#ifndef __ARCH_IRQ_STAT
+irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
+EXPORT_SYMBOL(irq_stat);
+#endif
+
+static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
+
+static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+
+/*
+ * We cannot loop indefinitely here to avoid userspace starvation,
+ * but we also don't want to introduce a worst-case 1/HZ latency
+ * to the pending events, so let the scheduler balance
+ * the softirq load for us.
+ */
+static inline void wakeup_softirqd(void)
+{
+ /* Interrupts are disabled: no need to stop preemption */
+ struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+
+ if (tsk && tsk->state != TASK_RUNNING)
+ wake_up_process(tsk);
+}
+
+/*
+ * This one is for softirq.c-internal use,
+ * where hardirqs are disabled legitimately:
+ */
+#ifdef CONFIG_TRACE_IRQFLAGS
+static void __local_bh_disable(unsigned long ip)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ raw_local_irq_save(flags);
+ add_preempt_count(SOFTIRQ_OFFSET);
+ /*
+ * Were softirqs turned off above:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+}
+#else /* !CONFIG_TRACE_IRQFLAGS */
+static inline void __local_bh_disable(unsigned long ip)
+{
+ add_preempt_count(SOFTIRQ_OFFSET);
+ barrier();
+}
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+void local_bh_disable(void)
+{
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+}
+
+EXPORT_SYMBOL(local_bh_disable);
+
+void __local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+
+ /*
+ * softirqs should never be enabled by __local_bh_enable(),
+ * it always nests inside local_bh_enable() sections:
+ */
+ WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
+
+ sub_preempt_count(SOFTIRQ_OFFSET);
+}
+EXPORT_SYMBOL_GPL(__local_bh_enable);
+
+/*
+ * Special-case - softirqs can safely be enabled in
+ * cond_resched_softirq(), or by __do_softirq(),
+ * without processing still-pending softirqs:
+ */
+void _local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ sub_preempt_count(SOFTIRQ_OFFSET);
+}
+
+EXPORT_SYMBOL(_local_bh_enable);
+
+static inline void _local_bh_enable_ip(unsigned long ip)
+{
+ WARN_ON_ONCE(in_irq() || irqs_disabled());
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_disable();
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on(ip);
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
+ do_softirq();
+
+ dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_enable();
+#endif
+ preempt_check_resched();
+}
+
+void local_bh_enable(void)
+{
+ _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+EXPORT_SYMBOL(local_bh_enable);
+
+void local_bh_enable_ip(unsigned long ip)
+{
+ _local_bh_enable_ip(ip);
+}
+EXPORT_SYMBOL(local_bh_enable_ip);
+
+/*
+ * We restart softirq processing MAX_SOFTIRQ_RESTART times,
+ * and we fall back to softirqd after that.
+ *
+ * This number has been established via experimentation.
+ * The two things to balance are latency and fairness -
+ * we want to handle softirqs as soon as possible, but they
+ * should not be able to lock up the box.
+ */
+#define MAX_SOFTIRQ_RESTART 10
+
+asmlinkage void __do_softirq(void)
+{
+ struct softirq_action *h;
+ __u32 pending;
+ int max_restart = MAX_SOFTIRQ_RESTART;
+ int cpu;
+
+ pending = local_softirq_pending();
+ account_system_vtime(current);
+
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+ trace_softirq_enter();
+
+ cpu = smp_processor_id();
+restart:
+ /* Reset the pending bitmask before enabling irqs */
+ set_softirq_pending(0);
+
+ local_irq_enable();
+
+ h = softirq_vec;
+
+ do {
+ if (pending & 1) {
+ int prev_count = preempt_count();
+
+ h->action(h);
+
+ if (unlikely(prev_count != preempt_count())) {
+ printk(KERN_ERR "huh, entered softirq %td %p"
+ "with preempt_count %08x,"
+ " exited with %08x?\n", h - softirq_vec,
+ h->action, prev_count, preempt_count());
+ preempt_count() = prev_count;
+ }
+
+ rcu_bh_qsctr_inc(cpu);
+ }
+ h++;
+ pending >>= 1;
+ } while (pending);
+
+ local_irq_disable();
+
+ pending = local_softirq_pending();
+ if (pending && --max_restart)
+ goto restart;
+
+ if (pending)
+ wakeup_softirqd();
+
+ trace_softirq_exit();
+
+ account_system_vtime(current);
+ _local_bh_enable();
+}
+
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ pending = local_softirq_pending();
+
+ if (pending)
+ __do_softirq();
+
+ local_irq_restore(flags);
+}
+
+#endif
+
+/*
+ * Enter an interrupt context.
+ */
+void irq_enter(void)
+{
+ int cpu = smp_processor_id();
+
+ if (idle_cpu(cpu) && !in_interrupt()) {
+ __irq_enter();
+ tick_check_idle(cpu);
+ } else
+ __irq_enter();
+}
+
+#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+# define invoke_softirq() __do_softirq()
+#else
+# define invoke_softirq() do_softirq()
+#endif
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+ account_system_vtime(current);
+ trace_hardirq_exit();
+ sub_preempt_count(IRQ_EXIT_OFFSET);
+ if (!in_interrupt() && local_softirq_pending())
+ invoke_softirq();
+
+#ifdef CONFIG_NO_HZ
+ /* Make sure that timer wheel updates are propagated */
+ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
+ tick_nohz_stop_sched_tick(0);
+ rcu_irq_exit();
+#endif
+ preempt_enable_no_resched();
+}
+
+/*
+ * This function must run with irqs disabled!
+ */
+inline void raise_softirq_irqoff(unsigned int nr)
+{
+ __raise_softirq_irqoff(nr);
+
+ /*
+ * If we're in an interrupt or softirq, we're done
+ * (this also catches softirq-disabled code). We will
+ * actually run the softirq once we return from
+ * the irq or softirq.
+ *
+ * Otherwise we wake up ksoftirqd to make sure we
+ * schedule the softirq soon.
+ */
+ if (!in_interrupt())
+ wakeup_softirqd();
+}
+
+void raise_softirq(unsigned int nr)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raise_softirq_irqoff(nr);
+ local_irq_restore(flags);
+}
+
+void open_softirq(int nr, void (*action)(struct softirq_action *))
+{
+ softirq_vec[nr].action = action;
+}
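+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): how a subsystem wires up a softirq. EXAMPLE_SOFTIRQ is
+ * hypothetical - a real user must add its number to the softirq enum in
+ * linux/interrupt.h.
+ */
+#if 0
+static void example_softirq_action(struct softirq_action *a)
+{
+ /* Runs with interrupts enabled and softirqs disabled, on the CPU
+ that raised it. */
+}
+
+static void example_softirq_setup(void)
+{
+ open_softirq(EXAMPLE_SOFTIRQ, example_softirq_action);
+}
+
+static void example_softirq_kick(void)
+{
+ /* Safe from irq context; the action runs from irq_exit() or ksoftirqd. */
+ raise_softirq(EXAMPLE_SOFTIRQ);
+}
+#endif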
+
+/* Tasklets */
+struct tasklet_head
+{
+ struct tasklet_struct *head;
+ struct tasklet_struct **tail;
+};
+
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
+
+void __tasklet_schedule(struct tasklet_struct *t)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ t->next = NULL;
+ *__get_cpu_var(tasklet_vec).tail = t;
+ __get_cpu_var(tasklet_vec).tail = &(t->next);
+ raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__tasklet_schedule);
+
+void __tasklet_hi_schedule(struct tasklet_struct *t)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ t->next = NULL;
+ *__get_cpu_var(tasklet_hi_vec).tail = t;
+ __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
+ raise_softirq_irqoff(HI_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule);
+
+static void tasklet_action(struct softirq_action *a)
+{
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = __get_cpu_var(tasklet_vec).head;
+ __get_cpu_var(tasklet_vec).head = NULL;
+ __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
+ local_irq_enable();
+
+ while (list) {
+ struct tasklet_struct *t = list;
+
+ list = list->next;
+
+ if (tasklet_trylock(t)) {
+ if (!atomic_read(&t->count)) {
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ BUG();
+ t->func(t->data);
+ tasklet_unlock(t);
+ continue;
+ }
+ tasklet_unlock(t);
+ }
+
+ local_irq_disable();
+ t->next = NULL;
+ *__get_cpu_var(tasklet_vec).tail = t;
+ __get_cpu_var(tasklet_vec).tail = &(t->next);
+ __raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ local_irq_enable();
+ }
+}
+
+static void tasklet_hi_action(struct softirq_action *a)
+{
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = __get_cpu_var(tasklet_hi_vec).head;
+ __get_cpu_var(tasklet_hi_vec).head = NULL;
+ __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
+ local_irq_enable();
+
+ while (list) {
+ struct tasklet_struct *t = list;
+
+ list = list->next;
+
+ if (tasklet_trylock(t)) {
+ if (!atomic_read(&t->count)) {
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+ BUG();
+ t->func(t->data);
+ tasklet_unlock(t);
+ continue;
+ }
+ tasklet_unlock(t);
+ }
+
+ local_irq_disable();
+ t->next = NULL;
+ *__get_cpu_var(tasklet_hi_vec).tail = t;
+ __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+ local_irq_enable();
+ }
+}
+
+
+void tasklet_init(struct tasklet_struct *t,
+ void (*func)(unsigned long), unsigned long data)
+{
+ t->next = NULL;
+ t->state = 0;
+ atomic_set(&t->count, 0);
+ t->func = func;
+ t->data = data;
+}
+
+EXPORT_SYMBOL(tasklet_init);
+
+void tasklet_kill(struct tasklet_struct *t)
+{
+ if (in_interrupt())
+ printk("Attempt to kill tasklet from interrupt\n");
+
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+ do
+ yield();
+ while (test_bit(TASKLET_STATE_SCHED, &t->state));
+ }
+ tasklet_unlock_wait(t);
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+}
+
+EXPORT_SYMBOL(tasklet_kill);
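+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): the usual tasklet life cycle. struct example_dev, example_irq()
+ * and friends are hypothetical; tasklet_schedule() is the linux/interrupt.h
+ * wrapper around __tasklet_schedule() above.
+ */
+#if 0
+struct example_dev {
+ struct tasklet_struct tasklet;
+};
+
+static void example_tasklet_fn(unsigned long data)
+{
+ struct example_dev *dev = (struct example_dev *)data;
+
+ /* Deferred work: runs in softirq context, serialized per tasklet. */
+}
+
+static void example_dev_init(struct example_dev *dev)
+{
+ tasklet_init(&dev->tasklet, example_tasklet_fn, (unsigned long)dev);
+}
+
+static irqreturn_t example_irq(int irq, void *dev_id)
+{
+ struct example_dev *dev = dev_id;
+
+ tasklet_schedule(&dev->tasklet);
+ return IRQ_HANDLED;
+}
+
+static void example_dev_exit(struct example_dev *dev)
+{
+ tasklet_kill(&dev->tasklet); /* wait for any pending run to finish */
+}
+#endif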
+
+DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+EXPORT_PER_CPU_SYMBOL(softirq_work_list);
+
+static void __local_trigger(struct call_single_data *cp, int softirq)
+{
+ struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+
+ list_add_tail(&cp->list, head);
+
+ /* Trigger the softirq only if the list was previously empty. */
+ if (head->next == &cp->list)
+ raise_softirq_irqoff(softirq);
+}
+
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+static void remote_softirq_receive(void *data)
+{
+ struct call_single_data *cp = data;
+ unsigned long flags;
+ int softirq;
+
+ softirq = cp->priv;
+
+ local_irq_save(flags);
+ __local_trigger(cp, softirq);
+ local_irq_restore(flags);
+}
+
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+ if (cpu_online(cpu)) {
+ cp->func = remote_softirq_receive;
+ cp->info = cp;
+ cp->flags = 0;
+ cp->priv = softirq;
+
+ __smp_call_function_single(cpu, cp);
+ return 0;
+ }
+ return 1;
+}
+#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+ return 1;
+}
+#endif
+
+/**
+ * __send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @this_cpu: the currently executing cpu
+ * @softirq: the softirq for the work
+ *
+ * Attempt to schedule softirq work on a remote cpu. If this cannot be
+ * done, the work is instead queued up on the local cpu.
+ *
+ * Interrupts must be disabled.
+ */
+void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
+{
+ if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
+ __local_trigger(cp, softirq);
+}
+EXPORT_SYMBOL(__send_remote_softirq);
+
+/**
+ * send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @softirq: the softirq for the work
+ *
+ * Like __send_remote_softirq except that disabling interrupts and
+ * computing the current cpu are done for the caller.
+ */
+void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+ unsigned long flags;
+ int this_cpu;
+
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ __send_remote_softirq(cp, cpu, this_cpu, softirq);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(send_remote_softirq);
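+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): queueing a work item to a softirq on another CPU. EXAMPLE_SOFTIRQ,
+ * struct example_item and the handlers are hypothetical; the softirq action
+ * is assumed to drain this CPU's softirq_work_list[EXAMPLE_SOFTIRQ], which
+ * is where __local_trigger() links the items.
+ */
+#if 0
+struct example_item {
+ struct call_single_data csd;
+ int payload;
+};
+
+static void example_queue_on(struct example_item *item, int cpu)
+{
+ /* Raises EXAMPLE_SOFTIRQ on @cpu with item->csd queued for it. */
+ send_remote_softirq(&item->csd, cpu, EXAMPLE_SOFTIRQ);
+}
+
+static void example_softirq_action(struct softirq_action *a)
+{
+ struct list_head *head = &__get_cpu_var(softirq_work_list[EXAMPLE_SOFTIRQ]);
+ LIST_HEAD(local);
+
+ /* Items are added with interrupts disabled, so exclude irqs here too. */
+ local_irq_disable();
+ list_replace_init(head, &local);
+ local_irq_enable();
+
+ while (!list_empty(&local)) {
+ struct example_item *item;
+
+ item = list_entry(local.next, struct example_item, csd.list);
+ list_del(&item->csd.list);
+ /* process item->payload */
+ }
+}
+#endif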
+
+static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ int cpu = (unsigned long) hcpu;
+ int i;
+
+ local_irq_disable();
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+ struct list_head *local_head;
+
+ if (list_empty(head))
+ continue;
+
+ local_head = &__get_cpu_var(softirq_work_list[i]);
+ list_splice_init(head, local_head);
+ raise_softirq_irqoff(i);
+ }
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
+ .notifier_call = remote_softirq_cpu_notify,
+};
+
+void __init softirq_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+
+ per_cpu(tasklet_vec, cpu).tail =
+ &per_cpu(tasklet_vec, cpu).head;
+ per_cpu(tasklet_hi_vec, cpu).tail =
+ &per_cpu(tasklet_hi_vec, cpu).head;
+ for (i = 0; i < NR_SOFTIRQS; i++)
+ INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
+ }
+
+ register_hotcpu_notifier(&remote_softirq_cpu_notifier);
+
+ open_softirq(TASKLET_SOFTIRQ, tasklet_action);
+ open_softirq(HI_SOFTIRQ, tasklet_hi_action);
+}
+
+static int ksoftirqd(void * __bind_cpu)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+ preempt_disable();
+ if (!local_softirq_pending()) {
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ while (local_softirq_pending()) {
+ /* Disabling preemption stops the CPU from going offline.
+ If it is already offline, we'll be on the wrong CPU:
+ don't process. */
+ if (cpu_is_offline((long)__bind_cpu))
+ goto wait_to_die;
+ do_softirq();
+ preempt_enable_no_resched();
+ cond_resched();
+ preempt_disable();
+ }
+ preempt_enable();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+
+wait_to_die:
+ preempt_enable();
+ /* Wait for kthread_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * tasklet_kill_immediate is called to remove a tasklet which may already be
+ * scheduled for execution on @cpu.
+ *
+ * Unlike tasklet_kill, this function removes the tasklet
+ * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
+ *
+ * When this function is called, @cpu must be in the CPU_DEAD state.
+ */
+void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
+{
+ struct tasklet_struct **i;
+
+ BUG_ON(cpu_online(cpu));
+ BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
+
+ if (!test_bit(TASKLET_STATE_SCHED, &t->state))
+ return;
+
+ /* CPU is dead, so no lock needed. */
+ for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
+ if (*i == t) {
+ *i = t->next;
+ /* If this was the tail element, move the tail ptr */
+ if (*i == NULL)
+ per_cpu(tasklet_vec, cpu).tail = i;
+ return;
+ }
+ }
+ BUG();
+}
+
+static void takeover_tasklets(unsigned int cpu)
+{
+ /* CPU is dead, so no lock needed. */
+ local_irq_disable();
+
+ /* Find end, append list for that CPU. */
+ if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
+ *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
+ __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
+ per_cpu(tasklet_vec, cpu).head = NULL;
+ per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
+ }
+ raise_softirq_irqoff(TASKLET_SOFTIRQ);
+
+ if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
+ *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
+ __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
+ per_cpu(tasklet_hi_vec, cpu).head = NULL;
+ per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
+ }
+ raise_softirq_irqoff(HI_SOFTIRQ);
+
+ local_irq_enable();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __cpuinit cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+ struct task_struct *p;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+ if (IS_ERR(p)) {
+ printk("ksoftirqd for %i failed\n", hotcpu);
+ return NOTIFY_BAD;
+ }
+ kthread_bind(p, hotcpu);
+ per_cpu(ksoftirqd, hotcpu) = p;
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ wake_up_process(per_cpu(ksoftirqd, hotcpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ if (!per_cpu(ksoftirqd, hotcpu))
+ break;
+ /* Unbind so it can run. Fall thru. */
+ kthread_bind(per_cpu(ksoftirqd, hotcpu),
+ any_online_cpu(cpu_online_map));
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN: {
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+ p = per_cpu(ksoftirqd, hotcpu);
+ per_cpu(ksoftirqd, hotcpu) = NULL;
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+ kthread_stop(p);
+ takeover_tasklets(hotcpu);
+ break;
+ }
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+static __init int spawn_ksoftirqd(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+
+ BUG_ON(err == NOTIFY_BAD);
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
+ return 0;
+}
+early_initcall(spawn_ksoftirqd);
+
+#ifdef CONFIG_SMP
+/*
+ * Call a function on all processors
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
+{
+ int ret = 0;
+
+ preempt_disable();
+ ret = smp_call_function(func, info, wait);
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
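+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): on_each_cpu() runs the function on every online CPU, including
+ * the caller's. example_sync_state() and example_sync_all() are
+ * hypothetical.
+ */
+#if 0
+static void example_sync_state(void *info)
+{
+ /* Runs with interrupts disabled: locally here, in IPI context elsewhere. */
+}
+
+static void example_sync_all(void)
+{
+ /* wait=1: return only after every CPU has run example_sync_state(). */
+ on_each_cpu(example_sync_state, NULL, 1);
+}
+#endif
+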
+#endif
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 0000000..dc0b3be
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,371 @@
+/*
+ * Detect Soft Lockups
+ *
+ * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
+ *
+ * this code detects soft lockups: incidents where the kernel does not
+ * reschedule on a CPU for longer than the softlockup threshold.
+ */
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+#include <asm/irq_regs.h>
+
+static DEFINE_SPINLOCK(print_lock);
+
+static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, print_timestamp);
+static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+
+static int __read_mostly did_panic;
+int __read_mostly softlockup_thresh = 60;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * soft-lockup occurs:
+ */
+unsigned int __read_mostly softlockup_panic =
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+
+static int __init softlockup_panic_setup(char *str)
+{
+ softlockup_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
+static int
+softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = softlock_panic,
+};
+
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(int this_cpu)
+{
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+}
+
+static void __touch_softlockup_watchdog(void)
+{
+ int this_cpu = raw_smp_processor_id();
+
+ __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
+}
+
+void touch_softlockup_watchdog(void)
+{
+ __raw_get_cpu_var(touch_timestamp) = 0;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
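+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): code that legitimately keeps a CPU busy for a long time (e.g. a
+ * slow polling loop during bring-up) can feed the watchdog to avoid false
+ * soft-lockup reports. example_slow_poll() and example_ready() are
+ * hypothetical.
+ */
+#if 0
+static int example_ready(void); /* hypothetical completion test */
+
+static void example_slow_poll(void)
+{
+ while (!example_ready()) {
+ cpu_relax();
+ touch_softlockup_watchdog();
+ }
+}
+#endif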
+
+void touch_all_softlockup_watchdogs(void)
+{
+ int cpu;
+
+ /* Cause each CPU to re-update its timestamp rather than complain */
+ for_each_online_cpu(cpu)
+ per_cpu(touch_timestamp, cpu) = 0;
+}
+EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+
+/*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+ */
+void softlockup_tick(void)
+{
+ int this_cpu = smp_processor_id();
+ unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
+ unsigned long print_timestamp;
+ struct pt_regs *regs = get_irq_regs();
+ unsigned long now;
+
+ /* Is detection switched off? */
+ if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+ /* Be sure we don't false trigger if switched back on */
+ if (touch_timestamp)
+ per_cpu(touch_timestamp, this_cpu) = 0;
+ return;
+ }
+
+ if (touch_timestamp == 0) {
+ __touch_softlockup_watchdog();
+ return;
+ }
+
+ print_timestamp = per_cpu(print_timestamp, this_cpu);
+
+ /* report at most once a second */
+ if (print_timestamp == touch_timestamp || did_panic)
+ return;
+
+ /* do not print during early bootup: */
+ if (unlikely(system_state != SYSTEM_RUNNING)) {
+ __touch_softlockup_watchdog();
+ return;
+ }
+
+ now = get_timestamp(this_cpu);
+
+ /*
+ * Wake up the high-prio watchdog task twice per
+ * threshold timespan.
+ */
+ if (now > touch_timestamp + softlockup_thresh/2)
+ wake_up_process(per_cpu(watchdog_task, this_cpu));
+
+ /* Warn about unreasonable delays: */
+ if (now <= (touch_timestamp + softlockup_thresh))
+ return;
+
+ per_cpu(print_timestamp, this_cpu) = touch_timestamp;
+
+ spin_lock(&print_lock);
+ printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
+ this_cpu, now - touch_timestamp,
+ current->comm, task_pid_nr(current));
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+ spin_unlock(&print_lock);
+
+ if (softlockup_panic)
+ panic("softlockup: hung tasks");
+}
+
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+/*
+ * Only do the hung-tasks check on one CPU:
+ */
+static int check_cpu __read_mostly = -1;
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (t->flags & PF_FROZEN)
+ return;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (!sysctl_hung_task_warnings)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+
+ if (softlockup_panic)
+ panic("softlockup: blocked tasks");
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE task has not been woken up for
+ * a really long time (120 seconds by default). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp(this_cpu);
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if (test_taint(TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ goto unlock;
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now);
+ } while_each_thread(g, t);
+ unlock:
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * The watchdog thread - runs every second and touches the timestamp.
+ */
+static int watchdog(void *__bind_cpu)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ int this_cpu = (long)__bind_cpu;
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+
+ /* initialize timestamp */
+ __touch_softlockup_watchdog();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Run briefly once per second to reset the softlockup timestamp.
+ * If this gets delayed for more than 60 seconds then the
+ * debug-printout triggers in softlockup_tick().
+ */
+ while (!kthread_should_stop()) {
+ __touch_softlockup_watchdog();
+ schedule();
+
+ if (kthread_should_stop())
+ break;
+
+ if (this_cpu == check_cpu) {
+ if (sysctl_hung_task_timeout_secs)
+ check_hung_uninterruptible_tasks(this_cpu);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __cpuinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+ struct task_struct *p;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ BUG_ON(per_cpu(watchdog_task, hotcpu));
+ p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
+ if (IS_ERR(p)) {
+ printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
+ return NOTIFY_BAD;
+ }
+ per_cpu(touch_timestamp, hotcpu) = 0;
+ per_cpu(watchdog_task, hotcpu) = p;
+ kthread_bind(p, hotcpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ check_cpu = any_online_cpu(cpu_online_map);
+ wake_up_process(per_cpu(watchdog_task, hotcpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ if (hotcpu == check_cpu) {
+ cpumask_t temp_cpu_online_map = cpu_online_map;
+
+ cpu_clear(hotcpu, temp_cpu_online_map);
+ check_cpu = any_online_cpu(temp_cpu_online_map);
+ }
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ if (!per_cpu(watchdog_task, hotcpu))
+ break;
+ /* Unbind so it can run. Fall thru. */
+ kthread_bind(per_cpu(watchdog_task, hotcpu),
+ any_online_cpu(cpu_online_map));
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ p = per_cpu(watchdog_task, hotcpu);
+ per_cpu(watchdog_task, hotcpu) = NULL;
+ kthread_stop(p);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+ nosoftlockup = 1;
+ return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int err;
+
+ if (nosoftlockup)
+ return 0;
+
+ err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ if (err == NOTIFY_BAD) {
+ BUG();
+ return 1;
+ }
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
+
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+ return 0;
+}
+early_initcall(spawn_softlockup_task);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
new file mode 100644
index 0000000..29ab207
--- /dev/null
+++ b/kernel/spinlock.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright (2004) Linus Torvalds
+ *
+ * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
+ *
+ * Note that some architectures have special knowledge about the
+ * stack frames of these functions in their profile_pc. If you
+ * change anything significant here that could change the stack
+ * frame, contact the architecture maintainers.
+ */
+
+#include <linux/linkage.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/module.h>
+
+int __lockfunc _spin_trylock(spinlock_t *lock)
+{
+ preempt_disable();
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+
+ preempt_enable();
+ return 0;
+}
+EXPORT_SYMBOL(_spin_trylock);
+
+int __lockfunc _read_trylock(rwlock_t *lock)
+{
+ preempt_disable();
+ if (_raw_read_trylock(lock)) {
+ rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+
+ preempt_enable();
+ return 0;
+}
+EXPORT_SYMBOL(_read_trylock);
+
+int __lockfunc _write_trylock(rwlock_t *lock)
+{
+ preempt_disable();
+ if (_raw_write_trylock(lock)) {
+ rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+
+ preempt_enable();
+ return 0;
+}
+EXPORT_SYMBOL(_write_trylock);
+
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
+
+void __lockfunc _read_lock(rwlock_t *lock)
+{
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+}
+EXPORT_SYMBOL(_read_lock);
+
+unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ /*
+ * On lockdep we don't want the hand-coded irq-enable of
+ * _raw_spin_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#ifdef CONFIG_LOCKDEP
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+#else
+ _raw_spin_lock_flags(lock, &flags);
+#endif
+ return flags;
+}
+EXPORT_SYMBOL(_spin_lock_irqsave);
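+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the upstream
+ * file): these __lockfunc helpers back the spin_lock_irqsave() /
+ * spin_unlock_irqrestore() macros from linux/spinlock.h. example_lock,
+ * example_count and example_bump() are hypothetical.
+ */
+#if 0
+static DEFINE_SPINLOCK(example_lock);
+static unsigned long example_count;
+
+static void example_bump(void)
+{
+ unsigned long flags;
+
+ /* Usable from any context: irqs are disabled while the lock is held. */
+ spin_lock_irqsave(&example_lock, flags);
+ example_count++;
+ spin_unlock_irqrestore(&example_lock, flags);
+}
+#endif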
+
+void __lockfunc _spin_lock_irq(spinlock_t *lock)
+{
+ local_irq_disable();
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_irq);
+
+void __lockfunc _spin_lock_bh(spinlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_bh);
+
+unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+ return flags;
+}
+EXPORT_SYMBOL(_read_lock_irqsave);
+
+void __lockfunc _read_lock_irq(rwlock_t *lock)
+{
+ local_irq_disable();
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+}
+EXPORT_SYMBOL(_read_lock_irq);
+
+void __lockfunc _read_lock_bh(rwlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+}
+EXPORT_SYMBOL(_read_lock_bh);
+
+unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+ return flags;
+}
+EXPORT_SYMBOL(_write_lock_irqsave);
+
+void __lockfunc _write_lock_irq(rwlock_t *lock)
+{
+ local_irq_disable();
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+}
+EXPORT_SYMBOL(_write_lock_irq);
+
+void __lockfunc _write_lock_bh(rwlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+}
+EXPORT_SYMBOL(_write_lock_bh);
+
+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+ preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptible if possible), and we also signal
+ * to the other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc _##op##_lock(locktype##_t *lock) \
+{ \
+ for (;;) { \
+ preempt_disable(); \
+ if (likely(_raw_##op##_trylock(lock))) \
+ break; \
+ preempt_enable(); \
+ \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ while (!op##_can_lock(lock) && (lock)->break_lock) \
+ _raw_##op##_relax(&lock->raw_lock); \
+ } \
+ (lock)->break_lock = 0; \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock); \
+ \
+unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
+{ \
+ unsigned long flags; \
+ \
+ for (;;) { \
+ preempt_disable(); \
+ local_irq_save(flags); \
+ if (likely(_raw_##op##_trylock(lock))) \
+ break; \
+ local_irq_restore(flags); \
+ preempt_enable(); \
+ \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ while (!op##_can_lock(lock) && (lock)->break_lock) \
+ _raw_##op##_relax(&lock->raw_lock); \
+ } \
+ (lock)->break_lock = 0; \
+ return flags; \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irqsave); \
+ \
+void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
+{ \
+ _##op##_lock_irqsave(lock); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_irq); \
+ \
+void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
+{ \
+ unsigned long flags; \
+ \
+ /* \
+ * Careful: we must exclude softirqs too, hence the \
+ * irq-disabling. We use the generic preemption-aware \
+ * function: \
+ */ \
+ flags = _##op##_lock_irqsave(lock); \
+ local_bh_disable(); \
+ local_irq_restore(flags); \
+} \
+ \
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ * _[spin|read|write]_lock()
+ * _[spin|read|write]_lock_irq()
+ * _[spin|read|write]_lock_irqsave()
+ * _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock);
+BUILD_LOCK_OPS(read, rwlock);
+BUILD_LOCK_OPS(write, rwlock);
+
+#endif /* CONFIG_PREEMPT */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_nested);
+
+unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ /*
+ * On lockdep we don't want the hand-coded irq-enable of
+ * _raw_spin_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#ifdef CONFIG_LOCKDEP
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+#else
+ _raw_spin_lock_flags(lock, &flags);
+#endif
+ return flags;
+}
+EXPORT_SYMBOL(_spin_lock_irqsave_nested);
+
+void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ preempt_disable();
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_nest_lock);
+
+#endif
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
+void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock_irqrestore);
+
+void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ local_irq_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock_irq);
+
+void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_spin_unlock(lock);
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+EXPORT_SYMBOL(_spin_unlock_bh);
+
+void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock_irqrestore);
+
+void __lockfunc _read_unlock_irq(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ local_irq_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock_irq);
+
+void __lockfunc _read_unlock_bh(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_read_unlock(lock);
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+EXPORT_SYMBOL(_read_unlock_bh);
+
+void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock_irqrestore);
+
+void __lockfunc _write_unlock_irq(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ local_irq_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock_irq);
+
+void __lockfunc _write_unlock_bh(rwlock_t *lock)
+{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
+ _raw_write_unlock(lock);
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+EXPORT_SYMBOL(_write_unlock_bh);
+
+int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+{
+ local_bh_disable();
+ preempt_disable();
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ return 1;
+ }
+
+ preempt_enable_no_resched();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+ return 0;
+}
+EXPORT_SYMBOL(_spin_trylock_bh);
+
+notrace int in_lock_functions(unsigned long addr)
+{
+ /* Linker adds these: start and end of __lockfunc functions */
+ extern char __lock_text_start[], __lock_text_end[];
+
+ return addr >= (unsigned long)__lock_text_start
+ && addr < (unsigned long)__lock_text_end;
+}
+EXPORT_SYMBOL(in_lock_functions);
diff --git a/kernel/srcu.c b/kernel/srcu.c
new file mode 100644
index 0000000..b0aeeaf
--- /dev/null
+++ b/kernel/srcu.c
@@ -0,0 +1,257 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/srcu.h>
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ sp->completed = 0;
+ mutex_init(&sp->mutex);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ return (sp->per_cpu_ref ? 0 : -ENOMEM);
+}
+
+/*
+ * srcu_readers_active_idx -- returns approximate number of readers
+ * active on the specified rank of per-CPU counters.
+ */
+
+static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ int sum;
+
+ sum = 0;
+ for_each_possible_cpu(cpu)
+ sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
+ return sum;
+}
+
+/**
+ * srcu_readers_active - returns approximate number of readers.
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static int srcu_readers_active(struct srcu_struct *sp)
+{
+ return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ int sum;
+
+ sum = srcu_readers_active(sp);
+ WARN_ON(sum); /* Leakage unless caller handles error. */
+ if (sum != 0)
+ return;
+ free_percpu(sp->per_cpu_ref);
+ sp->per_cpu_ref = NULL;
+}
+
+/**
+ * srcu_read_lock - register a new reader for an SRCU-protected structure.
+ * @sp: srcu_struct in which to register the new reader.
+ *
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Must be called from process context.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ preempt_disable();
+ idx = sp->completed & 0x1;
+ barrier(); /* ensure compiler looks -once- at sp->completed. */
+ per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
+ srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ preempt_enable();
+ return idx;
+}
+
+/**
+ * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
+ * @sp: srcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding srcu_read_lock().
+ *
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ * Must be called from process context.
+ */
+void srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ preempt_disable();
+ srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+ preempt_enable();
+}
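+/*
+ * A minimal usage sketch (illustrative only; my_srcu, my_pointer and the
+ * update side shown here are hypothetical, not part of this file):
+ *
+ *	idx = srcu_read_lock(&my_srcu);
+ *	p = rcu_dereference(my_pointer);
+ *	... use p, possibly sleeping ...
+ *	srcu_read_unlock(&my_srcu, idx);
+ *
+ * An updater publishes a new version with rcu_assign_pointer(), calls
+ * synchronize_srcu(&my_srcu), and only then frees the old version.
+ */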
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates. Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = sp->completed;
+ mutex_lock(&sp->mutex);
+
+ /*
+ * Check to see if someone else did the work for us while we were
+ * waiting to acquire the lock. We need -two- advances of
+ * the counter, not just one. If there was but one, we might have
+ * shown up -after- our helper's first synchronize_sched(), thus
+ * having failed to prevent CPU-reordering races with concurrent
+ * srcu_read_unlock()s on other CPUs (see comment below). So we
+ * either (1) wait for two or (2) supply the second ourselves.
+ */
+
+ if ((sp->completed - idx) >= 2) {
+ mutex_unlock(&sp->mutex);
+ return;
+ }
+
+ synchronize_sched(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * The preceding synchronize_sched() ensures that any CPU that
+ * sees the new value of sp->completed will also see any preceding
+ * changes to data structures made by this CPU. This prevents
+ * some other CPU from reordering the accesses in its SRCU
+ * read-side critical section to precede the corresponding
+ * srcu_read_lock() -- ensuring that such references will in
+ * fact be protected.
+ *
+ * So it is now safe to do the flip.
+ */
+
+ idx = sp->completed & 0x1;
+ sp->completed++;
+
+ synchronize_sched(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * At this point, because of the preceding synchronize_sched(),
+ * all srcu_read_lock() calls using the old counters have completed.
+ * Their corresponding critical sections might well be still
+ * executing, but the srcu_read_lock() primitives themselves
+ * will have finished executing.
+ */
+
+ while (srcu_readers_active_idx(sp, idx))
+ schedule_timeout_interruptible(1);
+
+ synchronize_sched(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * The preceding synchronize_sched() forces all srcu_read_unlock()
+ * primitives that were executing concurrently with the preceding
+ * for_each_possible_cpu() loop to have completed by this point.
+ * More importantly, it also forces the corresponding SRCU read-side
+ * critical sections to have also completed, and the corresponding
+ * references to SRCU-protected data items to be dropped.
+ *
+ * Note:
+ *
+ * Despite what you might think at first glance, the
+ * preceding synchronize_sched() -must- be within the
+ * critical section ended by the following mutex_unlock().
+ * Otherwise, a task taking the early exit can race
+ * with a srcu_read_unlock(), which might have executed
+ * just before the preceding srcu_readers_active() check,
+ * and whose CPU might have reordered the srcu_read_unlock()
+ * with the preceding critical section. In this case, there
+ * is nothing preventing the synchronize_sched() task that is
+ * taking the early exit from freeing a data structure that
+ * is still being referenced (out of order) by the task
+ * doing the srcu_read_unlock().
+ *
+ * Alternatively, the comparison with "2" on the early exit
+ * could be changed to "3", but this increases synchronize_srcu()
+ * latency for bulk loads. So the current code is preferred.
+ */
+
+ mutex_unlock(&sp->mutex);
+}
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+
+long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return sp->completed;
+}
+
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(srcu_read_lock);
+EXPORT_SYMBOL_GPL(srcu_read_unlock);
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
new file mode 100644
index 0000000..94b527e
--- /dev/null
+++ b/kernel/stacktrace.c
@@ -0,0 +1,26 @@
+/*
+ * kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
+
+void print_stack_trace(struct stack_trace *trace, int spaces)
+{
+ int i;
+
+ if (WARN_ON(!trace->entries))
+ return;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ printk("%*c", 1 + spaces, ' ');
+ print_ip_sym(trace->entries[i]);
+ }
+}
+EXPORT_SYMBOL_GPL(print_stack_trace);
+
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
new file mode 100644
index 0000000..24e8cea
--- /dev/null
+++ b/kernel/stop_machine.c
@@ -0,0 +1,164 @@
+/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+ * GPL v2 and any later version.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/stop_machine.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+/* This controls the threads on each CPU. */
+enum stopmachine_state {
+ /* Dummy starting state for thread. */
+ STOPMACHINE_NONE,
+ /* Awaiting everyone to be scheduled. */
+ STOPMACHINE_PREPARE,
+ /* Disable interrupts. */
+ STOPMACHINE_DISABLE_IRQ,
+ /* Run the function */
+ STOPMACHINE_RUN,
+ /* Exit */
+ STOPMACHINE_EXIT,
+};
+static enum stopmachine_state state;
+
+struct stop_machine_data {
+ int (*fn)(void *);
+ void *data;
+ int fnret;
+};
+
+/* Like num_online_cpus(), but CPU hotplug itself uses stop_machine, so we keep our own thread count. */
+static unsigned int num_threads;
+static atomic_t thread_ack;
+static DEFINE_MUTEX(lock);
+
+static struct workqueue_struct *stop_machine_wq;
+static struct stop_machine_data active, idle;
+static const cpumask_t *active_cpus;
+static void *stop_machine_work;
+
+static void set_state(enum stopmachine_state newstate)
+{
+ /* Reset ack counter. */
+ atomic_set(&thread_ack, num_threads);
+ smp_wmb();
+ state = newstate;
+}
+
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
+{
+ if (atomic_dec_and_test(&thread_ack))
+ set_state(state + 1);
+}
+
+/* This is the actual function which stops the CPU. It runs
+ * in the context of a dedicated stopmachine workqueue. */
+static void stop_cpu(struct work_struct *unused)
+{
+ enum stopmachine_state curstate = STOPMACHINE_NONE;
+ struct stop_machine_data *smdata = &idle;
+ int cpu = smp_processor_id();
+ int err;
+
+ if (!active_cpus) {
+ if (cpu == first_cpu(cpu_online_map))
+ smdata = &active;
+ } else {
+ if (cpu_isset(cpu, *active_cpus))
+ smdata = &active;
+ }
+ /* Simple state machine */
+ do {
+ /* Chill out and ensure we re-read stopmachine_state. */
+ cpu_relax();
+ if (state != curstate) {
+ curstate = state;
+ switch (curstate) {
+ case STOPMACHINE_DISABLE_IRQ:
+ local_irq_disable();
+ hard_irq_disable();
+ break;
+ case STOPMACHINE_RUN:
+ /* On multiple CPUs only a single error code
+ * is needed to tell that something failed. */
+ err = smdata->fn(smdata->data);
+ if (err)
+ smdata->fnret = err;
+ break;
+ default:
+ break;
+ }
+ ack_state();
+ }
+ } while (curstate != STOPMACHINE_EXIT);
+
+ local_irq_enable();
+}
+
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
+{
+ return 0;
+}
+
+int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
+{
+ struct work_struct *sm_work;
+ int i, ret;
+
+ /* Set up initial state. */
+ mutex_lock(&lock);
+ num_threads = num_online_cpus();
+ active_cpus = cpus;
+ active.fn = fn;
+ active.data = data;
+ active.fnret = 0;
+ idle.fn = chill;
+ idle.data = NULL;
+
+ set_state(STOPMACHINE_PREPARE);
+
+ /* Schedule the stop_cpu work on all cpus: hold this CPU so one
+ * doesn't hit this CPU until we're ready. */
+ get_cpu();
+ for_each_online_cpu(i) {
+ sm_work = percpu_ptr(stop_machine_work, i);
+ INIT_WORK(sm_work, stop_cpu);
+ queue_work_on(i, stop_machine_wq, sm_work);
+ }
+ /* This will release the thread on our CPU. */
+ put_cpu();
+ flush_workqueue(stop_machine_wq);
+ ret = active.fnret;
+ mutex_unlock(&lock);
+ return ret;
+}
+
+int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
+{
+ int ret;
+
+ /* No CPUs can come up or down during this. */
+ get_online_cpus();
+ ret = __stop_machine(fn, data, cpus);
+ put_online_cpus();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(stop_machine);
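+/*
+ * Illustrative caller sketch (my_update is hypothetical, not part of this
+ * file):
+ *
+ *	static int my_update(void *data)
+ *	{
+ *		... runs with every other online CPU spinning, IRQs off ...
+ *		return 0;
+ *	}
+ *
+ *	err = stop_machine(my_update, NULL, NULL);
+ *
+ * With a NULL cpumask the callback runs on the first online CPU; passing a
+ * cpumask runs it on each CPU in that mask instead.
+ */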
+
+static int __init stop_machine_init(void)
+{
+ stop_machine_wq = create_rt_workqueue("kstop");
+ stop_machine_work = alloc_percpu(struct work_struct);
+ return 0;
+}
+core_initcall(stop_machine_init);
diff --git a/kernel/sys.c b/kernel/sys.c
new file mode 100644
index 0000000..5608c66
--- /dev/null
+++ b/kernel/sys.c
@@ -0,0 +1,1800 @@
+/*
+ * linux/kernel/sys.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/highuid.h>
+#include <linux/fs.h>
+#include <linux/resource.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/workqueue.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/key.h>
+#include <linux/times.h>
+#include <linux/posix-timers.h>
+#include <linux/security.h>
+#include <linux/dcookies.h>
+#include <linux/suspend.h>
+#include <linux/tty.h>
+#include <linux/signal.h>
+#include <linux/cn_proc.h>
+#include <linux/getcpu.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/seccomp.h>
+#include <linux/cpu.h>
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/kprobes.h>
+#include <linux/user_namespace.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#ifndef SET_UNALIGN_CTL
+# define SET_UNALIGN_CTL(a,b) (-EINVAL)
+#endif
+#ifndef GET_UNALIGN_CTL
+# define GET_UNALIGN_CTL(a,b) (-EINVAL)
+#endif
+#ifndef SET_FPEMU_CTL
+# define SET_FPEMU_CTL(a,b) (-EINVAL)
+#endif
+#ifndef GET_FPEMU_CTL
+# define GET_FPEMU_CTL(a,b) (-EINVAL)
+#endif
+#ifndef SET_FPEXC_CTL
+# define SET_FPEXC_CTL(a,b) (-EINVAL)
+#endif
+#ifndef GET_FPEXC_CTL
+# define GET_FPEXC_CTL(a,b) (-EINVAL)
+#endif
+#ifndef GET_ENDIAN
+# define GET_ENDIAN(a,b) (-EINVAL)
+#endif
+#ifndef SET_ENDIAN
+# define SET_ENDIAN(a,b) (-EINVAL)
+#endif
+#ifndef GET_TSC_CTL
+# define GET_TSC_CTL(a) (-EINVAL)
+#endif
+#ifndef SET_TSC_CTL
+# define SET_TSC_CTL(a) (-EINVAL)
+#endif
+
+/*
+ * this is where the system-wide overflow UID and GID are defined, for
+ * architectures that now have 32-bit UID/GID but didn't in the past
+ */
+
+int overflowuid = DEFAULT_OVERFLOWUID;
+int overflowgid = DEFAULT_OVERFLOWGID;
+
+#ifdef CONFIG_UID16
+EXPORT_SYMBOL(overflowuid);
+EXPORT_SYMBOL(overflowgid);
+#endif
+
+/*
+ * the same as above, but for filesystems which can only store a 16-bit
+ * UID and GID. as such, this is needed on all architectures
+ */
+
+int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
+
+EXPORT_SYMBOL(fs_overflowuid);
+EXPORT_SYMBOL(fs_overflowgid);
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+struct pid *cad_pid;
+EXPORT_SYMBOL(cad_pid);
+
+/*
+ * If set, this is used for preparing the system to power off.
+ */
+
+void (*pm_power_off_prepare)(void);
+
+static int set_one_prio(struct task_struct *p, int niceval, int error)
+{
+ int no_nice;
+
+ if (p->uid != current->euid &&
+ p->euid != current->euid && !capable(CAP_SYS_NICE)) {
+ error = -EPERM;
+ goto out;
+ }
+ if (niceval < task_nice(p) && !can_nice(p, niceval)) {
+ error = -EACCES;
+ goto out;
+ }
+ no_nice = security_task_setnice(p, niceval);
+ if (no_nice) {
+ error = no_nice;
+ goto out;
+ }
+ if (error == -ESRCH)
+ error = 0;
+ set_user_nice(p, niceval);
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
+{
+ struct task_struct *g, *p;
+ struct user_struct *user;
+ int error = -EINVAL;
+ struct pid *pgrp;
+
+ if (which > PRIO_USER || which < PRIO_PROCESS)
+ goto out;
+
+ /* normalize: avoid signed division (rounding problems) */
+ error = -ESRCH;
+ if (niceval < -20)
+ niceval = -20;
+ if (niceval > 19)
+ niceval = 19;
+
+ read_lock(&tasklist_lock);
+ switch (which) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ user = current->user;
+ if (!who)
+ who = current->uid;
+ else
+ if ((who != current->uid) && !(user = find_user(who)))
+ goto out_unlock; /* No processes for this user */
+
+ do_each_thread(g, p)
+ if (p->uid == who)
+ error = set_one_prio(p, niceval, error);
+ while_each_thread(g, p);
+ if (who != current->uid)
+ free_uid(user); /* For find_user() */
+ break;
+ }
+out_unlock:
+ read_unlock(&tasklist_lock);
+out:
+ return error;
+}
+
+/*
+ * Ugh. To avoid negative return values, "getpriority()" will
+ * not return the normal nice-value, but a negated value that
+ * has been offset by 20 (ie it returns 40..1 instead of -20..19)
+ * to stay compatible.
+ */
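+/* For example, a nice value of -20 is returned as 40, 0 as 20, and 19 as 1. */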
+SYSCALL_DEFINE2(getpriority, int, which, int, who)
+{
+ struct task_struct *g, *p;
+ struct user_struct *user;
+ long niceval, retval = -ESRCH;
+ struct pid *pgrp;
+
+ if (which > PRIO_USER || which < PRIO_PROCESS)
+ return -EINVAL;
+
+ read_lock(&tasklist_lock);
+ switch (which) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ user = current->user;
+ if (!who)
+ who = current->uid;
+ else
+ if ((who != current->uid) && !(user = find_user(who)))
+ goto out_unlock; /* No processes for this user */
+
+ do_each_thread(g, p)
+ if (p->uid == who) {
+ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+ retval = niceval;
+ }
+ while_each_thread(g, p);
+ if (who != current->uid)
+ free_uid(user); /* for find_user() */
+ break;
+ }
+out_unlock:
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+/**
+ * emergency_restart - reboot the system
+ *
+ * Without shutting down any hardware or taking any locks
+ * reboot the system. This is called when we know we are in
+ * trouble so this is our best effort to reboot. This is
+ * safe to call in interrupt context.
+ */
+void emergency_restart(void)
+{
+ machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
+void kernel_restart_prepare(char *cmd)
+{
+ blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+ system_state = SYSTEM_RESTART;
+ device_shutdown();
+ sysdev_shutdown();
+}
+
+/**
+ * kernel_restart - reboot the system
+ * @cmd: pointer to buffer containing command to execute for restart
+ * or %NULL
+ *
+ * Shutdown everything and perform a clean reboot.
+ * This is not safe to call in interrupt context.
+ */
+void kernel_restart(char *cmd)
+{
+ kernel_restart_prepare(cmd);
+ if (!cmd)
+ printk(KERN_EMERG "Restarting system.\n");
+ else
+ printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+ machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+static void kernel_shutdown_prepare(enum system_states state)
+{
+ blocking_notifier_call_chain(&reboot_notifier_list,
+ (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
+ system_state = state;
+ device_shutdown();
+}
+/**
+ * kernel_halt - halt the system
+ *
+ * Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt(void)
+{
+ kernel_shutdown_prepare(SYSTEM_HALT);
+ sysdev_shutdown();
+ printk(KERN_EMERG "System halted.\n");
+ machine_halt();
+}
+
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+/**
+ * kernel_power_off - power_off the system
+ *
+ * Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off(void)
+{
+ kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+ if (pm_power_off_prepare)
+ pm_power_off_prepare();
+ disable_nonboot_cpus();
+ sysdev_shutdown();
+ printk(KERN_EMERG "Power down.\n");
+ machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
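+/*
+ * A typical raw invocation from userspace (illustrative only) supplies the
+ * magic values together with a command, e.g.:
+ *
+ *	syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
+ *		LINUX_REBOOT_CMD_RESTART, NULL);
+ */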
+SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
+ void __user *, arg)
+{
+ char buffer[256];
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ /* For safety, we require "magic" arguments. */
+ if (magic1 != LINUX_REBOOT_MAGIC1 ||
+ (magic2 != LINUX_REBOOT_MAGIC2 &&
+ magic2 != LINUX_REBOOT_MAGIC2A &&
+ magic2 != LINUX_REBOOT_MAGIC2B &&
+ magic2 != LINUX_REBOOT_MAGIC2C))
+ return -EINVAL;
+
+ /* Instead of trying to make the power_off code look like
+ * halt when pm_power_off is not set, do it the easy way.
+ */
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ cmd = LINUX_REBOOT_CMD_HALT;
+
+ lock_kernel();
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART:
+ kernel_restart(NULL);
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_ON:
+ C_A_D = 1;
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_OFF:
+ C_A_D = 0;
+ break;
+
+ case LINUX_REBOOT_CMD_HALT:
+ kernel_halt();
+ unlock_kernel();
+ do_exit(0);
+ break;
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ kernel_power_off();
+ unlock_kernel();
+ do_exit(0);
+ break;
+
+ case LINUX_REBOOT_CMD_RESTART2:
+ if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
+ unlock_kernel();
+ return -EFAULT;
+ }
+ buffer[sizeof(buffer) - 1] = '\0';
+
+ kernel_restart(buffer);
+ break;
+
+#ifdef CONFIG_KEXEC
+ case LINUX_REBOOT_CMD_KEXEC:
+ {
+ int ret;
+ ret = kernel_kexec();
+ unlock_kernel();
+ return ret;
+ }
+#endif
+
+#ifdef CONFIG_HIBERNATION
+ case LINUX_REBOOT_CMD_SW_SUSPEND:
+ {
+ int ret = hibernate();
+ unlock_kernel();
+ return ret;
+ }
+#endif
+
+ default:
+ unlock_kernel();
+ return -EINVAL;
+ }
+ unlock_kernel();
+ return 0;
+}
+
+static void deferred_cad(struct work_struct *dummy)
+{
+ kernel_restart(NULL);
+}
+
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+ static DECLARE_WORK(cad_work, deferred_cad);
+
+ if (C_A_D)
+ schedule_work(&cad_work);
+ else
+ kill_cad_pid(SIGINT, 1);
+}
+
+/*
+ * Unprivileged users may change the real gid to the effective gid
+ * or vice versa. (BSD-style)
+ *
+ * If you set the real gid at all, or set the effective gid to a value not
+ * equal to the real gid, then the saved gid is set to the new effective gid.
+ *
+ * This makes it possible for a setgid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit of a program.
+ *
+ * The general idea is that a program which uses just setregid() will be
+ * 100% compatible with BSD. A program which uses just setgid() will be
+ * 100% compatible with POSIX with saved IDs.
+ *
+ * SMP: There are no races; the GIDs are checked only by filesystem
+ * operations (as far as semantic preservation is concerned).
+ */
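+/*
+ * For example, a setgid program can drop its group privilege for good with
+ * setregid(getgid(), getgid()): because the real gid is being (re)set, the
+ * saved gid is overwritten with the new effective gid as well.
+ */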
+SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
+{
+ int old_rgid = current->gid;
+ int old_egid = current->egid;
+ int new_rgid = old_rgid;
+ int new_egid = old_egid;
+ int retval;
+
+ retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
+ if (retval)
+ return retval;
+
+ if (rgid != (gid_t) -1) {
+ if ((old_rgid == rgid) ||
+ (current->egid==rgid) ||
+ capable(CAP_SETGID))
+ new_rgid = rgid;
+ else
+ return -EPERM;
+ }
+ if (egid != (gid_t) -1) {
+ if ((old_rgid == egid) ||
+ (current->egid == egid) ||
+ (current->sgid == egid) ||
+ capable(CAP_SETGID))
+ new_egid = egid;
+ else
+ return -EPERM;
+ }
+ if (new_egid != old_egid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ if (rgid != (gid_t) -1 ||
+ (egid != (gid_t) -1 && egid != old_rgid))
+ current->sgid = new_egid;
+ current->fsgid = new_egid;
+ current->egid = new_egid;
+ current->gid = new_rgid;
+ key_fsgid_changed(current);
+ proc_id_connector(current, PROC_EVENT_GID);
+ return 0;
+}
+
+/*
+ * setgid() is implemented like SysV w/ SAVED_IDS
+ *
+ * SMP: Same implicit races as above.
+ */
+SYSCALL_DEFINE1(setgid, gid_t, gid)
+{
+ int old_egid = current->egid;
+ int retval;
+
+ retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
+ if (retval)
+ return retval;
+
+ if (capable(CAP_SETGID)) {
+ if (old_egid != gid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->gid = current->egid = current->sgid = current->fsgid = gid;
+ } else if ((gid == current->gid) || (gid == current->sgid)) {
+ if (old_egid != gid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->egid = current->fsgid = gid;
+ }
+ else
+ return -EPERM;
+
+ key_fsgid_changed(current);
+ proc_id_connector(current, PROC_EVENT_GID);
+ return 0;
+}
+
+static int set_user(uid_t new_ruid, int dumpclear)
+{
+ struct user_struct *new_user;
+
+ new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
+ if (!new_user)
+ return -EAGAIN;
+
+ if (atomic_read(&new_user->processes) >=
+ current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
+ new_user != current->nsproxy->user_ns->root_user) {
+ free_uid(new_user);
+ return -EAGAIN;
+ }
+
+ switch_uid(new_user);
+
+ if (dumpclear) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->uid = new_ruid;
+ return 0;
+}
+
+/*
+ * Unprivileged users may change the real uid to the effective uid
+ * or vice versa. (BSD-style)
+ *
+ * If you set the real uid at all, or set the effective uid to a value not
+ * equal to the real uid, then the saved uid is set to the new effective uid.
+ *
+ * This makes it possible for a setuid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit of a program.
+ *
+ * The general idea is that a program which uses just setreuid() will be
+ * 100% compatible with BSD. A program which uses just setuid() will be
+ * 100% compatible with POSIX with saved IDs.
+ */
+SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
+{
+ int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+ int retval;
+
+ retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
+ if (retval)
+ return retval;
+
+ new_ruid = old_ruid = current->uid;
+ new_euid = old_euid = current->euid;
+ old_suid = current->suid;
+
+ if (ruid != (uid_t) -1) {
+ new_ruid = ruid;
+ if ((old_ruid != ruid) &&
+ (current->euid != ruid) &&
+ !capable(CAP_SETUID))
+ return -EPERM;
+ }
+
+ if (euid != (uid_t) -1) {
+ new_euid = euid;
+ if ((old_ruid != euid) &&
+ (current->euid != euid) &&
+ (current->suid != euid) &&
+ !capable(CAP_SETUID))
+ return -EPERM;
+ }
+
+ if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
+ return -EAGAIN;
+
+ if (new_euid != old_euid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->fsuid = current->euid = new_euid;
+ if (ruid != (uid_t) -1 ||
+ (euid != (uid_t) -1 && euid != old_ruid))
+ current->suid = current->euid;
+ current->fsuid = current->euid;
+
+ key_fsuid_changed(current);
+ proc_id_connector(current, PROC_EVENT_UID);
+
+ return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
+}
+
+
+
+/*
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
+ * Note that SAVED_IDS is deficient in that a setuid root program
+ * like sendmail, for example, cannot set its uid to be a normal
+ * user and then switch back, because if you're root, setuid() sets
+ * the saved uid too. If you don't like this, blame the bright people
+ * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
+ * will allow a root program to temporarily drop privileges and be able to
+ * regain them by swapping the real and effective uid.
+ */
+SYSCALL_DEFINE1(setuid, uid_t, uid)
+{
+ int old_euid = current->euid;
+ int old_ruid, old_suid, new_suid;
+ int retval;
+
+ retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
+ if (retval)
+ return retval;
+
+ old_ruid = current->uid;
+ old_suid = current->suid;
+ new_suid = old_suid;
+
+ if (capable(CAP_SETUID)) {
+ if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
+ return -EAGAIN;
+ new_suid = uid;
+ } else if ((uid != current->uid) && (uid != new_suid))
+ return -EPERM;
+
+ if (old_euid != uid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->fsuid = current->euid = uid;
+ current->suid = new_suid;
+
+ key_fsuid_changed(current);
+ proc_id_connector(current, PROC_EVENT_UID);
+
+ return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+}
+
+
+/*
+ * This function implements a generic ability to update ruid, euid,
+ * and suid. This allows you to implement the 4.4 compatible seteuid().
+ */
+SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+{
+ int old_ruid = current->uid;
+ int old_euid = current->euid;
+ int old_suid = current->suid;
+ int retval;
+
+ retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
+ if (retval)
+ return retval;
+
+ if (!capable(CAP_SETUID)) {
+ if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
+ (ruid != current->euid) && (ruid != current->suid))
+ return -EPERM;
+ if ((euid != (uid_t) -1) && (euid != current->uid) &&
+ (euid != current->euid) && (euid != current->suid))
+ return -EPERM;
+ if ((suid != (uid_t) -1) && (suid != current->uid) &&
+ (suid != current->euid) && (suid != current->suid))
+ return -EPERM;
+ }
+ if (ruid != (uid_t) -1) {
+ if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
+ return -EAGAIN;
+ }
+ if (euid != (uid_t) -1) {
+ if (euid != current->euid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->euid = euid;
+ }
+ current->fsuid = current->euid;
+ if (suid != (uid_t) -1)
+ current->suid = suid;
+
+ key_fsuid_changed(current);
+ proc_id_connector(current, PROC_EVENT_UID);
+
+ return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+}
+
+SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
+{
+ int retval;
+
+ if (!(retval = put_user(current->uid, ruid)) &&
+ !(retval = put_user(current->euid, euid)))
+ retval = put_user(current->suid, suid);
+
+ return retval;
+}
+
+/*
+ * Same as above, but for rgid, egid, sgid.
+ */
+SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+{
+ int retval;
+
+ retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
+ if (retval)
+ return retval;
+
+ if (!capable(CAP_SETGID)) {
+ if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
+ (rgid != current->egid) && (rgid != current->sgid))
+ return -EPERM;
+ if ((egid != (gid_t) -1) && (egid != current->gid) &&
+ (egid != current->egid) && (egid != current->sgid))
+ return -EPERM;
+ if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
+ (sgid != current->egid) && (sgid != current->sgid))
+ return -EPERM;
+ }
+ if (egid != (gid_t) -1) {
+ if (egid != current->egid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->egid = egid;
+ }
+ current->fsgid = current->egid;
+ if (rgid != (gid_t) -1)
+ current->gid = rgid;
+ if (sgid != (gid_t) -1)
+ current->sgid = sgid;
+
+ key_fsgid_changed(current);
+ proc_id_connector(current, PROC_EVENT_GID);
+ return 0;
+}
+
+SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
+{
+ int retval;
+
+ if (!(retval = put_user(current->gid, rgid)) &&
+ !(retval = put_user(current->egid, egid)))
+ retval = put_user(current->sgid, sgid);
+
+ return retval;
+}
+
+
+/*
+ * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
+ * is used for "access()" and for the NFS daemon (letting nfsd stay at
+ * whatever uid it wants to). It normally shadows "euid", except when
+ * explicitly set by setfsuid() or for access..
+ */
+SYSCALL_DEFINE1(setfsuid, uid_t, uid)
+{
+ int old_fsuid;
+
+ old_fsuid = current->fsuid;
+ if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
+ return old_fsuid;
+
+ if (uid == current->uid || uid == current->euid ||
+ uid == current->suid || uid == current->fsuid ||
+ capable(CAP_SETUID)) {
+ if (uid != old_fsuid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->fsuid = uid;
+ }
+
+ key_fsuid_changed(current);
+ proc_id_connector(current, PROC_EVENT_UID);
+
+ security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+
+ return old_fsuid;
+}
+
+/*
+ * Same as above, but for the fsgid ("Samma på svenska").
+ */
+SYSCALL_DEFINE1(setfsgid, gid_t, gid)
+{
+ int old_fsgid;
+
+ old_fsgid = current->fsgid;
+ if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
+ return old_fsgid;
+
+ if (gid == current->gid || gid == current->egid ||
+ gid == current->sgid || gid == current->fsgid ||
+ capable(CAP_SETGID)) {
+ if (gid != old_fsgid) {
+ set_dumpable(current->mm, suid_dumpable);
+ smp_wmb();
+ }
+ current->fsgid = gid;
+ key_fsgid_changed(current);
+ proc_id_connector(current, PROC_EVENT_GID);
+ }
+ return old_fsgid;
+}
+
+void do_sys_times(struct tms *tms)
+{
+ struct task_cputime cputime;
+ cputime_t cutime, cstime;
+
+ spin_lock_irq(&current->sighand->siglock);
+ thread_group_cputime(current, &cputime);
+ cutime = current->signal->cutime;
+ cstime = current->signal->cstime;
+ spin_unlock_irq(&current->sighand->siglock);
+ tms->tms_utime = cputime_to_clock_t(cputime.utime);
+ tms->tms_stime = cputime_to_clock_t(cputime.stime);
+ tms->tms_cutime = cputime_to_clock_t(cutime);
+ tms->tms_cstime = cputime_to_clock_t(cstime);
+}
+
+SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
+{
+ if (tbuf) {
+ struct tms tmp;
+
+ do_sys_times(&tmp);
+ if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
+ return -EFAULT;
+ }
+ return (long) jiffies_64_to_clock_t(get_jiffies_64());
+}
+
+/*
+ * This needs some heavy checking ...
+ * I just haven't the stomach for it. I also don't fully
+ * understand sessions/pgrp etc. Let somebody who does explain it.
+ *
+ * OK, I think I have the protection semantics right.... this is really
+ * only important on a multi-user system anyway, to make sure one user
+ * can't send a signal to a process owned by another. -TYT, 12/12/91
+ *
+ * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
+ * LBT 04.03.94
+ */
+SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
+{
+ struct task_struct *p;
+ struct task_struct *group_leader = current->group_leader;
+ struct pid *pgrp;
+ int err;
+
+ if (!pid)
+ pid = task_pid_vnr(group_leader);
+ if (!pgid)
+ pgid = pid;
+ if (pgid < 0)
+ return -EINVAL;
+
+ /* From this point forward we keep holding onto the tasklist lock
+ * so that our parent does not change from under us. -DaveM
+ */
+ write_lock_irq(&tasklist_lock);
+
+ err = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto out;
+
+ err = -EINVAL;
+ if (!thread_group_leader(p))
+ goto out;
+
+ if (same_thread_group(p->real_parent, group_leader)) {
+ err = -EPERM;
+ if (task_session(p) != task_session(group_leader))
+ goto out;
+ err = -EACCES;
+ if (p->did_exec)
+ goto out;
+ } else {
+ err = -ESRCH;
+ if (p != group_leader)
+ goto out;
+ }
+
+ err = -EPERM;
+ if (p->signal->leader)
+ goto out;
+
+ pgrp = task_pid(p);
+ if (pgid != pid) {
+ struct task_struct *g;
+
+ pgrp = find_vpid(pgid);
+ g = pid_task(pgrp, PIDTYPE_PGID);
+ if (!g || task_session(g) != task_session(group_leader))
+ goto out;
+ }
+
+ err = security_task_setpgid(p, pgid);
+ if (err)
+ goto out;
+
+ if (task_pgrp(p) != pgrp) {
+ change_pid(p, PIDTYPE_PGID, pgrp);
+ set_task_pgrp(p, pid_nr(pgrp));
+ }
+
+ err = 0;
+out:
+ /* All paths lead to here, thus we are safe. -DaveM */
+ write_unlock_irq(&tasklist_lock);
+ return err;
+}
+
+SYSCALL_DEFINE1(getpgid, pid_t, pid)
+{
+ struct task_struct *p;
+ struct pid *grp;
+ int retval;
+
+ rcu_read_lock();
+ if (!pid)
+ grp = task_pgrp(current);
+ else {
+ retval = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto out;
+ grp = task_pgrp(p);
+ if (!grp)
+ goto out;
+
+ retval = security_task_getpgid(p);
+ if (retval)
+ goto out;
+ }
+ retval = pid_vnr(grp);
+out:
+ rcu_read_unlock();
+ return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_GETPGRP
+
+SYSCALL_DEFINE0(getpgrp)
+{
+ return sys_getpgid(0);
+}
+
+#endif
+
+SYSCALL_DEFINE1(getsid, pid_t, pid)
+{
+ struct task_struct *p;
+ struct pid *sid;
+ int retval;
+
+ rcu_read_lock();
+ if (!pid)
+ sid = task_session(current);
+ else {
+ retval = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto out;
+ sid = task_session(p);
+ if (!sid)
+ goto out;
+
+ retval = security_task_getsid(p);
+ if (retval)
+ goto out;
+ }
+ retval = pid_vnr(sid);
+out:
+ rcu_read_unlock();
+ return retval;
+}
+
+SYSCALL_DEFINE0(setsid)
+{
+ struct task_struct *group_leader = current->group_leader;
+ struct pid *sid = task_pid(group_leader);
+ pid_t session = pid_vnr(sid);
+ int err = -EPERM;
+
+ write_lock_irq(&tasklist_lock);
+ /* Fail if I am already a session leader */
+ if (group_leader->signal->leader)
+ goto out;
+
+ /* Fail if a process group id already exists that equals the
+ * proposed session id.
+ */
+ if (pid_task(sid, PIDTYPE_PGID))
+ goto out;
+
+ group_leader->signal->leader = 1;
+ __set_special_pids(sid);
+
+ proc_clear_tty(group_leader);
+
+ err = session;
+out:
+ write_unlock_irq(&tasklist_lock);
+ return err;
+}
+
+/*
+ * Supplementary group IDs
+ */
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
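+/*
+ * A group_info stores gids in fixed-size blocks: sets of up to
+ * NGROUPS_SMALL fit in the embedded small_block array, while larger sets
+ * get one page per NGROUPS_PER_BLOCK gids, referenced from blocks[].
+ */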
+struct group_info *groups_alloc(int gidsetsize)
+{
+ struct group_info *group_info;
+ int nblocks;
+ int i;
+
+ nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+ /* Make sure we always allocate at least one indirect block pointer */
+ nblocks = nblocks ? : 1;
+ group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+ if (!group_info)
+ return NULL;
+ group_info->ngroups = gidsetsize;
+ group_info->nblocks = nblocks;
+ atomic_set(&group_info->usage, 1);
+
+ if (gidsetsize <= NGROUPS_SMALL)
+ group_info->blocks[0] = group_info->small_block;
+ else {
+ for (i = 0; i < nblocks; i++) {
+ gid_t *b;
+ b = (void *)__get_free_page(GFP_USER);
+ if (!b)
+ goto out_undo_partial_alloc;
+ group_info->blocks[i] = b;
+ }
+ }
+ return group_info;
+
+out_undo_partial_alloc:
+ while (--i >= 0) {
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+ return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+ if (group_info->blocks[0] != group_info->small_block) {
+ int i;
+ for (i = 0; i < group_info->nblocks; i++)
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+ struct group_info *group_info)
+{
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < group_info->nblocks; i++) {
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
+
+ if (copy_to_user(grouplist, group_info->blocks[i], len))
+ return -EFAULT;
+
+ grouplist += NGROUPS_PER_BLOCK;
+ count -= cp_count;
+ }
+ return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+ gid_t __user *grouplist)
+{
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < group_info->nblocks; i++) {
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
+
+ if (copy_from_user(group_info->blocks[i], grouplist, len))
+ return -EFAULT;
+
+ grouplist += NGROUPS_PER_BLOCK;
+ count -= cp_count;
+ }
+ return 0;
+}
+
+/* a simple Shell sort */
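+/* The stride (gap) sequence is 1, 4, 13, 40, ...: grown as 3*stride + 1,
+ * then walked back down by dividing by 3. */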
+static void groups_sort(struct group_info *group_info)
+{
+ int base, max, stride;
+ int gidsetsize = group_info->ngroups;
+
+ for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+ ; /* nothing */
+ stride /= 3;
+
+ while (stride) {
+ max = gidsetsize - stride;
+ for (base = 0; base < max; base++) {
+ int left = base;
+ int right = left + stride;
+ gid_t tmp = GROUP_AT(group_info, right);
+
+ while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+ GROUP_AT(group_info, right) =
+ GROUP_AT(group_info, left);
+ right = left;
+ left -= stride;
+ }
+ GROUP_AT(group_info, right) = tmp;
+ }
+ stride /= 3;
+ }
+}
+
+/* a simple bsearch */
+int groups_search(struct group_info *group_info, gid_t grp)
+{
+ unsigned int left, right;
+
+ if (!group_info)
+ return 0;
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ unsigned int mid = (left+right)/2;
+ int cmp = grp - GROUP_AT(group_info, mid);
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+/* validate and set current->group_info */
+int set_current_groups(struct group_info *group_info)
+{
+ int retval;
+ struct group_info *old_info;
+
+ retval = security_task_setgroups(group_info);
+ if (retval)
+ return retval;
+
+ groups_sort(group_info);
+ get_group_info(group_info);
+
+ task_lock(current);
+ old_info = current->group_info;
+ current->group_info = group_info;
+ task_unlock(current);
+
+ put_group_info(old_info);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ int i = 0;
+
+ /*
+ * SMP: Nobody else can change our grouplist. Thus we are
+ * safe.
+ */
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+
+ /* no need to grab task_lock here; it cannot change */
+ i = current->group_info->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize) {
+ i = -EINVAL;
+ goto out;
+ }
+ if (groups_to_user(grouplist, current->group_info)) {
+ i = -EFAULT;
+ goto out;
+ }
+ }
+out:
+ return i;
+}
+
+/*
+ * SMP: Our groups are copy-on-write. We can set them safely
+ * without another task interfering.
+ */
+
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ struct group_info *group_info;
+ int retval;
+
+ if (!capable(CAP_SETGID))
+ return -EPERM;
+ if ((unsigned)gidsetsize > NGROUPS_MAX)
+ return -EINVAL;
+
+ group_info = groups_alloc(gidsetsize);
+ if (!group_info)
+ return -ENOMEM;
+ retval = groups_from_user(group_info, grouplist);
+ if (retval) {
+ put_group_info(group_info);
+ return retval;
+ }
+
+ retval = set_current_groups(group_info);
+ put_group_info(group_info);
+
+ return retval;
+}
+
+/*
+ * Check whether the gid matches our fsgid/egid or is in the supplementary group list.
+ */
+int in_group_p(gid_t grp)
+{
+ int retval = 1;
+ if (grp != current->fsgid)
+ retval = groups_search(current->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(gid_t grp)
+{
+ int retval = 1;
+ if (grp != current->egid)
+ retval = groups_search(current->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
+
+DECLARE_RWSEM(uts_sem);
+
+SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
+{
+ int errno = 0;
+
+ down_read(&uts_sem);
+ if (copy_to_user(name, utsname(), sizeof *name))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+}
+
+SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
+{
+ int errno;
+ char tmp[__NEW_UTS_LEN];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(tmp, name, len)) {
+ struct new_utsname *u = utsname();
+
+ memcpy(u->nodename, tmp, len);
+ memset(u->nodename + len, 0, sizeof(u->nodename) - len);
+ errno = 0;
+ }
+ up_write(&uts_sem);
+ return errno;
+}
+
+#ifdef __ARCH_WANT_SYS_GETHOSTNAME
+
+SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
+{
+ int i, errno;
+ struct new_utsname *u;
+
+ if (len < 0)
+ return -EINVAL;
+ down_read(&uts_sem);
+ u = utsname();
+ i = 1 + strlen(u->nodename);
+ if (i > len)
+ i = len;
+ errno = 0;
+ if (copy_to_user(name, u->nodename, i))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+}
+
+#endif
+
+/*
+ * Only setdomainname; getdomainname can be implemented by calling
+ * uname()
+ */
+SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
+{
+ int errno;
+ char tmp[__NEW_UTS_LEN];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(tmp, name, len)) {
+ struct new_utsname *u = utsname();
+
+ memcpy(u->domainname, tmp, len);
+ memset(u->domainname + len, 0, sizeof(u->domainname) - len);
+ errno = 0;
+ }
+ up_write(&uts_sem);
+ return errno;
+}
+
+SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ else {
+ struct rlimit value;
+ task_lock(current->group_leader);
+ value = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+ }
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
+
+/*
+ * Backward compatibility for getrlimit(). Needed for some apps.
+ */
+
+SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct rlimit __user *, rlim)
+{
+ struct rlimit x;
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ task_lock(current->group_leader);
+ x = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ if (x.rlim_cur > 0x7FFFFFFF)
+ x.rlim_cur = 0x7FFFFFFF;
+ if (x.rlim_max > 0x7FFFFFFF)
+ x.rlim_max = 0x7FFFFFFF;
+ return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+}
+
+#endif
+
+SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+ struct rlimit new_rlim, *old_rlim;
+ int retval;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+ return -EFAULT;
+ if (new_rlim.rlim_cur > new_rlim.rlim_max)
+ return -EINVAL;
+ old_rlim = current->signal->rlim + resource;
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+ !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+ if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
+ return -EPERM;
+
+ retval = security_task_setrlimit(resource, &new_rlim);
+ if (retval)
+ return retval;
+
+ if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim.rlim_cur = 1;
+ }
+
+ task_lock(current->group_leader);
+ *old_rlim = new_rlim;
+ task_unlock(current->group_leader);
+
+ if (resource != RLIMIT_CPU)
+ goto out;
+
+ /*
+ * RLIMIT_CPU handling. Note that the kernel fails to return an error
+ * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
+ * very long-standing error, and fixing it now risks breakage of
+ * applications, so we live with it
+ */
+ if (new_rlim.rlim_cur == RLIM_INFINITY)
+ goto out;
+
+ update_rlimit_cpu(new_rlim.rlim_cur);
+out:
+ return 0;
+}
+
+/*
+ * It would make sense to put struct rusage in the task_struct,
+ * except that would make the task_struct be *really big*. After
+ * task_struct gets moved into malloc'ed memory, it would
+ * make sense to do this. It will make moving the rest of the information
+ * a lot simpler! (Which we're not doing right now because we're not
+ * measuring them yet).
+ *
+ * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
+ * races with threads incrementing their own counters. But since word
+ * reads are atomic, we either get new values or old values and we don't
+ * care which for the sums. We always take the siglock to protect reading
+ * the c* fields from p->signal from races with exit.c updating those
+ * fields when reaping, so a sample either gets all the additions of a
+ * given child after it's reaped, or none so this sample is before reaping.
+ *
+ * Locking:
+ * We need to take the siglock for CHILDREN, SELF and BOTH
+ * for the cases of current multithreaded, non-current single-threaded,
+ * and non-current multithreaded. Thread traversal is now safe with
+ * the siglock held.
+ * Strictly speaking, we do not need to take the siglock if we are current and
+ * single threaded, as no one else can take our signal_struct away, no one
+ * else can reap the children to update signal->c* counters, and no one else
+ * can race with the signal-> fields. If we do not take any lock, the
+ * signal-> fields could be read out of order while another thread was just
+ * exiting. So we should place a read memory barrier when we avoid the lock.
+ * On the writer side, write memory barrier is implied in __exit_signal
+ * as __exit_signal releases the siglock spinlock after updating the signal->
+ * fields. But we don't do this yet to keep things simple.
+ *
+ */
+
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
+{
+ r->ru_nvcsw += t->nvcsw;
+ r->ru_nivcsw += t->nivcsw;
+ r->ru_minflt += t->min_flt;
+ r->ru_majflt += t->maj_flt;
+ r->ru_inblock += task_io_get_inblock(t);
+ r->ru_oublock += task_io_get_oublock(t);
+}
+
+static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+{
+ struct task_struct *t;
+ unsigned long flags;
+ cputime_t utime, stime;
+ struct task_cputime cputime;
+
+ memset((char *) r, 0, sizeof *r);
+ utime = stime = cputime_zero;
+
+ if (who == RUSAGE_THREAD) {
+ utime = task_utime(current);
+ stime = task_stime(current);
+ accumulate_thread_rusage(p, r);
+ goto out;
+ }
+
+ if (!lock_task_sighand(p, &flags))
+ return;
+
+ switch (who) {
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+
+ if (who == RUSAGE_CHILDREN)
+ break;
+
+ case RUSAGE_SELF:
+ thread_group_cputime(p, &cputime);
+ utime = cputime_add(utime, cputime.utime);
+ stime = cputime_add(stime, cputime.stime);
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ t = next_thread(t);
+ } while (t != p);
+ break;
+
+ default:
+ BUG();
+ }
+ unlock_task_sighand(p, &flags);
+
+out:
+ cputime_to_timeval(utime, &r->ru_utime);
+ cputime_to_timeval(stime, &r->ru_stime);
+}
+
+int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+{
+ struct rusage r;
+ k_getrusage(p, who, &r);
+ return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
+{
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+ who != RUSAGE_THREAD)
+ return -EINVAL;
+ return getrusage(current, who, ru);
+}
+
+SYSCALL_DEFINE1(umask, int, mask)
+{
+ mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
+ return mask;
+}
+
+SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+ unsigned long, arg4, unsigned long, arg5)
+{
+ long error = 0;
+
+ if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
+ return error;
+
+ switch (option) {
+ case PR_SET_PDEATHSIG:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
+ break;
+ }
+ current->pdeath_signal = arg2;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(current->pdeath_signal, (int __user *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ error = get_dumpable(current->mm);
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 < 0 || arg2 > 1) {
+ error = -EINVAL;
+ break;
+ }
+ set_dumpable(current->mm, arg2);
+ break;
+
+ case PR_SET_UNALIGN:
+ error = SET_UNALIGN_CTL(current, arg2);
+ break;
+ case PR_GET_UNALIGN:
+ error = GET_UNALIGN_CTL(current, arg2);
+ break;
+ case PR_SET_FPEMU:
+ error = SET_FPEMU_CTL(current, arg2);
+ break;
+ case PR_GET_FPEMU:
+ error = GET_FPEMU_CTL(current, arg2);
+ break;
+ case PR_SET_FPEXC:
+ error = SET_FPEXC_CTL(current, arg2);
+ break;
+ case PR_GET_FPEXC:
+ error = GET_FPEXC_CTL(current, arg2);
+ break;
+ case PR_GET_TIMING:
+ error = PR_TIMING_STATISTICAL;
+ break;
+ case PR_SET_TIMING:
+ if (arg2 != PR_TIMING_STATISTICAL)
+ error = -EINVAL;
+ break;
+
+ case PR_SET_NAME: {
+ struct task_struct *me = current;
+ unsigned char ncomm[sizeof(me->comm)];
+
+ ncomm[sizeof(me->comm)-1] = 0;
+ if (strncpy_from_user(ncomm, (char __user *)arg2,
+ sizeof(me->comm)-1) < 0)
+ return -EFAULT;
+ set_task_comm(me, ncomm);
+ return 0;
+ }
+ case PR_GET_NAME: {
+ struct task_struct *me = current;
+ unsigned char tcomm[sizeof(me->comm)];
+
+ get_task_comm(tcomm, me);
+ if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+ return -EFAULT;
+ return 0;
+ }
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(current, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(current, arg2);
+ break;
+
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+ break;
+ case PR_SET_TSC:
+ error = SET_TSC_CTL(arg2);
+ break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
+ current->default_timer_slack_ns;
+ else
+ current->timer_slack_ns = arg2;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ return error;
+}
+
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
+ struct getcpu_cache __user *, unused)
+{
+ int err = 0;
+ int cpu = raw_smp_processor_id();
+ if (cpup)
+ err |= put_user(cpu, cpup);
+ if (nodep)
+ err |= put_user(cpu_to_node(cpu), nodep);
+ return err ? -EFAULT : 0;
+}
+
+char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+
+static void argv_cleanup(char **argv, char **envp)
+{
+ argv_free(argv);
+}
+
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails and @force is set, it forces an immediate power off.
+ */
+int orderly_poweroff(bool force)
+{
+ int argc;
+ char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
+ static char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL
+ };
+ int ret = -ENOMEM;
+ struct subprocess_info *info;
+
+ if (argv == NULL) {
+ printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
+ __func__, poweroff_cmd);
+ goto out;
+ }
+
+ info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
+ if (info == NULL) {
+ argv_free(argv);
+ goto out;
+ }
+
+ call_usermodehelper_setcleanup(info, argv_cleanup);
+
+ ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
+
+ out:
+ if (ret && force) {
+ printk(KERN_WARNING "Failed to start orderly shutdown: "
+ "forcing the issue\n");
+
+ /* I guess this should try to kick off some daemon to
+ sync and poweroff asap. Or not even bother syncing
+ if we're doing an emergency shutdown? */
+ emergency_sync();
+ kernel_power_off();
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
new file mode 100644
index 0000000..27dad29
--- /dev/null
+++ b/kernel/sys_ni.c
@@ -0,0 +1,177 @@
+
+#include <linux/linkage.h>
+#include <linux/errno.h>
+
+#include <asm/unistd.h>
+
+/* we can't #include <linux/syscalls.h> here,
+ but tell gcc to not warn with -Wmissing-prototypes */
+asmlinkage long sys_ni_syscall(void);
+
+/*
+ * Non-implemented system calls get redirected here.
+ */
+asmlinkage long sys_ni_syscall(void)
+{
+ return -ENOSYS;
+}
+
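+/*
+ * Each cond_syscall() entry below provides a weak default that resolves to
+ * sys_ni_syscall, so a syscall that is configured out of the build still
+ * links and simply returns -ENOSYS.
+ */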
+cond_syscall(sys_nfsservctl);
+cond_syscall(sys_quotactl);
+cond_syscall(sys32_quotactl);
+cond_syscall(sys_acct);
+cond_syscall(sys_lookup_dcookie);
+cond_syscall(sys_swapon);
+cond_syscall(sys_swapoff);
+cond_syscall(sys_kexec_load);
+cond_syscall(compat_sys_kexec_load);
+cond_syscall(sys_init_module);
+cond_syscall(sys_delete_module);
+cond_syscall(sys_socketpair);
+cond_syscall(sys_bind);
+cond_syscall(sys_listen);
+cond_syscall(sys_accept);
+cond_syscall(sys_accept4);
+cond_syscall(sys_connect);
+cond_syscall(sys_getsockname);
+cond_syscall(sys_getpeername);
+cond_syscall(sys_sendto);
+cond_syscall(sys_send);
+cond_syscall(sys_recvfrom);
+cond_syscall(sys_recv);
+cond_syscall(sys_socket);
+cond_syscall(sys_setsockopt);
+cond_syscall(compat_sys_setsockopt);
+cond_syscall(sys_getsockopt);
+cond_syscall(compat_sys_getsockopt);
+cond_syscall(sys_shutdown);
+cond_syscall(sys_sendmsg);
+cond_syscall(compat_sys_sendmsg);
+cond_syscall(sys_recvmsg);
+cond_syscall(compat_sys_recvmsg);
+cond_syscall(sys_socketcall);
+cond_syscall(sys_futex);
+cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
+cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_create1);
+cond_syscall(sys_epoll_ctl);
+cond_syscall(sys_epoll_wait);
+cond_syscall(sys_epoll_pwait);
+cond_syscall(compat_sys_epoll_pwait);
+cond_syscall(sys_semget);
+cond_syscall(sys_semop);
+cond_syscall(sys_semtimedop);
+cond_syscall(sys_semctl);
+cond_syscall(sys_msgget);
+cond_syscall(sys_msgsnd);
+cond_syscall(sys_msgrcv);
+cond_syscall(sys_msgctl);
+cond_syscall(sys_shmget);
+cond_syscall(sys_shmat);
+cond_syscall(sys_shmdt);
+cond_syscall(sys_shmctl);
+cond_syscall(sys_mq_open);
+cond_syscall(sys_mq_unlink);
+cond_syscall(sys_mq_timedsend);
+cond_syscall(sys_mq_timedreceive);
+cond_syscall(sys_mq_notify);
+cond_syscall(sys_mq_getsetattr);
+cond_syscall(compat_sys_mq_open);
+cond_syscall(compat_sys_mq_timedsend);
+cond_syscall(compat_sys_mq_timedreceive);
+cond_syscall(compat_sys_mq_notify);
+cond_syscall(compat_sys_mq_getsetattr);
+cond_syscall(sys_mbind);
+cond_syscall(sys_get_mempolicy);
+cond_syscall(sys_set_mempolicy);
+cond_syscall(compat_sys_mbind);
+cond_syscall(compat_sys_get_mempolicy);
+cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_add_key);
+cond_syscall(sys_request_key);
+cond_syscall(sys_keyctl);
+cond_syscall(compat_sys_keyctl);
+cond_syscall(compat_sys_socketcall);
+cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_init1);
+cond_syscall(sys_inotify_add_watch);
+cond_syscall(sys_inotify_rm_watch);
+cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
+cond_syscall(sys_chown16);
+cond_syscall(sys_fchown16);
+cond_syscall(sys_getegid16);
+cond_syscall(sys_geteuid16);
+cond_syscall(sys_getgid16);
+cond_syscall(sys_getgroups16);
+cond_syscall(sys_getresgid16);
+cond_syscall(sys_getresuid16);
+cond_syscall(sys_getuid16);
+cond_syscall(sys_lchown16);
+cond_syscall(sys_setfsgid16);
+cond_syscall(sys_setfsuid16);
+cond_syscall(sys_setgid16);
+cond_syscall(sys_setgroups16);
+cond_syscall(sys_setregid16);
+cond_syscall(sys_setresgid16);
+cond_syscall(sys_setresuid16);
+cond_syscall(sys_setreuid16);
+cond_syscall(sys_setuid16);
+cond_syscall(sys_vm86old);
+cond_syscall(sys_vm86);
+cond_syscall(compat_sys_ipc);
+cond_syscall(compat_sys_sysctl);
+cond_syscall(sys_flock);
+cond_syscall(sys_io_setup);
+cond_syscall(sys_io_destroy);
+cond_syscall(sys_io_submit);
+cond_syscall(sys_io_cancel);
+cond_syscall(sys_io_getevents);
+cond_syscall(sys_syslog);
+
+/* arch-specific weak syscall entries */
+cond_syscall(sys_pciconfig_read);
+cond_syscall(sys_pciconfig_write);
+cond_syscall(sys_pciconfig_iobase);
+cond_syscall(sys32_ipc);
+cond_syscall(sys32_sysctl);
+cond_syscall(ppc_rtas);
+cond_syscall(sys_spu_run);
+cond_syscall(sys_spu_create);
+cond_syscall(sys_subpage_prot);
+
+/* MMU-dependent weak syscall entries */
+cond_syscall(sys_mprotect);
+cond_syscall(sys_msync);
+cond_syscall(sys_mlock);
+cond_syscall(sys_munlock);
+cond_syscall(sys_mlockall);
+cond_syscall(sys_munlockall);
+cond_syscall(sys_mincore);
+cond_syscall(sys_madvise);
+cond_syscall(sys_mremap);
+cond_syscall(sys_remap_file_pages);
+cond_syscall(compat_sys_move_pages);
+cond_syscall(compat_sys_migrate_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
+
+/* New file descriptors */
+cond_syscall(sys_signalfd);
+cond_syscall(sys_signalfd4);
+cond_syscall(compat_sys_signalfd);
+cond_syscall(compat_sys_signalfd4);
+cond_syscall(sys_timerfd_create);
+cond_syscall(sys_timerfd_settime);
+cond_syscall(sys_timerfd_gettime);
+cond_syscall(compat_sys_timerfd_settime);
+cond_syscall(compat_sys_timerfd_gettime);
+cond_syscall(sys_eventfd);
+cond_syscall(sys_eventfd2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
new file mode 100644
index 0000000..68fd128
--- /dev/null
+++ b/kernel/sysctl.c
@@ -0,0 +1,3031 @@
+/*
+ * sysctl.c: General linux system control interface
+ *
+ * Begun 24 March 1995, Stephen Tweedie
+ * Added /proc support, Dec 1995
+ * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+ * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
+ * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
+ * Dynamic registration fixes, Stephen Tweedie.
+ * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
+ * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
+ * Horn.
+ * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
+ * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
+ * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
+ * Wendling.
+ * The list_for_each() macro wasn't appropriate for the sysctl loop.
+ * Removed it and replaced it with older style, 03/23/00, Bill Wendling
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/ctype.h>
+#include <linux/utsname.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/net.h>
+#include <linux/sysrq.h>
+#include <linux/highuid.h>
+#include <linux/writeback.h>
+#include <linux/hugetlb.h>
+#include <linux/initrd.h>
+#include <linux/key.h>
+#include <linux/times.h>
+#include <linux/limits.h>
+#include <linux/dcache.h>
+#include <linux/syscalls.h>
+#include <linux/vmstat.h>
+#include <linux/nfs_fs.h>
+#include <linux/acpi.h>
+#include <linux/reboot.h>
+#include <linux/ftrace.h>
+
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#include <asm/stacktrace.h>
+#include <asm/io.h>
+#endif
+
+static int deprecated_sysctl_warning(struct __sysctl_args *args);
+
+#if defined(CONFIG_SYSCTL)
+
+/* External variables not in a header file. */
+extern int C_A_D;
+extern int print_fatal_signals;
+extern int sysctl_overcommit_memory;
+extern int sysctl_overcommit_ratio;
+extern int sysctl_panic_on_oom;
+extern int sysctl_oom_kill_allocating_task;
+extern int sysctl_oom_dump_tasks;
+extern int max_threads;
+extern int core_uses_pid;
+extern int suid_dumpable;
+extern char core_pattern[];
+extern int pid_max;
+extern int min_free_kbytes;
+extern int pid_max_min, pid_max_max;
+extern int sysctl_drop_caches;
+extern int percpu_pagelist_fraction;
+extern int compat_log;
+extern int latencytop_enabled;
+extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifdef CONFIG_RCU_TORTURE_TEST
+extern int rcutorture_runnable;
+#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+
+/* Constants used for minimum and maximum */
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
+static int one = 1;
+#endif
+
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+static int sixty = 60;
+static int neg_one = -1;
+#endif
+
+#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
+static int two = 2;
+#endif
+
+static int zero;
+static int one_hundred = 100;
+
+/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+static int maxolduid = 65535;
+static int minolduid;
+static int min_percpu_pagelist_fract = 8;
+
+static int ngroups_max = NGROUPS_MAX;
+
+#ifdef CONFIG_MODULES
+extern char modprobe_path[];
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+extern int sg_big_buff;
+#endif
+
+#ifdef CONFIG_SPARC
+#include <asm/system.h>
+#endif
+
+#ifdef __hppa__
+extern int pwrsw_enabled;
+extern int unaligned_enabled;
+#endif
+
+#ifdef CONFIG_S390
+#ifdef CONFIG_MATHEMU
+extern int sysctl_ieee_emulation_warnings;
+#endif
+extern int sysctl_userprocess_debug;
+extern int spin_retry;
+#endif
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+extern int acct_parm[];
+#endif
+
+#ifdef CONFIG_IA64
+extern int no_unaligned_warning;
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+extern int max_lock_depth;
+#endif
+
+#ifdef CONFIG_PROC_SYSCTL
+static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
+static struct ctl_table root_table[];
+static struct ctl_table_root sysctl_table_root;
+static struct ctl_table_header root_table_header = {
+ .count = 1,
+ .ctl_table = root_table,
+ .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
+ .root = &sysctl_table_root,
+ .set = &sysctl_table_root.default_set,
+};
+static struct ctl_table_root sysctl_table_root = {
+ .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
+ .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+};
+
+static struct ctl_table kern_table[];
+static struct ctl_table vm_table[];
+static struct ctl_table fs_table[];
+static struct ctl_table debug_table[];
+static struct ctl_table dev_table[];
+extern struct ctl_table random_table[];
+#ifdef CONFIG_INOTIFY_USER
+extern struct ctl_table inotify_table[];
+#endif
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
+
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+int sysctl_legacy_va_layout;
+#endif
+
+extern int prove_locking;
+extern int lock_stat;
+
+/* The default sysctl tables: */
+
+static struct ctl_table root_table[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = kern_table,
+ },
+ {
+ .ctl_name = CTL_VM,
+ .procname = "vm",
+ .mode = 0555,
+ .child = vm_table,
+ },
+ {
+ .ctl_name = CTL_FS,
+ .procname = "fs",
+ .mode = 0555,
+ .child = fs_table,
+ },
+ {
+ .ctl_name = CTL_DEBUG,
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ {
+ .ctl_name = CTL_DEV,
+ .procname = "dev",
+ .mode = 0555,
+ .child = dev_table,
+ },
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
+ { .ctl_name = 0 }
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+static int min_sched_granularity_ns = 100000; /* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_wakeup_granularity_ns; /* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#endif
+
+static struct ctl_table kern_table[] = {
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_min_granularity_ns",
+ .data = &sysctl_sched_min_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_nr_latency_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_latency_ns",
+ .data = &sysctl_sched_latency,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_nr_latency_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_wakeup_granularity_ns",
+ .data = &sysctl_sched_wakeup_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_wakeup_granularity_ns,
+ .extra2 = &max_wakeup_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_shares_ratelimit",
+ .data = &sysctl_sched_shares_ratelimit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_shares_thresh",
+ .data = &sysctl_sched_shares_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_features",
+ .data = &sysctl_sched_features,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_migration_cost",
+ .data = &sysctl_sched_migration_cost,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_nr_migrate",
+ .data = &sysctl_sched_nr_migrate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_rt_handler,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &sched_rt_handler,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_compat_yield",
+ .data = &sysctl_sched_compat_yield,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_LOCK_STAT
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = KERN_PANIC,
+ .procname = "panic",
+ .data = &panic_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_CORE_USES_PID,
+ .procname = "core_uses_pid",
+ .data = &core_uses_pid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_CORE_PATTERN,
+ .procname = "core_pattern",
+ .data = core_pattern,
+ .maxlen = CORENAME_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "tainted",
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_taint,
+ },
+#endif
+#ifdef CONFIG_LATENCYTOP
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_BLK_DEV_INITRD
+ {
+ .ctl_name = KERN_REALROOTDEV,
+ .procname = "real-root-dev",
+ .data = &real_root_dev,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "print-fatal-signals",
+ .data = &print_fatal_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#ifdef CONFIG_SPARC
+ {
+ .ctl_name = KERN_SPARC_REBOOT,
+ .procname = "reboot-cmd",
+ .data = reboot_command,
+ .maxlen = 256,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+ {
+ .ctl_name = KERN_SPARC_STOP_A,
+ .procname = "stop-a",
+ .data = &stop_a_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_SPARC_SCONS_PWROFF,
+ .procname = "scons-poweroff",
+ .data = &scons_pwroff,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef __hppa__
+ {
+ .ctl_name = KERN_HPPA_PWRSW,
+ .procname = "soft-power",
+ .data = &pwrsw_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_HPPA_UNALIGNED,
+ .procname = "unaligned-trap",
+ .data = &unaligned_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = KERN_CTLALTDEL,
+ .procname = "ctrl-alt-del",
+ .data = &C_A_D,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#ifdef CONFIG_FUNCTION_TRACER
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ftrace_enabled",
+ .data = &ftrace_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ftrace_enable_sysctl,
+ },
+#endif
+#ifdef CONFIG_MODULES
+ {
+ .ctl_name = KERN_MODPROBE,
+ .procname = "modprobe",
+ .data = &modprobe_path,
+ .maxlen = KMOD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+#endif
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+ {
+ .ctl_name = KERN_HOTPLUG,
+ .procname = "hotplug",
+ .data = &uevent_helper,
+ .maxlen = UEVENT_HELPER_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+ {
+ .ctl_name = KERN_SG_BIG_BUFF,
+ .procname = "sg-big-buff",
+ .data = &sg_big_buff,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+ {
+ .ctl_name = KERN_ACCT,
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+ {
+ .ctl_name = KERN_SYSRQ,
+ .procname = "sysrq",
+ .data = &__sysrq_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "cad_pid",
+ .data = NULL,
+ .maxlen = sizeof (int),
+ .mode = 0600,
+ .proc_handler = &proc_do_cad_pid,
+ },
+#endif
+ {
+ .ctl_name = KERN_MAX_THREADS,
+ .procname = "threads-max",
+ .data = &max_threads,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_RANDOM,
+ .procname = "random",
+ .mode = 0555,
+ .child = random_table,
+ },
+ {
+ .ctl_name = KERN_OVERFLOWUID,
+ .procname = "overflowuid",
+ .data = &overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+ {
+ .ctl_name = KERN_OVERFLOWGID,
+ .procname = "overflowgid",
+ .data = &overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+#ifdef CONFIG_S390
+#ifdef CONFIG_MATHEMU
+ {
+ .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
+ .procname = "ieee_emulation_warnings",
+ .data = &sysctl_ieee_emulation_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
+ .procname = "userprocess_debug",
+ .data = &sysctl_userprocess_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = KERN_PIDMAX,
+ .procname = "pid_max",
+ .data = &pid_max,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = sysctl_intvec,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+ {
+ .ctl_name = KERN_PANIC_ON_OOPS,
+ .procname = "panic_on_oops",
+ .data = &panic_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#if defined CONFIG_PRINTK
+ {
+ .ctl_name = KERN_PRINTK,
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_PRINTK_RATELIMIT,
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = KERN_NGROUPS_MAX,
+ .procname = "ngroups_max",
+ .data = &ngroups_max,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+ {
+ .ctl_name = KERN_UNKNOWN_NMI_PANIC,
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_nmi_enabled,
+ },
+#endif
+#if defined(CONFIG_X86)
+ {
+ .ctl_name = KERN_PANIC_ON_NMI,
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_BOOTLOADER_TYPE,
+ .procname = "bootloader_type",
+ .data = &bootloader_type,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kstack_depth_to_print",
+ .data = &kstack_depth_to_print,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "io_delay_type",
+ .data = &io_delay_type,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_MMU)
+ {
+ .ctl_name = KERN_RANDOMIZE,
+ .procname = "randomize_va_space",
+ .data = &randomize_va_space,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_S390) && defined(CONFIG_SMP)
+ {
+ .ctl_name = KERN_SPIN_RETRY,
+ .procname = "spin_retry",
+ .data = &spin_retry,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
+ {
+ .procname = "acpi_video_flags",
+ .data = &acpi_realmode_flags,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif
+#ifdef CONFIG_IA64
+ {
+ .ctl_name = KERN_IA64_UNALIGNED,
+ .procname = "ignore-unaligned-usertrap",
+ .data = &no_unaligned_warning,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "softlockup_panic",
+ .data = &softlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "softlockup_thresh",
+ .data = &softlockup_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &neg_one,
+ .extra2 = &sixty,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+#endif
+#ifdef CONFIG_COMPAT
+ {
+ .ctl_name = KERN_COMPAT_LOG,
+ .procname = "compat-log",
+ .data = &compat_log,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_RT_MUTEXES
+ {
+ .ctl_name = KERN_MAX_LOCK_DEPTH,
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "poweroff_cmd",
+ .data = &poweroff_cmd,
+ .maxlen = POWEROFF_CMD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+#ifdef CONFIG_KEYS
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "keys",
+ .mode = 0555,
+ .child = key_sysctls,
+ },
+#endif
+#ifdef CONFIG_RCU_TORTURE_TEST
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "rcutorture_runnable",
+ .data = &rcutorture_runnable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "scan_unevictable_pages",
+ .data = &scan_unevictable_pages,
+ .maxlen = sizeof(scan_unevictable_pages),
+ .mode = 0644,
+ .proc_handler = &scan_unevictable_handler,
+ },
+#endif
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table vm_table[] = {
+ {
+ .ctl_name = VM_OVERCOMMIT_MEMORY,
+ .procname = "overcommit_memory",
+ .data = &sysctl_overcommit_memory,
+ .maxlen = sizeof(sysctl_overcommit_memory),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = VM_PANIC_ON_OOM,
+ .procname = "panic_on_oom",
+ .data = &sysctl_panic_on_oom,
+ .maxlen = sizeof(sysctl_panic_on_oom),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "oom_kill_allocating_task",
+ .data = &sysctl_oom_kill_allocating_task,
+ .maxlen = sizeof(sysctl_oom_kill_allocating_task),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "oom_dump_tasks",
+ .data = &sysctl_oom_dump_tasks,
+ .maxlen = sizeof(sysctl_oom_dump_tasks),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = VM_OVERCOMMIT_RATIO,
+ .procname = "overcommit_ratio",
+ .data = &sysctl_overcommit_ratio,
+ .maxlen = sizeof(sysctl_overcommit_ratio),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = VM_PAGE_CLUSTER,
+ .procname = "page-cluster",
+ .data = &page_cluster,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = VM_DIRTY_BACKGROUND,
+ .procname = "dirty_background_ratio",
+ .data = &dirty_background_ratio,
+ .maxlen = sizeof(dirty_background_ratio),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .ctl_name = VM_DIRTY_RATIO,
+ .procname = "dirty_ratio",
+ .data = &vm_dirty_ratio,
+ .maxlen = sizeof(vm_dirty_ratio),
+ .mode = 0644,
+ .proc_handler = &dirty_ratio_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "dirty_writeback_centisecs",
+ .data = &dirty_writeback_interval,
+ .maxlen = sizeof(dirty_writeback_interval),
+ .mode = 0644,
+ .proc_handler = &dirty_writeback_centisecs_handler,
+ },
+ {
+ .procname = "dirty_expire_centisecs",
+ .data = &dirty_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_userhz_jiffies,
+ },
+ {
+ .ctl_name = VM_NR_PDFLUSH_THREADS,
+ .procname = "nr_pdflush_threads",
+ .data = &nr_pdflush_threads,
+ .maxlen = sizeof nr_pdflush_threads,
+ .mode = 0444 /* read-only */,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = VM_SWAPPINESS,
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#ifdef CONFIG_HUGETLB_PAGE
+ {
+ .procname = "nr_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &hugetlb_sysctl_handler,
+ .extra1 = (void *)&hugetlb_zero,
+ .extra2 = (void *)&hugetlb_infinity,
+ },
+ {
+ .ctl_name = VM_HUGETLB_GROUP,
+ .procname = "hugetlb_shm_group",
+ .data = &sysctl_hugetlb_shm_group,
+ .maxlen = sizeof(gid_t),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hugepages_treat_as_movable",
+ .data = &hugepages_treat_as_movable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &hugetlb_treat_movable_handler,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nr_overcommit_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &hugetlb_overcommit_handler,
+ .extra1 = (void *)&hugetlb_zero,
+ .extra2 = (void *)&hugetlb_infinity,
+ },
+#endif
+ {
+ .ctl_name = VM_LOWMEM_RESERVE_RATIO,
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
+ .mode = 0644,
+ .proc_handler = &lowmem_reserve_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_DROP_PAGECACHE,
+ .procname = "drop_caches",
+ .data = &sysctl_drop_caches,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = drop_caches_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_MIN_FREE_KBYTES,
+ .procname = "min_free_kbytes",
+ .data = &min_free_kbytes,
+ .maxlen = sizeof(min_free_kbytes),
+ .mode = 0644,
+ .proc_handler = &min_free_kbytes_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
+ .procname = "percpu_pagelist_fraction",
+ .data = &percpu_pagelist_fraction,
+ .maxlen = sizeof(percpu_pagelist_fraction),
+ .mode = 0644,
+ .proc_handler = &percpu_pagelist_fraction_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_percpu_pagelist_fract,
+ },
+#ifdef CONFIG_MMU
+ {
+ .ctl_name = VM_MAX_MAP_COUNT,
+ .procname = "max_map_count",
+ .data = &sysctl_max_map_count,
+ .maxlen = sizeof(sysctl_max_map_count),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#endif
+ {
+ .ctl_name = VM_LAPTOP_MODE,
+ .procname = "laptop_mode",
+ .data = &laptop_mode,
+ .maxlen = sizeof(laptop_mode),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = VM_BLOCK_DUMP,
+ .procname = "block_dump",
+ .data = &block_dump,
+ .maxlen = sizeof(block_dump),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = VM_VFS_CACHE_PRESSURE,
+ .procname = "vfs_cache_pressure",
+ .data = &sysctl_vfs_cache_pressure,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+ {
+ .ctl_name = VM_LEGACY_VA_LAYOUT,
+ .procname = "legacy_va_layout",
+ .data = &sysctl_legacy_va_layout,
+ .maxlen = sizeof(sysctl_legacy_va_layout),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .ctl_name = VM_ZONE_RECLAIM_MODE,
+ .procname = "zone_reclaim_mode",
+ .data = &zone_reclaim_mode,
+ .maxlen = sizeof(zone_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = VM_MIN_UNMAPPED,
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .ctl_name = VM_MIN_SLAB,
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
+#ifdef CONFIG_SMP
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+#endif
+#ifdef CONFIG_SECURITY
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "mmap_min_addr",
+ .data = &mmap_min_addr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "numa_zonelist_order",
+ .data = &numa_zonelist_order,
+ .maxlen = NUMA_ZONELIST_ORDER_LEN,
+ .mode = 0644,
+ .proc_handler = &numa_zonelist_order_handler,
+ .strategy = &sysctl_string,
+ },
+#endif
+#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML)) || \
+ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
+ {
+ .ctl_name = VM_VDSO_ENABLED,
+ .procname = "vdso_enabled",
+ .data = &vdso_enabled,
+ .maxlen = sizeof(vdso_enabled),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+#endif
+#ifdef CONFIG_HIGHMEM
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "highmem_is_dirtyable",
+ .data = &vm_highmem_is_dirtyable,
+ .maxlen = sizeof(vm_highmem_is_dirtyable),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
+ { .ctl_name = 0 }
+};
+
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+static struct ctl_table binfmt_misc_table[] = {
+ { .ctl_name = 0 }
+};
+#endif
+
+static struct ctl_table fs_table[] = {
+ {
+ .ctl_name = FS_NRINODE,
+ .procname = "inode-nr",
+ .data = &inodes_stat,
+ .maxlen = 2*sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = FS_STATINODE,
+ .procname = "inode-state",
+ .data = &inodes_stat,
+ .maxlen = 7*sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "file-nr",
+ .data = &files_stat,
+ .maxlen = 3*sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_nr_files,
+ },
+ {
+ .ctl_name = FS_MAXFILE,
+ .procname = "file-max",
+ .data = &files_stat.max_files,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nr_open",
+ .data = &sysctl_nr_open,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &sysctl_nr_open_min,
+ .extra2 = &sysctl_nr_open_max,
+ },
+ {
+ .ctl_name = FS_DENTRY,
+ .procname = "dentry-state",
+ .data = &dentry_stat,
+ .maxlen = 6*sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = FS_OVERFLOWUID,
+ .procname = "overflowuid",
+ .data = &fs_overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+ {
+ .ctl_name = FS_OVERFLOWGID,
+ .procname = "overflowgid",
+ .data = &fs_overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+#ifdef CONFIG_FILE_LOCKING
+ {
+ .ctl_name = FS_LEASES,
+ .procname = "leases-enable",
+ .data = &leases_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_DNOTIFY
+ {
+ .ctl_name = FS_DIR_NOTIFY,
+ .procname = "dir-notify-enable",
+ .data = &dir_notify_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_MMU
+#ifdef CONFIG_FILE_LOCKING
+ {
+ .ctl_name = FS_LEASE_TIME,
+ .procname = "lease-break-time",
+ .data = &lease_break_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+#endif
+#ifdef CONFIG_AIO
+ {
+ .procname = "aio-nr",
+ .data = &aio_nr,
+ .maxlen = sizeof(aio_nr),
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {
+ .procname = "aio-max-nr",
+ .data = &aio_max_nr,
+ .maxlen = sizeof(aio_max_nr),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif /* CONFIG_AIO */
+#ifdef CONFIG_INOTIFY_USER
+ {
+ .ctl_name = FS_INOTIFY,
+ .procname = "inotify",
+ .mode = 0555,
+ .child = inotify_table,
+ },
+#endif
+#ifdef CONFIG_EPOLL
+ {
+ .procname = "epoll",
+ .mode = 0555,
+ .child = epoll_table,
+ },
+#endif
+#endif
+ {
+ .ctl_name = KERN_SETUID_DUMPABLE,
+ .procname = "suid_dumpable",
+ .data = &suid_dumpable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "binfmt_misc",
+ .mode = 0555,
+ .child = binfmt_misc_table,
+ },
+#endif
+/*
+ * NOTE: do not add new entries to this table unless you have read
+ * Documentation/sysctl/ctl_unnumbered.txt
+ */
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table debug_table[] = {
+#if defined(CONFIG_X86) || defined(CONFIG_PPC)
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "exception-trace",
+ .data = &show_unhandled_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table dev_table[] = {
+ { .ctl_name = 0 }
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+ if (unlikely(p->unregistering))
+ return 0;
+ p->used++;
+ return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+ if (!--p->used)
+ if (unlikely(p->unregistering))
+ complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+ /*
+ * if p->used is 0, nobody will ever touch that entry again;
+ * we'll eliminate all paths to it before dropping sysctl_lock
+ */
+ if (unlikely(p->used)) {
+ struct completion wait;
+ init_completion(&wait);
+ p->unregistering = &wait;
+ spin_unlock(&sysctl_lock);
+ wait_for_completion(&wait);
+ spin_lock(&sysctl_lock);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
+ }
+ /*
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
+ list_del_init(&p->ctl_entry);
+}
+
+void sysctl_head_get(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ if (!--head->count)
+ kfree(head);
+ spin_unlock(&sysctl_lock);
+}
+
+struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+ if (!head)
+ BUG();
+ spin_lock(&sysctl_lock);
+ if (!use_table(head))
+ head = ERR_PTR(-ENOENT);
+ spin_unlock(&sysctl_lock);
+ return head;
+}
+
+void sysctl_head_finish(struct ctl_table_header *head)
+{
+ if (!head)
+ return;
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ struct ctl_table_set *set = &root->default_set;
+ if (root->lookup)
+ set = root->lookup(root, namespaces);
+ return set;
+}
+
+static struct list_head *
+lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ struct ctl_table_set *set = lookup_header_set(root, namespaces);
+ return &set->list;
+}
+
+struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
+ struct ctl_table_header *prev)
+{
+ struct ctl_table_root *root;
+ struct list_head *header_list;
+ struct ctl_table_header *head;
+ struct list_head *tmp;
+
+ spin_lock(&sysctl_lock);
+ if (prev) {
+ head = prev;
+ tmp = &prev->ctl_entry;
+ unuse_table(prev);
+ goto next;
+ }
+ tmp = &root_table_header.ctl_entry;
+ for (;;) {
+ head = list_entry(tmp, struct ctl_table_header, ctl_entry);
+
+ if (!use_table(head))
+ goto next;
+ spin_unlock(&sysctl_lock);
+ return head;
+ next:
+ root = head->root;
+ tmp = tmp->next;
+ header_list = lookup_header_list(root, namespaces);
+ if (tmp != header_list)
+ continue;
+
+ do {
+ root = list_entry(root->root_list.next,
+ struct ctl_table_root, root_list);
+ if (root == &sysctl_table_root)
+ goto out;
+ header_list = lookup_header_list(root, namespaces);
+ } while (list_empty(header_list));
+ tmp = header_list->next;
+ }
+out:
+ spin_unlock(&sysctl_lock);
+ return NULL;
+}
+
+struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
+{
+ return __sysctl_head_next(current->nsproxy, prev);
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+ spin_lock(&sysctl_lock);
+ list_add_tail(&root->root_list, &sysctl_table_root.root_list);
+ spin_unlock(&sysctl_lock);
+}
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+/* Perform the actual read/write of a sysctl table entry. */
+static int do_sysctl_strategy(struct ctl_table_root *root,
+ struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ int op = 0, rc;
+
+ if (oldval)
+ op |= MAY_READ;
+ if (newval)
+ op |= MAY_WRITE;
+ if (sysctl_perm(root, table, op))
+ return -EPERM;
+
+ if (table->strategy) {
+ rc = table->strategy(table, oldval, oldlenp, newval, newlen);
+ if (rc < 0)
+ return rc;
+ if (rc > 0)
+ return 0;
+ }
+
+ /* If there is no strategy routine, or if the strategy returns
+ * zero, proceed with automatic r/w */
+ if (table->data && table->maxlen) {
+ rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
+ if (rc < 0)
+ return rc;
+ }
+ return 0;
+}
+
+static int parse_table(int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ struct ctl_table_root *root,
+ struct ctl_table *table)
+{
+ int n;
+repeat:
+ if (!nlen)
+ return -ENOTDIR;
+ if (get_user(n, name))
+ return -EFAULT;
+ for ( ; table->ctl_name || table->procname; table++) {
+ if (!table->ctl_name)
+ continue;
+ if (n == table->ctl_name) {
+ int error;
+ if (table->child) {
+ if (sysctl_perm(root, table, MAY_EXEC))
+ return -EPERM;
+ name++;
+ nlen--;
+ table = table->child;
+ goto repeat;
+ }
+ error = do_sysctl_strategy(root, table,
+ oldval, oldlenp,
+ newval, newlen);
+ return error;
+ }
+ }
+ return -ENOTDIR;
+}
+
+int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ struct ctl_table_header *head;
+ int error = -ENOTDIR;
+
+ if (nlen <= 0 || nlen >= CTL_MAXNAME)
+ return -ENOTDIR;
+ if (oldval) {
+ int old_len;
+ if (!oldlenp || get_user(old_len, oldlenp))
+ return -EFAULT;
+ }
+
+ for (head = sysctl_head_next(NULL); head;
+ head = sysctl_head_next(head)) {
+ error = parse_table(name, nlen, oldval, oldlenp,
+ newval, newlen,
+ head->root, head->ctl_table);
+ if (error != -ENOTDIR) {
+ sysctl_head_finish(head);
+ break;
+ }
+ }
+ return error;
+}
+
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+ struct __sysctl_args tmp;
+ int error;
+
+ if (copy_from_user(&tmp, args, sizeof(tmp)))
+ return -EFAULT;
+
+ error = deprecated_sysctl_warning(&tmp);
+ if (error)
+ goto out;
+
+ lock_kernel();
+ error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
+ tmp.newval, tmp.newlen);
+ unlock_kernel();
+out:
+ return error;
+}
+#endif /* CONFIG_SYSCTL_SYSCALL */
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+ if (!current->euid)
+ mode >>= 6;
+ else if (in_egroup_p(0))
+ mode >>= 3;
+ if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+ return 0;
+ return -EACCES;
+}
+
+int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+{
+ int error;
+ int mode;
+
+ error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
+ if (error)
+ return error;
+
+ if (root->permissions)
+ mode = root->permissions(root, current->nsproxy, table);
+ else
+ mode = table->mode;
+
+ return test_perm(mode, op);
+}
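+
+/*
+ * Example of the above: an entry registered with .mode = 0444 (such as
+ * "ngroups_max" in kern_table) has no write bit in any permission
+ * class, so test_perm() refuses MAY_WRITE even for euid 0 and a write
+ * attempt fails with -EACCES.
+ */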
+
+static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
+{
+ for (; table->ctl_name || table->procname; table++) {
+ table->parent = parent;
+ if (table->child)
+ sysctl_set_parent(table, table->child);
+ }
+}
+
+static __init int sysctl_init(void)
+{
+ sysctl_set_parent(NULL, root_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+ {
+ int err;
+ err = sysctl_check_table(current->nsproxy, root_table);
+ }
+#endif
+ return 0;
+}
+
+core_initcall(sysctl_init);
+
+static struct ctl_table *is_branch_in(struct ctl_table *branch,
+ struct ctl_table *table)
+{
+ struct ctl_table *p;
+ const char *s = branch->procname;
+
+ /* branch should have named subdirectory as its first element */
+ if (!s || !branch->child)
+ return NULL;
+
+ /* ... and nothing else */
+ if (branch[1].procname || branch[1].ctl_name)
+ return NULL;
+
+ /* table should contain subdirectory with the same name */
+ for (p = table; p->procname || p->ctl_name; p++) {
+ if (!p->child)
+ continue;
+ if (p->procname && strcmp(p->procname, s) == 0)
+ return p;
+ }
+ return NULL;
+}
+
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+ struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+ struct ctl_table *next;
+ int is_better = 0;
+ int not_in_parent = !p->attached_by;
+
+ while ((next = is_branch_in(by, to)) != NULL) {
+ if (by == q->attached_by)
+ is_better = 1;
+ if (to == p->attached_by)
+ not_in_parent = 1;
+ by = by->child;
+ to = next->child;
+ }
+
+ if (is_better && not_in_parent) {
+ q->attached_by = by;
+ q->attached_to = to;
+ q->parent = p;
+ }
+}
+
+/**
+ * __register_sysctl_paths - register a sysctl hierarchy
+ * @root: List of sysctl headers to register on
+ * @namespaces: Data to compute which lists of sysctl entries are visible
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
+ * must be unique within that level of sysctl
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ * enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file, and for sysctl(2)
+ *
+ * child - a pointer to the child sysctl table if this entry is a directory, or
+ * %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * strategy - the strategy routine (described below)
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * sysctl(2) can automatically manage read and write requests through
+ * the sysctl table. The data and maxlen fields of the ctl_table
+ * struct enable minimal validation of the values being written to be
+ * performed, and the mode field allows minimal authentication.
+ *
+ * More sophisticated management can be enabled by the provision of a
+ * strategy routine with the table entry. This will be called before
+ * any automatic read or write of the data is performed.
+ *
+ * The strategy routine may return
+ *
+ * < 0 - Error occurred (error is passed to user process)
+ *
+ * 0 - OK - proceed with automatic read or write.
+ *
+ * > 0 - OK - read or write has been done by the strategy routine, so
+ * return immediately.
+ *
+ * There must be a proc_handler routine for any terminal nodes
+ * mirrored under /proc/sys (non-terminals are handled by a built-in
+ * directory handler). Several default handlers are available to
+ * cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_root *root,
+ struct nsproxy *namespaces,
+ const struct ctl_path *path, struct ctl_table *table)
+{
+ struct ctl_table_header *header;
+ struct ctl_table *new, **prevp;
+ unsigned int n, npath;
+ struct ctl_table_set *set;
+
+ /* Count the path components */
+ for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
+ ;
+
+ /*
+ * For each path component, allocate a 2-element ctl_table array.
+ * The first array element will be filled with the sysctl entry for
+ * this path component; the second will be the sentinel (ctl_name == 0).
+ *
+ * We allocate everything in one go so that we don't have to
+ * worry about freeing additional memory in unregister_sysctl_table.
+ */
+ header = kzalloc(sizeof(struct ctl_table_header) +
+ (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
+ if (!header)
+ return NULL;
+
+ new = (struct ctl_table *) (header + 1);
+
+ /* Now connect the dots */
+ prevp = &header->ctl_table;
+ for (n = 0; n < npath; ++n, ++path) {
+ /* Copy the procname */
+ new->procname = path->procname;
+ new->ctl_name = path->ctl_name;
+ new->mode = 0555;
+
+ *prevp = new;
+ prevp = &new->child;
+
+ new += 2;
+ }
+ *prevp = table;
+ header->ctl_table_arg = table;
+
+ INIT_LIST_HEAD(&header->ctl_entry);
+ header->used = 0;
+ header->unregistering = NULL;
+ header->root = root;
+ sysctl_set_parent(NULL, header->ctl_table);
+ header->count = 1;
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+ if (sysctl_check_table(namespaces, header->ctl_table)) {
+ kfree(header);
+ return NULL;
+ }
+#endif
+ spin_lock(&sysctl_lock);
+ header->set = lookup_header_set(root, namespaces);
+ header->attached_by = header->ctl_table;
+ header->attached_to = root_table;
+ header->parent = &root_table_header;
+ for (set = header->set; set; set = set->parent) {
+ struct ctl_table_header *p;
+ list_for_each_entry(p, &set->list, ctl_entry) {
+ if (p->unregistering)
+ continue;
+ try_attach(p, header);
+ }
+ }
+ header->parent->count++;
+ list_add_tail(&header->ctl_entry, &header->set->list);
+ spin_unlock(&sysctl_lock);
+
+ return header;
+}
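+
+/*
+ * A minimal usage sketch of this interface; the identifiers
+ * example_value, example_table and example_path are hypothetical and
+ * not defined in this file:
+ *
+ *	static int example_value;
+ *	static struct ctl_table example_table[] = {
+ *		{
+ *			.ctl_name	= CTL_UNNUMBERED,
+ *			.procname	= "example_value",
+ *			.data		= &example_value,
+ *			.maxlen		= sizeof(int),
+ *			.mode		= 0644,
+ *			.proc_handler	= &proc_dointvec,
+ *		},
+ *		{ .ctl_name = 0 }
+ *	};
+ *	static struct ctl_path example_path[] = {
+ *		{ .procname = "kernel", .ctl_name = CTL_KERN, },
+ *		{ }
+ *	};
+ *	struct ctl_table_header *header;
+ *
+ *	header = register_sysctl_paths(example_path, example_table);
+ *	if (!header)
+ *		return -ENOMEM;
+ *	...
+ *	unregister_sysctl_table(header);
+ *
+ * This would expose the value as /proc/sys/kernel/example_value;
+ * CTL_UNNUMBERED means it gets no stable binary sysctl(2) number.
+ */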
+
+/**
+ * register_sysctl_paths - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ struct ctl_table *table)
+{
+ return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
+ path, table);
+}
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+ static const struct ctl_path null_path[] = { {} };
+
+ return register_sysctl_paths(null_path, table);
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+ might_sleep();
+
+ if (header == NULL)
+ return;
+
+ spin_lock(&sysctl_lock);
+ start_unregistering(header);
+ if (!--header->parent->count) {
+ WARN_ON(1);
+ kfree(header->parent);
+ }
+ if (!--header->count)
+ kfree(header);
+ spin_unlock(&sysctl_lock);
+}
+
+int sysctl_is_seen(struct ctl_table_header *p)
+{
+ struct ctl_table_set *set = p->set;
+ int res;
+ spin_lock(&sysctl_lock);
+ if (p->unregistering)
+ res = 0;
+ else if (!set->is_seen)
+ res = 1;
+ else
+ res = set->is_seen(set);
+ spin_unlock(&sysctl_lock);
+ return res;
+}
+
+void setup_sysctl_set(struct ctl_table_set *p,
+ struct ctl_table_set *parent,
+ int (*is_seen)(struct ctl_table_set *))
+{
+ INIT_LIST_HEAD(&p->list);
+ p->parent = parent ? parent : &sysctl_table_root.default_set;
+ p->is_seen = is_seen;
+}
+
+#else /* !CONFIG_SYSCTL */
+struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
+{
+ return NULL;
+}
+
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ struct ctl_table *table)
+{
+ return NULL;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+void setup_sysctl_set(struct ctl_table_set *p,
+ struct ctl_table_set *parent,
+ int (*is_seen)(struct ctl_table_set *))
+{
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static int _proc_do_string(void *data, int maxlen, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ char __user *p;
+ char c;
+
+ if (!data || !maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ len = 0;
+ p = buffer;
+ while (len < *lenp) {
+ if (get_user(c, p++))
+ return -EFAULT;
+ if (c == 0 || c == '\n')
+ break;
+ len++;
+ }
+ if (len >= maxlen)
+ len = maxlen-1;
+ if (copy_from_user(data, buffer, len))
+ return -EFAULT;
+ ((char *) data)[len] = 0;
+ *ppos += *lenp;
+ } else {
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
+
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+
+ data += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ if (copy_to_user(buffer, data, len))
+ return -EFAULT;
+ if (len < *lenp) {
+ if (put_user('\n', ((char __user *) buffer) + len))
+ return -EFAULT;
+ len++;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. On a write, if the
+ * kernel buffer provided is not large enough to hold the string, the
+ * string is truncated; the stored copy is always %NUL-terminated. On
+ * a read, the string is copied to the user buffer with a trailing
+ * newline '\n' appended, and is truncated if the user buffer is not
+ * large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return _proc_do_string(table->data, table->maxlen, write, filp,
+ buffer, lenp, ppos);
+}
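+
+/*
+ * For a table entry wired to proc_dostring -- kernel.core_pattern in
+ * kern_table above is one such entry -- the /proc interface behaves
+ * roughly as follows (shell transcript for illustration only):
+ *
+ *	# cat /proc/sys/kernel/core_pattern
+ *	core
+ *	# echo "core.%p" > /proc/sys/kernel/core_pattern
+ *
+ * Writes longer than table->maxlen - 1 bytes are silently truncated.
+ */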
+
+
+static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ *valp = *negp ? -*lvalp : *lvalp;
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = -1;
+ *lvalp = (unsigned long)-val;
+ } else {
+ *negp = 0;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ int write, struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+#define TMPBUFLEN 21
+ int *i, vleft, first=1, neg;
+ unsigned long lval;
+ size_t left, len;
+
+ char buf[TMPBUFLEN], *p;
+ char __user *s = buffer;
+
+ if (!tbl_data || !table->maxlen || !*lenp ||
+ (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+ left = *lenp;
+
+ if (!conv)
+ conv = do_proc_dointvec_conv;
+
+ for (; left && vleft--; i++, first=0) {
+ if (write) {
+ while (left) {
+ char c;
+ if (get_user(c, s))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ s++;
+ }
+ if (!left)
+ break;
+ neg = 0;
+ len = left;
+ if (len > sizeof(buf) - 1)
+ len = sizeof(buf) - 1;
+ if (copy_from_user(buf, s, len))
+ return -EFAULT;
+ buf[len] = 0;
+ p = buf;
+ if (*p == '-' && left > 1) {
+ neg = 1;
+ p++;
+ }
+ if (*p < '0' || *p > '9')
+ break;
+
+ lval = simple_strtoul(p, &p, 0);
+
+ len = p-buf;
+ if ((len < left) && *p && !isspace(*p))
+ break;
+ s += len;
+ left -= len;
+
+ if (conv(&neg, &lval, i, 1, data))
+ break;
+ } else {
+ p = buf;
+ if (!first)
+ *p++ = '\t';
+
+ if (conv(&neg, &lval, i, 0, data))
+ break;
+
+ sprintf(p, "%s%lu", neg ? "-" : "", lval);
+ len = strlen(buf);
+ if (len > left)
+ len = left;
+ if (copy_to_user(s, buf, len))
+ return -EFAULT;
+ left -= len;
+ s += len;
+ }
+ }
+
+ if (!write && !first && left) {
+ if (put_user('\n', s))
+ return -EFAULT;
+ left--, s++;
+ }
+ if (write) {
+ while (left) {
+ char c;
+ if (get_user(c, s++))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ }
+ }
+ if (write && first)
+ return -EINVAL;
+ *lenp -= left;
+ *ppos += *lenp;
+ return 0;
+#undef TMPBUFLEN
+}
+
+static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write, filp,
+ buffer, lenp, ppos, conv, data);
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+ NULL,NULL);
+}
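+
+/*
+ * As a concrete illustration, the kernel.printk entry in kern_table has
+ * maxlen == 4*sizeof(int) and uses proc_dointvec, so it reads and writes
+ * four whitespace-separated integers (values below are illustrative):
+ *
+ *	# cat /proc/sys/kernel/printk
+ *	7	4	1	7
+ *	# echo "3 4 1 3" > /proc/sys/kernel/printk
+ */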
+
+/*
+ * Taint values can only be increased;
+ * this means we can safely use a temporary.
+ */
+static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ int i;
+ for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+ if ((tmptaint >> i) & 1)
+ add_taint(i);
+ }
+ }
+
+ return err;
+}
+
+struct do_proc_dointvec_minmax_conv_param {
+ int *min;
+ int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ if (write) {
+ int val = *negp ? -*lvalp : *lvalp;
+ if ((param->min && *param->min > val) ||
+ (param->max && *param->max < val))
+ return -EINVAL;
+ *valp = val;
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = -1;
+ *lvalp = (unsigned long)-val;
+ } else {
+ *negp = 0;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_minmax_conv, &param);
+}
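+
+/*
+ * Example (illustrative only; the variable, bounds and names below are
+ * hypothetical): clamping a value to the range [0, 100] by pointing
+ * extra1/extra2 at the limits and using proc_dointvec_minmax as the
+ * handler.
+ *
+ *	static int example_ratio;
+ *	static int example_ratio_min = 0;
+ *	static int example_ratio_max = 100;
+ *
+ *	{
+ *		.ctl_name	= CTL_UNNUMBERED,
+ *		.procname	= "example_ratio",
+ *		.data		= &example_ratio,
+ *		.maxlen		= sizeof(int),
+ *		.mode		= 0644,
+ *		.proc_handler	= &proc_dointvec_minmax,
+ *		.extra1		= &example_ratio_min,
+ *		.extra2		= &example_ratio_max,
+ *	},
+ *
+ * A value outside [0, 100] is refused by do_proc_dointvec_minmax_conv()
+ * above, so the stored value is never updated with it.
+ */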
+
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
+ struct file *filp,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ unsigned long convmul,
+ unsigned long convdiv)
+{
+#define TMPBUFLEN 21
+ unsigned long *i, *min, *max, val;
+ int vleft, first=1, neg;
+ size_t len, left;
+ char buf[TMPBUFLEN], *p;
+ char __user *s = buffer;
+
+ if (!data || !table->maxlen || !*lenp ||
+ (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned long *) data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
+
+ for (; left && vleft--; i++, min++, max++, first=0) {
+ if (write) {
+ while (left) {
+ char c;
+ if (get_user(c, s))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ s++;
+ }
+ if (!left)
+ break;
+ neg = 0;
+ len = left;
+ if (len > TMPBUFLEN-1)
+ len = TMPBUFLEN-1;
+ if (copy_from_user(buf, s, len))
+ return -EFAULT;
+ buf[len] = 0;
+ p = buf;
+ if (*p == '-' && left > 1) {
+ neg = 1;
+ p++;
+ }
+ if (*p < '0' || *p > '9')
+ break;
+			val = simple_strtoul(p, &p, 0) * convmul / convdiv;
+ len = p-buf;
+ if ((len < left) && *p && !isspace(*p))
+ break;
+ if (neg)
+ val = -val;
+ s += len;
+ left -= len;
+
+			if (neg)
+ continue;
+ if ((min && val < *min) || (max && val > *max))
+ continue;
+ *i = val;
+ } else {
+ p = buf;
+ if (!first)
+ *p++ = '\t';
+ sprintf(p, "%lu", convdiv * (*i) / convmul);
+ len = strlen(buf);
+ if (len > left)
+ len = left;
+			if (copy_to_user(s, buf, len))
+ return -EFAULT;
+ left -= len;
+ s += len;
+ }
+ }
+
+ if (!write && !first && left) {
+		if (put_user('\n', s))
+ return -EFAULT;
+ left--, s++;
+ }
+ if (write) {
+ while (left) {
+ char c;
+ if (get_user(c, s++))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ }
+ }
+ if (write && first)
+ return -EINVAL;
+ *lenp -= left;
+ *ppos += *lenp;
+ return 0;
+#undef TMPBUFLEN
+}
+
+static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ struct file *filp,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ filp, buffer, lenp, ppos, convmul, convdiv);
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
+}
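+
+/*
+ * Example (illustrative only; names and limits are hypothetical): for a
+ * vector of unsigned longs, extra1 and extra2 point to arrays holding a
+ * per-element minimum and maximum.
+ *
+ *	static unsigned long example_thresh[2];
+ *	static unsigned long example_thresh_min[2] = { 0, 0 };
+ *	static unsigned long example_thresh_max[2] = { 1024, 4096 };
+ *
+ *	{
+ *		.ctl_name	= CTL_UNNUMBERED,
+ *		.procname	= "example_thresh",
+ *		.data		= example_thresh,
+ *		.maxlen		= sizeof(example_thresh),
+ *		.mode		= 0644,
+ *		.proc_handler	= &proc_doulongvec_minmax,
+ *		.extra1		= example_thresh_min,
+ *		.extra2		= example_thresh_max,
+ *	},
+ *
+ * Note that an out-of-range element is silently skipped rather than
+ * failing the whole write; see the continue statements in
+ * __do_proc_doulongvec_minmax() above.
+ */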
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ struct file *filp,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, filp, buffer,
+ lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > LONG_MAX / HZ)
+ return 1;
+ *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = -1;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = 0;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / HZ;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
+ *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = -1;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = 0;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_clock_t(lval);
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = -1;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = 0;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_msecs(lval);
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+ do_proc_dointvec_jiffies_conv,NULL);
+}
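+
+/*
+ * Example (illustrative only; the variable name is hypothetical): a
+ * timeout kept internally in jiffies but shown to userspace in seconds.
+ *
+ *	static int example_timeout_jiffies;
+ *
+ *	{
+ *		.ctl_name	= CTL_UNNUMBERED,
+ *		.procname	= "example_timeout",
+ *		.data		= &example_timeout_jiffies,
+ *		.maxlen		= sizeof(int),
+ *		.mode		= 0644,
+ *		.proc_handler	= &proc_dointvec_jiffies,
+ *	},
+ *
+ * If example_timeout_jiffies holds 30 * HZ, reading the file prints 30;
+ * writing 60 stores 60 * HZ.
+ */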
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+ do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_vnr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+#else /* CONFIG_PROC_FS */
+
+int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ struct file *filp,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+
+#endif /* CONFIG_PROC_FS */
+
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+/*
+ * General sysctl support routines
+ */
+
+/* The generic sysctl data routine (used if no strategy routine supplied) */
+int sysctl_data(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ size_t len;
+
+	/* Bail out if there is no variable to read or write */
+ if (!table->data || !table->maxlen)
+ return -ENOTDIR;
+
+ if (oldval && oldlenp) {
+ if (get_user(len, oldlenp))
+ return -EFAULT;
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (copy_to_user(oldval, table->data, len))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+
+ if (newval && newlen) {
+ if (newlen > table->maxlen)
+ newlen = table->maxlen;
+
+ if (copy_from_user(table->data, newval, newlen))
+ return -EFAULT;
+ }
+ return 1;
+}
+
+/* The generic string strategy routine: */
+int sysctl_string(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ if (!table->data || !table->maxlen)
+ return -ENOTDIR;
+
+ if (oldval && oldlenp) {
+ size_t bufsize;
+ if (get_user(bufsize, oldlenp))
+ return -EFAULT;
+ if (bufsize) {
+ size_t len = strlen(table->data), copied;
+
+ /* This shouldn't trigger for a well-formed sysctl */
+ if (len > table->maxlen)
+ len = table->maxlen;
+
+ /* Copy up to a max of bufsize-1 bytes of the string */
+ copied = (len >= bufsize) ? bufsize - 1 : len;
+
+ if (copy_to_user(oldval, table->data, copied) ||
+ put_user(0, (char __user *)(oldval + copied)))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+ if (newval && newlen) {
+ size_t len = newlen;
+ if (len > table->maxlen)
+ len = table->maxlen;
+		if (copy_from_user(table->data, newval, len))
+ return -EFAULT;
+ if (len == table->maxlen)
+ len--;
+ ((char *) table->data)[len] = 0;
+ }
+ return 1;
+}
+
+/*
+ * This function makes sure that all of the integers in the vector
+ * are between the minimum and maximum values given in the arrays
+ * table->extra1 and table->extra2, respectively.
+ */
+int sysctl_intvec(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+
+ if (newval && newlen) {
+ int __user *vec = (int __user *) newval;
+ int *min = (int *) table->extra1;
+ int *max = (int *) table->extra2;
+ size_t length;
+ int i;
+
+ if (newlen % sizeof(int) != 0)
+ return -EINVAL;
+
+ if (!table->extra1 && !table->extra2)
+ return 0;
+
+ if (newlen > table->maxlen)
+ newlen = table->maxlen;
+ length = newlen / sizeof(int);
+
+ for (i = 0; i < length; i++) {
+ int value;
+ if (get_user(value, vec + i))
+ return -EFAULT;
+ if (min && value < min[i])
+ return -EINVAL;
+ if (max && value > max[i])
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
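+
+/*
+ * Example (illustrative only; the entry reuses the hypothetical
+ * example_ratio variables from the proc_dointvec_minmax example and a
+ * made-up binary number): an entry that is also reachable through the
+ * binary sysctl(2) interface can install sysctl_intvec as its strategy
+ * routine so the same extra1/extra2 bounds apply on that path as well.
+ *
+ *	{
+ *		.ctl_name	= 42,
+ *		.procname	= "example_ratio",
+ *		.data		= &example_ratio,
+ *		.maxlen		= sizeof(int),
+ *		.mode		= 0644,
+ *		.proc_handler	= &proc_dointvec_minmax,
+ *		.strategy	= &sysctl_intvec,
+ *		.extra1		= &example_ratio_min,
+ *		.extra2		= &example_ratio_max,
+ *	},
+ */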
+
+/* Strategy function to convert jiffies to seconds */
+int sysctl_jiffies(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ if (oldval && oldlenp) {
+ size_t olen;
+
+ if (get_user(olen, oldlenp))
+ return -EFAULT;
+ if (olen) {
+ int val;
+
+ if (olen < sizeof(int))
+ return -EINVAL;
+
+ val = *(int *)(table->data) / HZ;
+ if (put_user(val, (int __user *)oldval))
+ return -EFAULT;
+ if (put_user(sizeof(int), oldlenp))
+ return -EFAULT;
+ }
+ }
+ if (newval && newlen) {
+ int new;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(new, (int __user *)newval))
+ return -EFAULT;
+ *(int *)(table->data) = new*HZ;
+ }
+ return 1;
+}
+
+/* Strategy function to convert jiffies to milliseconds */
+int sysctl_ms_jiffies(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ if (oldval && oldlenp) {
+ size_t olen;
+
+ if (get_user(olen, oldlenp))
+ return -EFAULT;
+ if (olen) {
+ int val;
+
+ if (olen < sizeof(int))
+ return -EINVAL;
+
+ val = jiffies_to_msecs(*(int *)(table->data));
+ if (put_user(val, (int __user *)oldval))
+ return -EFAULT;
+ if (put_user(sizeof(int), oldlenp))
+ return -EFAULT;
+ }
+ }
+ if (newval && newlen) {
+ int new;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(new, (int __user *)newval))
+ return -EFAULT;
+ *(int *)(table->data) = msecs_to_jiffies(new);
+ }
+ return 1;
+}
+
+
+
+#else /* CONFIG_SYSCTL_SYSCALL */
+
+
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+ struct __sysctl_args tmp;
+ int error;
+
+ if (copy_from_user(&tmp, args, sizeof(tmp)))
+ return -EFAULT;
+
+ error = deprecated_sysctl_warning(&tmp);
+
+	/* If the parameters were read without error, just return -ENOSYS ... */
+ if (!error)
+ error = -ENOSYS;
+
+ return error;
+}
+
+int sysctl_data(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
+int sysctl_string(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
+int sysctl_intvec(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
+int sysctl_jiffies(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
+int sysctl_ms_jiffies(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_SYSCTL_SYSCALL */
+
+static int deprecated_sysctl_warning(struct __sysctl_args *args)
+{
+ static int msg_count;
+ int name[CTL_MAXNAME];
+ int i;
+
+ /* Check args->nlen. */
+ if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
+ return -ENOTDIR;
+
+ /* Read in the sysctl name for better debug message logging */
+ for (i = 0; i < args->nlen; i++)
+ if (get_user(name[i], args->name + i))
+ return -EFAULT;
+
+ /* Ignore accesses to kernel.version */
+ if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
+ return 0;
+
+ if (msg_count < 5) {
+ msg_count++;
+ printk(KERN_INFO
+ "warning: process `%s' used the deprecated sysctl "
+ "system call with ", current->comm);
+ for (i = 0; i < args->nlen; i++)
+ printk("%d.", name[i]);
+ printk("\n");
+ }
+ return 0;
+}
+
+/*
+ * No sense putting this after each symbol definition, twice,
+ * exception granted :-)
+ */
+EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+EXPORT_SYMBOL(proc_dostring);
+EXPORT_SYMBOL(proc_doulongvec_minmax);
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+EXPORT_SYMBOL(register_sysctl_table);
+EXPORT_SYMBOL(register_sysctl_paths);
+EXPORT_SYMBOL(sysctl_intvec);
+EXPORT_SYMBOL(sysctl_jiffies);
+EXPORT_SYMBOL(sysctl_ms_jiffies);
+EXPORT_SYMBOL(sysctl_string);
+EXPORT_SYMBOL(sysctl_data);
+EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
new file mode 100644
index 0000000..c35da23
--- /dev/null
+++ b/kernel/sysctl_check.c
@@ -0,0 +1,1545 @@
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
+#include <linux/sunrpc/debug.h>
+#include <linux/string.h>
+#include <net/ip_vs.h>
+
+struct trans_ctl_table {
+ int ctl_name;
+ const char *procname;
+ const struct trans_ctl_table *child;
+};
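+
+/*
+ * Each entry below pairs a binary sysctl number (ctl_name) with the
+ * procfs component name it corresponds to; a non-NULL child pointer
+ * descends one directory level, and entries with a zero ctl_name and a
+ * NULL procname but a child table cover dynamically named
+ * sub-directories (per-interface conf directories, for example).  So,
+ * within the kernel table, KERN_OSTYPE translates to "ostype", i.e.
+ * /proc/sys/kernel/ostype in the procfs view.
+ */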
+
+static const struct trans_ctl_table trans_random_table[] = {
+ { RANDOM_POOLSIZE, "poolsize" },
+ { RANDOM_ENTROPY_COUNT, "entropy_avail" },
+ { RANDOM_READ_THRESH, "read_wakeup_threshold" },
+ { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
+ { RANDOM_BOOT_ID, "boot_id" },
+ { RANDOM_UUID, "uuid" },
+ {}
+};
+
+static const struct trans_ctl_table trans_pty_table[] = {
+ { PTY_MAX, "max" },
+ { PTY_NR, "nr" },
+ {}
+};
+
+static const struct trans_ctl_table trans_kern_table[] = {
+ { KERN_OSTYPE, "ostype" },
+ { KERN_OSRELEASE, "osrelease" },
+ /* KERN_OSREV not used */
+ { KERN_VERSION, "version" },
+ /* KERN_SECUREMASK not used */
+ /* KERN_PROF not used */
+ { KERN_NODENAME, "hostname" },
+ { KERN_DOMAINNAME, "domainname" },
+
+ { KERN_PANIC, "panic" },
+ { KERN_REALROOTDEV, "real-root-dev" },
+
+ { KERN_SPARC_REBOOT, "reboot-cmd" },
+ { KERN_CTLALTDEL, "ctrl-alt-del" },
+ { KERN_PRINTK, "printk" },
+
+ /* KERN_NAMETRANS not used */
+ /* KERN_PPC_HTABRECLAIM not used */
+ /* KERN_PPC_ZEROPAGED not used */
+ { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
+
+ { KERN_MODPROBE, "modprobe" },
+ { KERN_SG_BIG_BUFF, "sg-big-buff" },
+ { KERN_ACCT, "acct" },
+ { KERN_PPC_L2CR, "l2cr" },
+
+ /* KERN_RTSIGNR not used */
+ /* KERN_RTSIGMAX not used */
+
+ { KERN_SHMMAX, "shmmax" },
+ { KERN_MSGMAX, "msgmax" },
+ { KERN_MSGMNB, "msgmnb" },
+	/* KERN_MSGPOOL not used */
+ { KERN_SYSRQ, "sysrq" },
+ { KERN_MAX_THREADS, "threads-max" },
+ { KERN_RANDOM, "random", trans_random_table },
+ { KERN_SHMALL, "shmall" },
+ { KERN_MSGMNI, "msgmni" },
+ { KERN_SEM, "sem" },
+ { KERN_SPARC_STOP_A, "stop-a" },
+ { KERN_SHMMNI, "shmmni" },
+
+ { KERN_OVERFLOWUID, "overflowuid" },
+ { KERN_OVERFLOWGID, "overflowgid" },
+
+ { KERN_HOTPLUG, "hotplug", },
+ { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
+
+ { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
+ { KERN_CORE_USES_PID, "core_uses_pid" },
+ { KERN_TAINTED, "tainted" },
+ { KERN_CADPID, "cad_pid" },
+ { KERN_PIDMAX, "pid_max" },
+ { KERN_CORE_PATTERN, "core_pattern" },
+ { KERN_PANIC_ON_OOPS, "panic_on_oops" },
+ { KERN_HPPA_PWRSW, "soft-power" },
+ { KERN_HPPA_UNALIGNED, "unaligned-trap" },
+
+ { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
+ { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
+
+ { KERN_PTY, "pty", trans_pty_table },
+ { KERN_NGROUPS_MAX, "ngroups_max" },
+ { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
+ { KERN_HZ_TIMER, "hz_timer" },
+ { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
+ { KERN_BOOTLOADER_TYPE, "bootloader_type" },
+ { KERN_RANDOMIZE, "randomize_va_space" },
+
+ { KERN_SPIN_RETRY, "spin_retry" },
+ { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
+ { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
+ { KERN_COMPAT_LOG, "compat-log" },
+ { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
+ { KERN_NMI_WATCHDOG, "nmi_watchdog" },
+ { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ {}
+};
+
+static const struct trans_ctl_table trans_vm_table[] = {
+ { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
+ { VM_PAGE_CLUSTER, "page-cluster" },
+ { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
+ { VM_DIRTY_RATIO, "dirty_ratio" },
+ { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
+ { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
+ { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
+ { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
+ /* VM_PAGEBUF unused */
+ { VM_HUGETLB_PAGES, "nr_hugepages" },
+ { VM_SWAPPINESS, "swappiness" },
+ { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
+ { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
+ { VM_MAX_MAP_COUNT, "max_map_count" },
+ { VM_LAPTOP_MODE, "laptop_mode" },
+ { VM_BLOCK_DUMP, "block_dump" },
+ { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
+ { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
+ { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
+ /* VM_SWAP_TOKEN_TIMEOUT unused */
+ { VM_DROP_PAGECACHE, "drop_caches" },
+ { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
+ { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
+ { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
+ { VM_PANIC_ON_OOM, "panic_on_oom" },
+ { VM_VDSO_ENABLED, "vdso_enabled" },
+ { VM_MIN_SLAB, "min_slab_ratio" },
+
+ {}
+};
+
+static const struct trans_ctl_table trans_net_core_table[] = {
+ { NET_CORE_WMEM_MAX, "wmem_max" },
+ { NET_CORE_RMEM_MAX, "rmem_max" },
+ { NET_CORE_WMEM_DEFAULT, "wmem_default" },
+ { NET_CORE_RMEM_DEFAULT, "rmem_default" },
+ /* NET_CORE_DESTROY_DELAY unused */
+ { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
+ /* NET_CORE_FASTROUTE unused */
+ { NET_CORE_MSG_COST, "message_cost" },
+ { NET_CORE_MSG_BURST, "message_burst" },
+ { NET_CORE_OPTMEM_MAX, "optmem_max" },
+ /* NET_CORE_HOT_LIST_LENGTH unused */
+ /* NET_CORE_DIVERT_VERSION unused */
+ /* NET_CORE_NO_CONG_THRESH unused */
+ /* NET_CORE_NO_CONG unused */
+ /* NET_CORE_LO_CONG unused */
+ /* NET_CORE_MOD_CONG unused */
+ { NET_CORE_DEV_WEIGHT, "dev_weight" },
+ { NET_CORE_SOMAXCONN, "somaxconn" },
+ { NET_CORE_BUDGET, "netdev_budget" },
+ { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
+ { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
+ { NET_CORE_WARNINGS, "warnings" },
+ {},
+};
+
+static const struct trans_ctl_table trans_net_unix_table[] = {
+ /* NET_UNIX_DESTROY_DELAY unused */
+ /* NET_UNIX_DELETE_DELAY unused */
+ { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
+ { NET_IPV4_ROUTE_FLUSH, "flush" },
+ { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
+ { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
+ { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
+ { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
+ { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
+ { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
+ { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
+ { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
+ { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
+ { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
+ { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
+ { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
+ { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
+ { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
+ { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
+ { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
+ { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
+ { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
+ { NET_IPV4_CONF_FORWARDING, "forwarding" },
+ { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
+
+ { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
+ { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
+ { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
+ { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
+ { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
+ { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
+ { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
+ { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
+ { NET_IPV4_CONF_TAG, "tag" },
+ { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
+ { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
+ { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
+ { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
+ { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
+
+ { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
+ { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
+ { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
+ { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
+ { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
+ { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
+ { 0, NULL, trans_net_ipv4_conf_vars_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
+ { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
+ { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
+ { NET_NEIGH_APP_SOLICIT, "app_solicit" },
+ { NET_NEIGH_RETRANS_TIME, "retrans_time" },
+ { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
+ { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
+ { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
+ { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
+ { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
+ { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
+ { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
+ { NET_NEIGH_LOCKTIME, "locktime" },
+ { NET_NEIGH_GC_INTERVAL, "gc_interval" },
+ { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
+ { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
+ { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
+ { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
+ { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_neigh_table[] = {
+ { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
+ { 0, NULL, trans_net_neigh_vars_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
+ { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
+
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
+
+ { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
+ { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
+ { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
+ { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
+
+ { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
+ { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
+ { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
+ { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
+ { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
+ { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
+
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
+ { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
+
+ { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
+ { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv4_table[] = {
+ { NET_IPV4_FORWARD, "ip_forward" },
+ { NET_IPV4_DYNADDR, "ip_dynaddr" },
+
+ { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
+ { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
+ { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
+ /* NET_IPV4_FIB_HASH unused */
+ { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
+
+ { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
+ { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
+ { NET_IPV4_TCP_SACK, "tcp_sack" },
+ { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
+ { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
+ /* NET_IPV4_AUTOCONFIG unused */
+ { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
+ { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
+ { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
+ { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
+ { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
+ /* NET_IPV4_TCP_MAX_KA_PROBES unused */
+ { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
+ { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
+ { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
+ { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
+ { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
+ /* NET_IPV4_IP_MASQ_DEBUG unused */
+ { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
+ { NET_TCP_STDURG, "tcp_stdurg" },
+ { NET_TCP_RFC1337, "tcp_rfc1337" },
+ /* NET_TCP_SYN_TAILDROP unused */
+ { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
+ { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
+ { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
+ { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
+ /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
+ /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
+ /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
+ /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
+ /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
+ { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
+ { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
+ { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
+ /* NET_IPV4_ALWAYS_DEFRAG unused */
+ { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
+ { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
+ { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
+ { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
+ { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
+ { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
+ { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
+ { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
+ { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
+ { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
+ { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
+ { NET_TCP_FACK, "tcp_fack" },
+ { NET_TCP_REORDERING, "tcp_reordering" },
+ { NET_TCP_ECN, "tcp_ecn" },
+ { NET_TCP_DSACK, "tcp_dsack" },
+ { NET_TCP_MEM, "tcp_mem" },
+ { NET_TCP_WMEM, "tcp_wmem" },
+ { NET_TCP_RMEM, "tcp_rmem" },
+ { NET_TCP_APP_WIN, "tcp_app_win" },
+ { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
+ { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
+ { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
+ { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
+ { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
+ { NET_TCP_FRTO, "tcp_frto" },
+ { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
+ { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
+ { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
+ { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
+ /* NET_TCP_DEFAULT_WIN_SCALE unused */
+ { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
+ { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
+ /* NET_TCP_BIC_BETA unused */
+ { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
+ { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
+ { NET_TCP_ABC, "tcp_abc" },
+ { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
+ { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
+ { NET_TCP_BASE_MSS, "tcp_base_mss" },
+ { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
+ { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
+ { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
+ { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
+ { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
+ { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
+ { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
+ { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
+ { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
+ { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
+ { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
+ { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipx_table[] = {
+ { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
+ /* NET_IPX_FORWARDING unused */
+ {}
+};
+
+static const struct trans_ctl_table trans_net_atalk_table[] = {
+ { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
+ { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
+ { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
+ { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
+ {},
+};
+
+static const struct trans_ctl_table trans_net_netrom_table[] = {
+ { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
+ { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
+ { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
+ { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
+ { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
+ { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
+ { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
+ { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
+ { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
+ { NET_NETROM_ROUTING_CONTROL, "routing_control" },
+ { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
+ { NET_NETROM_RESET, "reset" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ax25_param_table[] = {
+ { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
+ { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
+ { NET_AX25_BACKOFF_TYPE, "backoff_type" },
+ { NET_AX25_CONNECT_MODE, "connect_mode" },
+ { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
+ { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
+ { NET_AX25_T1_TIMEOUT, "t1_timeout" },
+ { NET_AX25_T2_TIMEOUT, "t2_timeout" },
+ { NET_AX25_T3_TIMEOUT, "t3_timeout" },
+ { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
+ { NET_AX25_N2, "maximum_retry_count" },
+ { NET_AX25_PACLEN, "maximum_packet_length" },
+ { NET_AX25_PROTOCOL, "protocol" },
+ { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ax25_table[] = {
+ { 0, NULL, trans_net_ax25_param_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_bridge_table[] = {
+ { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
+ { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
+ { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
+ { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
+ { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_rose_table[] = {
+ { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
+ { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
+ { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
+ { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
+ { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
+ { NET_ROSE_ROUTING_CONTROL, "routing_control" },
+ { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
+ { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
+ { NET_ROSE_WINDOW_SIZE, "window_size" },
+ { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
+ { NET_IPV6_FORWARDING, "forwarding" },
+ { NET_IPV6_HOP_LIMIT, "hop_limit" },
+ { NET_IPV6_MTU, "mtu" },
+ { NET_IPV6_ACCEPT_RA, "accept_ra" },
+ { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
+ { NET_IPV6_AUTOCONF, "autoconf" },
+ { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
+ { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
+ { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
+ { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
+ { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
+ { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
+ { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
+ { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
+ { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
+ { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
+ { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
+ { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
+ { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
+ { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
+ { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
+ { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
+ { NET_IPV6_PROXY_NDP, "proxy_ndp" },
+ { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
+ { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
+ { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
+ { 0, NULL, trans_net_ipv6_conf_var_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
+ { NET_IPV6_ROUTE_FLUSH, "flush" },
+ { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
+ { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
+ { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
+ { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
+ { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
+ { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
+ { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
+ { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
+ { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
+ { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_ipv6_table[] = {
+ { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
+ { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
+ { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
+ { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
+ { NET_IPV6_BINDV6ONLY, "bindv6only" },
+ { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
+ { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
+ { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
+ { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
+ { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
+ { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_x25_table[] = {
+ { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
+ { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
+ { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
+ { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
+ { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
+ { NET_X25_FORWARD, "x25_forward" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_tr_table[] = {
+ { NET_TR_RIF_TIMEOUT, "rif_timeout" },
+ {}
+};
+
+
+static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
+ { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
+ { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
+ { NET_DECNET_CONF_DEV_T2, "t2" },
+ { NET_DECNET_CONF_DEV_T3, "t3" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_decnet_conf[] = {
+ { 0, NULL, trans_net_decnet_conf_vars },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_decnet_table[] = {
+ { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
+ { NET_DECNET_NODE_ADDRESS, "node_address" },
+ { NET_DECNET_NODE_NAME, "node_name" },
+ { NET_DECNET_DEFAULT_DEVICE, "default_device" },
+ { NET_DECNET_TIME_WAIT, "time_wait" },
+ { NET_DECNET_DN_COUNT, "dn_count" },
+ { NET_DECNET_DI_COUNT, "di_count" },
+ { NET_DECNET_DR_COUNT, "dr_count" },
+ { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
+ { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
+ { NET_DECNET_MEM, "decnet_mem" },
+ { NET_DECNET_RMEM, "decnet_rmem" },
+ { NET_DECNET_WMEM, "decnet_wmem" },
+ { NET_DECNET_DEBUG_LEVEL, "debug" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_sctp_table[] = {
+ { NET_SCTP_RTO_INITIAL, "rto_initial" },
+ { NET_SCTP_RTO_MIN, "rto_min" },
+ { NET_SCTP_RTO_MAX, "rto_max" },
+ { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
+ { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
+ { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
+ { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
+ { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
+ { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
+ { NET_SCTP_HB_INTERVAL, "hb_interval" },
+ { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
+ { NET_SCTP_MAX_BURST, "max_burst" },
+ { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
+ { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
+ { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
+ { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
+ { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
+ { NET_LLC2_ACK_TIMEOUT, "ack" },
+ { NET_LLC2_P_TIMEOUT, "p" },
+ { NET_LLC2_REJ_TIMEOUT, "rej" },
+ { NET_LLC2_BUSY_TIMEOUT, "busy" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_llc_station_table[] = {
+ { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
+ { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_llc_table[] = {
+ { NET_LLC2, "llc2", trans_net_llc_llc2_table },
+ { NET_LLC_STATION, "station", trans_net_llc_station_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_netfilter_table[] = {
+ { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
+ { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
+ { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
+ { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
+ { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
+ { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
+ { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
+ { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
+ { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
+ { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
+ { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
+ { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
+ { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
+ { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
+ { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
+ { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
+ { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
+ { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
+
+ {}
+};
+
+static const struct trans_ctl_table trans_net_dccp_table[] = {
+ { NET_DCCP_DEFAULT, "default" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_irda_table[] = {
+ { NET_IRDA_DISCOVERY, "discovery" },
+ { NET_IRDA_DEVNAME, "devname" },
+ { NET_IRDA_DEBUG, "debug" },
+ { NET_IRDA_FAST_POLL, "fast_poll_increase" },
+ { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
+ { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
+ { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
+ { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
+ { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
+ { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
+ { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
+ { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
+ { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
+ { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
+ {}
+};
+
+static const struct trans_ctl_table trans_net_table[] = {
+ { NET_CORE, "core", trans_net_core_table },
+ /* NET_ETHER not used */
+ /* NET_802 not used */
+ { NET_UNIX, "unix", trans_net_unix_table },
+ { NET_IPV4, "ipv4", trans_net_ipv4_table },
+ { NET_IPX, "ipx", trans_net_ipx_table },
+ { NET_ATALK, "appletalk", trans_net_atalk_table },
+ { NET_NETROM, "netrom", trans_net_netrom_table },
+ { NET_AX25, "ax25", trans_net_ax25_table },
+ { NET_BRIDGE, "bridge", trans_net_bridge_table },
+ { NET_ROSE, "rose", trans_net_rose_table },
+ { NET_IPV6, "ipv6", trans_net_ipv6_table },
+ { NET_X25, "x25", trans_net_x25_table },
+ { NET_TR, "token-ring", trans_net_tr_table },
+ { NET_DECNET, "decnet", trans_net_decnet_table },
+ /* NET_ECONET not used */
+ { NET_SCTP, "sctp", trans_net_sctp_table },
+ { NET_LLC, "llc", trans_net_llc_table },
+ { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
+ { NET_DCCP, "dccp", trans_net_dccp_table },
+ { NET_IRDA, "irda", trans_net_irda_table },
+ { 2089, "nf_conntrack_max" },
+ {}
+};
+
+static const struct trans_ctl_table trans_fs_quota_table[] = {
+ { FS_DQ_LOOKUPS, "lookups" },
+ { FS_DQ_DROPS, "drops" },
+ { FS_DQ_READS, "reads" },
+ { FS_DQ_WRITES, "writes" },
+ { FS_DQ_CACHE_HITS, "cache_hits" },
+ { FS_DQ_ALLOCATED, "allocated_dquots" },
+ { FS_DQ_FREE, "free_dquots" },
+ { FS_DQ_SYNCS, "syncs" },
+ { FS_DQ_WARNINGS, "warnings" },
+ {}
+};
+
+static const struct trans_ctl_table trans_fs_xfs_table[] = {
+ { XFS_RESTRICT_CHOWN, "restrict_chown" },
+ { XFS_SGID_INHERIT, "irix_sgid_inherit" },
+ { XFS_SYMLINK_MODE, "irix_symlink_mode" },
+ { XFS_PANIC_MASK, "panic_mask" },
+
+ { XFS_ERRLEVEL, "error_level" },
+ { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
+ { XFS_INHERIT_SYNC, "inherit_sync" },
+ { XFS_INHERIT_NODUMP, "inherit_nodump" },
+ { XFS_INHERIT_NOATIME, "inherit_noatime" },
+ { XFS_BUF_TIMER, "xfsbufd_centisecs" },
+ { XFS_BUF_AGE, "age_buffer_centisecs" },
+ { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
+ { XFS_ROTORSTEP, "rotorstep" },
+ { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
+ { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
+ { XFS_STATS_CLEAR, "stats_clear" },
+ {}
+};
+
+static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
+ { 1, "hb_ctl_path" },
+ {}
+};
+
+static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
+ { 1, "nm", trans_fs_ocfs2_nm_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_inotify_table[] = {
+ { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
+ { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
+ { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
+ {}
+};
+
+static const struct trans_ctl_table trans_fs_table[] = {
+ { FS_NRINODE, "inode-nr" },
+ { FS_STATINODE, "inode-state" },
+ /* FS_MAXINODE unused */
+ /* FS_NRDQUOT unused */
+ /* FS_MAXDQUOT unused */
+ { FS_NRFILE, "file-nr" },
+ { FS_MAXFILE, "file-max" },
+ { FS_DENTRY, "dentry-state" },
+ /* FS_NRSUPER unused */
+ /* FS_MAXUPSER unused */
+ { FS_OVERFLOWUID, "overflowuid" },
+ { FS_OVERFLOWGID, "overflowgid" },
+ { FS_LEASES, "leases-enable" },
+ { FS_DIR_NOTIFY, "dir-notify-enable" },
+ { FS_LEASE_TIME, "lease-break-time" },
+ { FS_DQSTATS, "quota", trans_fs_quota_table },
+ { FS_XFS, "xfs", trans_fs_xfs_table },
+ { FS_AIO_NR, "aio-nr" },
+ { FS_AIO_MAX_NR, "aio-max-nr" },
+ { FS_INOTIFY, "inotify", trans_inotify_table },
+ { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
+ { KERN_SETUID_DUMPABLE, "suid_dumpable" },
+ {}
+};
+
+static const struct trans_ctl_table trans_debug_table[] = {
+ {}
+};
+
+static const struct trans_ctl_table trans_cdrom_table[] = {
+ { DEV_CDROM_INFO, "info" },
+ { DEV_CDROM_AUTOCLOSE, "autoclose" },
+ { DEV_CDROM_AUTOEJECT, "autoeject" },
+ { DEV_CDROM_DEBUG, "debug" },
+ { DEV_CDROM_LOCK, "lock" },
+ { DEV_CDROM_CHECK_MEDIA, "check_media" },
+ {}
+};
+
+static const struct trans_ctl_table trans_ipmi_table[] = {
+ { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
+ {}
+};
+
+static const struct trans_ctl_table trans_mac_hid_files[] = {
+ /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
+ /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
+ { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
+ { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
+ { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
+ /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
+ {}
+};
+
+static const struct trans_ctl_table trans_raid_table[] = {
+ { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
+ { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
+ {}
+};
+
+static const struct trans_ctl_table trans_scsi_table[] = {
+ { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
+ {}
+};
+
+static const struct trans_ctl_table trans_parport_default_table[] = {
+ { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
+ { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
+ {}
+};
+
+static const struct trans_ctl_table trans_parport_device_table[] = {
+ { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
+ {}
+};
+
+static const struct trans_ctl_table trans_parport_devices_table[] = {
+ { DEV_PARPORT_DEVICES_ACTIVE, "active" },
+ { 0, NULL, trans_parport_device_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_parport_parport_table[] = {
+ { DEV_PARPORT_SPINTIME, "spintime" },
+ { DEV_PARPORT_BASE_ADDR, "base-addr" },
+ { DEV_PARPORT_IRQ, "irq" },
+ { DEV_PARPORT_DMA, "dma" },
+ { DEV_PARPORT_MODES, "modes" },
+ { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
+ { DEV_PARPORT_AUTOPROBE, "autoprobe" },
+ { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
+ { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
+ { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
+ { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
+ {}
+};
+static const struct trans_ctl_table trans_parport_table[] = {
+ { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
+ { 0, NULL, trans_parport_parport_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_dev_table[] = {
+ { DEV_CDROM, "cdrom", trans_cdrom_table },
+ /* DEV_HWMON unused */
+ { DEV_PARPORT, "parport", trans_parport_table },
+ { DEV_RAID, "raid", trans_raid_table },
+ { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
+ { DEV_SCSI, "scsi", trans_scsi_table },
+ { DEV_IPMI, "ipmi", trans_ipmi_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_bus_isa_table[] = {
+ { BUS_ISA_MEM_BASE, "membase" },
+ { BUS_ISA_PORT_BASE, "portbase" },
+ { BUS_ISA_PORT_SHIFT, "portshift" },
+ {}
+};
+
+static const struct trans_ctl_table trans_bus_table[] = {
+ { CTL_BUS_ISA, "isa", trans_bus_isa_table },
+ {}
+};
+
+static const struct trans_ctl_table trans_arlan_conf_table0[] = {
+ { 1, "spreadingCode" },
+ { 2, "channelNumber" },
+ { 3, "scramblingDisable" },
+ { 4, "txAttenuation" },
+ { 5, "systemId" },
+ { 6, "maxDatagramSize" },
+ { 7, "maxFrameSize" },
+ { 8, "maxRetries" },
+ { 9, "receiveMode" },
+ { 10, "priority" },
+ { 11, "rootOrRepeater" },
+ { 12, "SID" },
+ { 13, "registrationMode" },
+ { 14, "registrationFill" },
+ { 15, "localTalkAddress" },
+ { 16, "codeFormat" },
+ { 17, "numChannels" },
+ { 18, "channel1" },
+ { 19, "channel2" },
+ { 20, "channel3" },
+ { 21, "channel4" },
+ { 22, "txClear" },
+ { 23, "txRetries" },
+ { 24, "txRouting" },
+ { 25, "txScrambled" },
+ { 26, "rxParameter" },
+ { 27, "txTimeoutMs" },
+ { 28, "waitCardTimeout" },
+ { 29, "channelSet" },
+ { 30, "name" },
+ { 31, "waitTime" },
+ { 32, "lParameter" },
+ { 33, "_15" },
+ { 34, "headerSize" },
+ { 36, "tx_delay_ms" },
+ { 37, "retries" },
+ { 38, "ReTransmitPacketMaxSize" },
+ { 39, "waitReTransmitPacketMaxSize" },
+ { 40, "fastReTransCount" },
+ { 41, "driverRetransmissions" },
+ { 42, "txAckTimeoutMs" },
+ { 43, "registrationInterrupts" },
+ { 44, "hardwareType" },
+ { 45, "radioType" },
+ { 46, "writeEEPROM" },
+ { 47, "writeRadioType" },
+ { 48, "entry_exit_debug" },
+ { 49, "debug" },
+ { 50, "in_speed" },
+ { 51, "out_speed" },
+ { 52, "in_speed10" },
+ { 53, "out_speed10" },
+ { 54, "in_speed_max" },
+ { 55, "out_speed_max" },
+ { 56, "measure_rate" },
+ { 57, "pre_Command_Wait" },
+ { 58, "rx_tweak1" },
+ { 59, "rx_tweak2" },
+ { 60, "tx_queue_len" },
+
+ { 150, "arlan0-txRing" },
+ { 151, "arlan0-rxRing" },
+ { 152, "arlan0-18" },
+ { 153, "arlan0-ring" },
+ { 154, "arlan0-shm-cpy" },
+ { 155, "config0" },
+ { 156, "reset0" },
+ {}
+};
+
+static const struct trans_ctl_table trans_arlan_conf_table1[] = {
+ { 1, "spreadingCode" },
+ { 2, "channelNumber" },
+ { 3, "scramblingDisable" },
+ { 4, "txAttenuation" },
+ { 5, "systemId" },
+ { 6, "maxDatagramSize" },
+ { 7, "maxFrameSize" },
+ { 8, "maxRetries" },
+ { 9, "receiveMode" },
+ { 10, "priority" },
+ { 11, "rootOrRepeater" },
+ { 12, "SID" },
+ { 13, "registrationMode" },
+ { 14, "registrationFill" },
+ { 15, "localTalkAddress" },
+ { 16, "codeFormat" },
+ { 17, "numChannels" },
+ { 18, "channel1" },
+ { 19, "channel2" },
+ { 20, "channel3" },
+ { 21, "channel4" },
+ { 22, "txClear" },
+ { 23, "txRetries" },
+ { 24, "txRouting" },
+ { 25, "txScrambled" },
+ { 26, "rxParameter" },
+ { 27, "txTimeoutMs" },
+ { 28, "waitCardTimeout" },
+ { 29, "channelSet" },
+ { 30, "name" },
+ { 31, "waitTime" },
+ { 32, "lParameter" },
+ { 33, "_15" },
+ { 34, "headerSize" },
+ { 36, "tx_delay_ms" },
+ { 37, "retries" },
+ { 38, "ReTransmitPacketMaxSize" },
+ { 39, "waitReTransmitPacketMaxSize" },
+ { 40, "fastReTransCount" },
+ { 41, "driverRetransmissions" },
+ { 42, "txAckTimeoutMs" },
+ { 43, "registrationInterrupts" },
+ { 44, "hardwareType" },
+ { 45, "radioType" },
+ { 46, "writeEEPROM" },
+ { 47, "writeRadioType" },
+ { 48, "entry_exit_debug" },
+ { 49, "debug" },
+ { 50, "in_speed" },
+ { 51, "out_speed" },
+ { 52, "in_speed10" },
+ { 53, "out_speed10" },
+ { 54, "in_speed_max" },
+ { 55, "out_speed_max" },
+ { 56, "measure_rate" },
+ { 57, "pre_Command_Wait" },
+ { 58, "rx_tweak1" },
+ { 59, "rx_tweak2" },
+ { 60, "tx_queue_len" },
+
+ { 150, "arlan1-txRing" },
+ { 151, "arlan1-rxRing" },
+ { 152, "arlan1-18" },
+ { 153, "arlan1-ring" },
+ { 154, "arlan1-shm-cpy" },
+ { 155, "config1" },
+ { 156, "reset1" },
+ {}
+};
+
+static const struct trans_ctl_table trans_arlan_conf_table2[] = {
+ { 1, "spreadingCode" },
+ { 2, "channelNumber" },
+ { 3, "scramblingDisable" },
+ { 4, "txAttenuation" },
+ { 5, "systemId" },
+ { 6, "maxDatagramSize" },
+ { 7, "maxFrameSize" },
+ { 8, "maxRetries" },
+ { 9, "receiveMode" },
+ { 10, "priority" },
+ { 11, "rootOrRepeater" },
+ { 12, "SID" },
+ { 13, "registrationMode" },
+ { 14, "registrationFill" },
+ { 15, "localTalkAddress" },
+ { 16, "codeFormat" },
+ { 17, "numChannels" },
+ { 18, "channel1" },
+ { 19, "channel2" },
+ { 20, "channel3" },
+ { 21, "channel4" },
+ { 22, "txClear" },
+ { 23, "txRetries" },
+ { 24, "txRouting" },
+ { 25, "txScrambled" },
+ { 26, "rxParameter" },
+ { 27, "txTimeoutMs" },
+ { 28, "waitCardTimeout" },
+ { 29, "channelSet" },
+ { 30, "name" },
+ { 31, "waitTime" },
+ { 32, "lParameter" },
+ { 33, "_15" },
+ { 34, "headerSize" },
+ { 36, "tx_delay_ms" },
+ { 37, "retries" },
+ { 38, "ReTransmitPacketMaxSize" },
+ { 39, "waitReTransmitPacketMaxSize" },
+ { 40, "fastReTransCount" },
+ { 41, "driverRetransmissions" },
+ { 42, "txAckTimeoutMs" },
+ { 43, "registrationInterrupts" },
+ { 44, "hardwareType" },
+ { 45, "radioType" },
+ { 46, "writeEEPROM" },
+ { 47, "writeRadioType" },
+ { 48, "entry_exit_debug" },
+ { 49, "debug" },
+ { 50, "in_speed" },
+ { 51, "out_speed" },
+ { 52, "in_speed10" },
+ { 53, "out_speed10" },
+ { 54, "in_speed_max" },
+ { 55, "out_speed_max" },
+ { 56, "measure_rate" },
+ { 57, "pre_Command_Wait" },
+ { 58, "rx_tweak1" },
+ { 59, "rx_tweak2" },
+ { 60, "tx_queue_len" },
+
+ { 150, "arlan2-txRing" },
+ { 151, "arlan2-rxRing" },
+ { 152, "arlan2-18" },
+ { 153, "arlan2-ring" },
+ { 154, "arlan2-shm-cpy" },
+ { 155, "config2" },
+ { 156, "reset2" },
+ {}
+};
+
+static const struct trans_ctl_table trans_arlan_conf_table3[] = {
+ { 1, "spreadingCode" },
+ { 2, "channelNumber" },
+ { 3, "scramblingDisable" },
+ { 4, "txAttenuation" },
+ { 5, "systemId" },
+ { 6, "maxDatagramSize" },
+ { 7, "maxFrameSize" },
+ { 8, "maxRetries" },
+ { 9, "receiveMode" },
+ { 10, "priority" },
+ { 11, "rootOrRepeater" },
+ { 12, "SID" },
+ { 13, "registrationMode" },
+ { 14, "registrationFill" },
+ { 15, "localTalkAddress" },
+ { 16, "codeFormat" },
+ { 17, "numChannels" },
+ { 18, "channel1" },
+ { 19, "channel2" },
+ { 20, "channel3" },
+ { 21, "channel4" },
+ { 22, "txClear" },
+ { 23, "txRetries" },
+ { 24, "txRouting" },
+ { 25, "txScrambled" },
+ { 26, "rxParameter" },
+ { 27, "txTimeoutMs" },
+ { 28, "waitCardTimeout" },
+ { 29, "channelSet" },
+ { 30, "name" },
+ { 31, "waitTime" },
+ { 32, "lParameter" },
+ { 33, "_15" },
+ { 34, "headerSize" },
+ { 36, "tx_delay_ms" },
+ { 37, "retries" },
+ { 38, "ReTransmitPacketMaxSize" },
+ { 39, "waitReTransmitPacketMaxSize" },
+ { 40, "fastReTransCount" },
+ { 41, "driverRetransmissions" },
+ { 42, "txAckTimeoutMs" },
+ { 43, "registrationInterrupts" },
+ { 44, "hardwareType" },
+ { 45, "radioType" },
+ { 46, "writeEEPROM" },
+ { 47, "writeRadioType" },
+ { 48, "entry_exit_debug" },
+ { 49, "debug" },
+ { 50, "in_speed" },
+ { 51, "out_speed" },
+ { 52, "in_speed10" },
+ { 53, "out_speed10" },
+ { 54, "in_speed_max" },
+ { 55, "out_speed_max" },
+ { 56, "measure_rate" },
+ { 57, "pre_Command_Wait" },
+ { 58, "rx_tweak1" },
+ { 59, "rx_tweak2" },
+ { 60, "tx_queue_len" },
+
+ { 150, "arlan3-txRing" },
+ { 151, "arlan3-rxRing" },
+ { 152, "arlan3-18" },
+ { 153, "arlan3-ring" },
+ { 154, "arlan3-shm-cpy" },
+ { 155, "config3" },
+ { 156, "reset3" },
+ {}
+};
+
+static const struct trans_ctl_table trans_arlan_table[] = {
+ { 1, "arlan0", trans_arlan_conf_table0 },
+ { 2, "arlan1", trans_arlan_conf_table1 },
+ { 3, "arlan2", trans_arlan_conf_table2 },
+ { 4, "arlan3", trans_arlan_conf_table3 },
+ {}
+};
+
+static const struct trans_ctl_table trans_s390dbf_table[] = {
+ { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
+ { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
+ {}
+};
+
+static const struct trans_ctl_table trans_sunrpc_table[] = {
+ { CTL_RPCDEBUG, "rpc_debug" },
+ { CTL_NFSDEBUG, "nfs_debug" },
+ { CTL_NFSDDEBUG, "nfsd_debug" },
+ { CTL_NLMDEBUG, "nlm_debug" },
+ { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
+ { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
+ { CTL_MIN_RESVPORT, "min_resvport" },
+ { CTL_MAX_RESVPORT, "max_resvport" },
+ {}
+};
+
+static const struct trans_ctl_table trans_pm_table[] = {
+ { 1 /* CTL_PM_SUSPEND */, "suspend" },
+ { 2 /* CTL_PM_CMODE */, "cmode" },
+ { 3 /* CTL_PM_P0 */, "p0" },
+ { 4 /* CTL_PM_CM */, "cm" },
+ {}
+};
+
+static const struct trans_ctl_table trans_frv_table[] = {
+ { 1, "cache-mode" },
+ { 2, "pin-cxnr" },
+ {}
+};
+
+static const struct trans_ctl_table trans_root_table[] = {
+ { CTL_KERN, "kernel", trans_kern_table },
+ { CTL_VM, "vm", trans_vm_table },
+ { CTL_NET, "net", trans_net_table },
+ /* CTL_PROC not used */
+ { CTL_FS, "fs", trans_fs_table },
+ { CTL_DEBUG, "debug", trans_debug_table },
+ { CTL_DEV, "dev", trans_dev_table },
+ { CTL_BUS, "bus", trans_bus_table },
+ { CTL_ABI, "abi" },
+ /* CTL_CPU not used */
+ { CTL_ARLAN, "arlan", trans_arlan_table },
+ { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
+ { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
+ { CTL_PM, "pm", trans_pm_table },
+ { CTL_FRV, "frv", trans_frv_table },
+ {}
+};
+
+
+
+
+static int sysctl_depth(struct ctl_table *table)
+{
+ struct ctl_table *tmp;
+ int depth;
+
+ depth = 0;
+ for (tmp = table; tmp->parent; tmp = tmp->parent)
+ depth++;
+
+ return depth;
+}
+
+static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
+{
+ int i;
+
+ for (i = 0; table && i < n; i++)
+ table = table->parent;
+
+ return table;
+}
+
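+/*
+ * Look up the binary sysctl translation entry for @table by walking
+ * trans_root_table from the root down, matching each ancestor of
+ * @table by procname and/or ctl_name.  Returns the matching entry,
+ * or NULL when the table has no known binary sysctl path.
+ */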
+static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
+{
+ struct ctl_table *test;
+ const struct trans_ctl_table *ref;
+ int cur_depth;
+
+ cur_depth = sysctl_depth(table);
+
+ ref = trans_root_table;
+repeat:
+ test = sysctl_parent(table, cur_depth);
+ for (; ref->ctl_name || ref->procname || ref->child; ref++) {
+ int match = 0;
+
+ if (cur_depth && !ref->child)
+ continue;
+
+ if (test->procname && ref->procname &&
+ (strcmp(test->procname, ref->procname) == 0))
+ match++;
+
+ if (test->ctl_name && ref->ctl_name &&
+ (test->ctl_name == ref->ctl_name))
+ match++;
+
+ if (!ref->ctl_name && !ref->procname)
+ match++;
+
+ if (match) {
+ if (cur_depth != 0) {
+ cur_depth--;
+ ref = ref->child;
+ goto repeat;
+ }
+ goto out;
+ }
+ }
+ ref = NULL;
+out:
+ return ref;
+}
+
+static void sysctl_print_path(struct ctl_table *table)
+{
+ struct ctl_table *tmp;
+ int depth, i;
+ depth = sysctl_depth(table);
+ if (table->procname) {
+ for (i = depth; i >= 0; i--) {
+ tmp = sysctl_parent(table, i);
+ printk("/%s", tmp->procname?tmp->procname:"");
+ }
+ }
+ printk(" ");
+ if (table->ctl_name) {
+ for (i = depth; i >= 0; i--) {
+ tmp = sysctl_parent(table, i);
+ printk(".%d", tmp->ctl_name);
+ }
+ }
+}
+
+static void sysctl_repair_table(struct ctl_table *table)
+{
+ /* Don't complain about the classic default
+ * sysctl strategy routine. Maybe later we
+ * can get the tables fixed and complain about
+ * this.
+ */
+ if (table->ctl_name && table->procname &&
+ (table->proc_handler == proc_dointvec) &&
+ (!table->strategy)) {
+ table->strategy = sysctl_data;
+ }
+}
+
+static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
+ struct ctl_table *table)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *ref, *test;
+ int depth, cur_depth;
+
+ depth = sysctl_depth(table);
+
+ for (head = __sysctl_head_next(namespaces, NULL); head;
+ head = __sysctl_head_next(namespaces, head)) {
+ cur_depth = depth;
+ ref = head->ctl_table;
+repeat:
+ test = sysctl_parent(table, cur_depth);
+ for (; ref->ctl_name || ref->procname; ref++) {
+ int match = 0;
+ if (cur_depth && !ref->child)
+ continue;
+
+ if (test->procname && ref->procname &&
+ (strcmp(test->procname, ref->procname) == 0))
+ match++;
+
+ if (test->ctl_name && ref->ctl_name &&
+ (test->ctl_name == ref->ctl_name))
+ match++;
+
+ if (match) {
+ if (cur_depth != 0) {
+ cur_depth--;
+ ref = ref->child;
+ goto repeat;
+ }
+ goto out;
+ }
+ }
+ }
+ ref = NULL;
+out:
+ sysctl_head_finish(head);
+ return ref;
+}
+
+static void set_fail(const char **fail, struct ctl_table *table, const char *str)
+{
+ if (*fail) {
+ printk(KERN_ERR "sysctl table check failed: ");
+ sysctl_print_path(table);
+ printk(" %s\n", *fail);
+ dump_stack();
+ }
+ *fail = str;
+}
+
+static int sysctl_check_dir(struct nsproxy *namespaces,
+ struct ctl_table *table)
+{
+ struct ctl_table *ref;
+ int error;
+
+ error = 0;
+ ref = sysctl_check_lookup(namespaces, table);
+ if (ref) {
+ int match = 0;
+ if ((!table->procname && !ref->procname) ||
+ (table->procname && ref->procname &&
+ (strcmp(table->procname, ref->procname) == 0)))
+ match++;
+
+ if ((!table->ctl_name && !ref->ctl_name) ||
+ (table->ctl_name && ref->ctl_name &&
+ (table->ctl_name == ref->ctl_name)))
+ match++;
+
+ if (match != 2) {
+ printk(KERN_ERR "%s: failed: ", __func__);
+ sysctl_print_path(table);
+ printk(" ref: ");
+ sysctl_print_path(ref);
+ printk("\n");
+ error = -EINVAL;
+ }
+ }
+ return error;
+}
+
+static void sysctl_check_leaf(struct nsproxy *namespaces,
+ struct ctl_table *table, const char **fail)
+{
+ struct ctl_table *ref;
+
+ ref = sysctl_check_lookup(namespaces, table);
+ if (ref && (ref != table))
+ set_fail(fail, table, "Sysctl already exists");
+}
+
+static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
+{
+ const struct trans_ctl_table *ref;
+
+ ref = sysctl_binary_lookup(table);
+ if (table->ctl_name && !ref)
+ set_fail(fail, table, "Unknown sysctl binary path");
+ if (ref) {
+ if (ref->procname &&
+ (!table->procname ||
+ (strcmp(table->procname, ref->procname) != 0)))
+ set_fail(fail, table, "procname does not match binary path procname");
+
+ if (ref->ctl_name && table->ctl_name &&
+ (table->ctl_name != ref->ctl_name))
+ set_fail(fail, table, "ctl_name does not match binary path ctl_name");
+ }
+}
+
+int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
+{
+ int error = 0;
+ for (; table->ctl_name || table->procname; table++) {
+ const char *fail = NULL;
+
+ sysctl_repair_table(table);
+ if (table->parent) {
+ if (table->procname && !table->parent->procname)
+ set_fail(&fail, table, "Parent without procname");
+ if (table->ctl_name && !table->parent->ctl_name)
+ set_fail(&fail, table, "Parent without ctl_name");
+ }
+ if (!table->procname)
+ set_fail(&fail, table, "No procname");
+ if (table->child) {
+ if (table->data)
+ set_fail(&fail, table, "Directory with data?");
+ if (table->maxlen)
+ set_fail(&fail, table, "Directory with maxlen?");
+ if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
+ set_fail(&fail, table, "Writable sysctl directory");
+ if (table->proc_handler)
+ set_fail(&fail, table, "Directory with proc_handler");
+ if (table->strategy)
+ set_fail(&fail, table, "Directory with strategy");
+ if (table->extra1)
+ set_fail(&fail, table, "Directory with extra1");
+ if (table->extra2)
+ set_fail(&fail, table, "Directory with extra2");
+ if (sysctl_check_dir(namespaces, table))
+ set_fail(&fail, table, "Inconsistent directory names");
+ } else {
+ if ((table->strategy == sysctl_data) ||
+ (table->strategy == sysctl_string) ||
+ (table->strategy == sysctl_intvec) ||
+ (table->strategy == sysctl_jiffies) ||
+ (table->strategy == sysctl_ms_jiffies) ||
+ (table->proc_handler == proc_dostring) ||
+ (table->proc_handler == proc_dointvec) ||
+ (table->proc_handler == proc_dointvec_minmax) ||
+ (table->proc_handler == proc_dointvec_jiffies) ||
+ (table->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (table->proc_handler == proc_dointvec_ms_jiffies) ||
+ (table->proc_handler == proc_doulongvec_minmax) ||
+ (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!table->data)
+ set_fail(&fail, table, "No data");
+ if (!table->maxlen)
+ set_fail(&fail, table, "No maxlen");
+ }
+ if ((table->proc_handler == proc_doulongvec_minmax) ||
+ (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (table->maxlen > sizeof (unsigned long)) {
+ if (!table->extra1)
+ set_fail(&fail, table, "No min");
+ if (!table->extra2)
+ set_fail(&fail, table, "No max");
+ }
+ }
+#ifdef CONFIG_SYSCTL_SYSCALL
+ if (table->ctl_name && !table->strategy)
+ set_fail(&fail, table, "Missing strategy");
+#endif
+#if 0
+ if (!table->ctl_name && table->strategy)
+ set_fail(&fail, table, "Strategy without ctl_name");
+#endif
+#ifdef CONFIG_PROC_FS
+ if (table->procname && !table->proc_handler)
+ set_fail(&fail, table, "No proc_handler");
+#endif
+#if 0
+ if (!table->procname && table->proc_handler)
+ set_fail(&fail, table, "proc_handler without procname");
+#endif
+ sysctl_check_leaf(namespaces, table, &fail);
+ }
+ sysctl_check_bin_path(table, &fail);
+ if (table->mode > 0777)
+ set_fail(&fail, table, "bogus .mode");
+ if (fail) {
+ set_fail(&fail, table, NULL);
+ error = -EINVAL;
+ }
+ if (table->child)
+ error |= sysctl_check_table(namespaces, table->child);
+ }
+ return error;
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 0000000..bd6be76
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,627 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
+#include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/cgroupstats.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum);
+static int family_registered;
+struct kmem_cache *taskstats_cache;
+
+static struct genl_family family = {
+ .id = GENL_ID_GENERATE,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+ [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+ [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
+};
+
+static struct nla_policy
+cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
+ [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
+};
+
+struct listener {
+ struct list_head list;
+ pid_t pid;
+ char valid;
+};
+
+struct listener_list {
+ struct rw_semaphore sem;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+ REGISTER,
+ DEREGISTER,
+ CPU_DONT_CARE
+};
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+ size_t size)
+{
+ struct sk_buff *skb;
+ void *reply;
+
+ /*
+ * If new attributes are added, please revisit this allocation
+ */
+ skb = genlmsg_new(size, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = get_cpu_var(taskstats_seqnum)++;
+ put_cpu_var(taskstats_seqnum);
+
+ reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
+ } else
+ reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+
+ *skbp = skb;
+ return 0;
+}
+
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
+ void *reply = genlmsg_data(genlhdr);
+ int rc;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return rc;
+ }
+
+ return genlmsg_unicast(skb, pid);
+}
+
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static void send_cpu_listeners(struct sk_buff *skb,
+ struct listener_list *listeners)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
+ struct listener *s, *tmp;
+ struct sk_buff *skb_next, *skb_cur = skb;
+ void *reply = genlmsg_data(genlhdr);
+ int rc, delcount = 0;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return;
+ }
+
+ rc = 0;
+ down_read(&listeners->sem);
+ list_for_each_entry(s, &listeners->list, list) {
+ skb_next = NULL;
+ if (!list_is_last(&s->list, &listeners->list)) {
+ skb_next = skb_clone(skb_cur, GFP_KERNEL);
+ if (!skb_next)
+ break;
+ }
+ rc = genlmsg_unicast(skb_cur, s->pid);
+ if (rc == -ECONNREFUSED) {
+ s->valid = 0;
+ delcount++;
+ }
+ skb_cur = skb_next;
+ }
+ up_read(&listeners->sem);
+
+ if (skb_cur)
+ nlmsg_free(skb_cur);
+
+ if (!delcount)
+ return;
+
+ /* Delete invalidated entries */
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (!s->valid) {
+ list_del(&s->list);
+ kfree(s);
+ }
+ }
+ up_write(&listeners->sem);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *tsk,
+ struct taskstats *stats)
+{
+ int rc = 0;
+
+ if (!tsk) {
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return -ESRCH;
+ } else
+ get_task_struct(tsk);
+
+ memset(stats, 0, sizeof(*stats));
+ /*
+ * Each accounting subsystem adds calls to its functions to
+ * fill in relevant parts of struct taskstats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+
+ delayacct_add_tsk(stats, tsk);
+
+ /* fill in basic acct fields */
+ stats->version = TASKSTATS_VERSION;
+ stats->nvcsw = tsk->nvcsw;
+ stats->nivcsw = tsk->nivcsw;
+ bacct_add_tsk(stats, tsk);
+
+ /* fill in extended acct fields */
+ xacct_add_tsk(stats, tsk);
+
+ /* Define err: label here if needed */
+ put_task_struct(tsk);
+ return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *first,
+ struct taskstats *stats)
+{
+ struct task_struct *tsk;
+ unsigned long flags;
+ int rc = -ESRCH;
+
+ /*
+ * Add additional stats from live tasks except zombie thread group
+ * leaders who are already counted with the dead tasks
+ */
+ rcu_read_lock();
+ if (!first)
+ first = find_task_by_vpid(tgid);
+
+ if (!first || !lock_task_sighand(first, &flags))
+ goto out;
+
+ if (first->signal->stats)
+ memcpy(stats, first->signal->stats, sizeof(*stats));
+ else
+ memset(stats, 0, sizeof(*stats));
+
+ tsk = first;
+ do {
+ if (tsk->exit_state)
+ continue;
+ /*
+ * Each accounting subsystem can call its functions here to
+ * fill in relevant parts of struct taskstats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+ delayacct_add_tsk(stats, tsk);
+
+ stats->nvcsw += tsk->nvcsw;
+ stats->nivcsw += tsk->nivcsw;
+ } while_each_thread(first, tsk);
+
+ unlock_task_sighand(first, &flags);
+ rc = 0;
+out:
+ rcu_read_unlock();
+
+ stats->version = TASKSTATS_VERSION;
+ /*
+ * Accounting subsystems can also add calls here to modify
+ * fields of taskstats.
+ */
+ return rc;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ if (!tsk->signal->stats)
+ goto ret;
+
+ /*
+ * Each accounting subsystem calls its functions here to
+ * accumulate its per-task stats for tsk into the per-tgid structure
+ *
+ * per-task-foo(tsk->signal->stats, tsk);
+ */
+ delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+ return;
+}
+
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+ struct listener_list *listeners;
+ struct listener *s, *tmp;
+ unsigned int cpu;
+ cpumask_t mask = *maskp;
+
+ if (!cpus_subset(mask, cpu_possible_map))
+ return -EINVAL;
+
+ if (isadd == REGISTER) {
+ for_each_cpu_mask_nr(cpu, mask) {
+ s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!s)
+ goto cleanup;
+ s->pid = pid;
+ INIT_LIST_HEAD(&s->list);
+ s->valid = 1;
+
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_add(&s->list, &listeners->list);
+ up_write(&listeners->sem);
+ }
+ return 0;
+ }
+
+ /* Deregister or cleanup */
+cleanup:
+ for_each_cpu_mask_nr(cpu, mask) {
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (s->pid == pid) {
+ list_del(&s->list);
+ kfree(s);
+ break;
+ }
+ }
+ up_write(&listeners->sem);
+ }
+ return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+ char *data;
+ int len;
+ int ret;
+
+ if (na == NULL)
+ return 1;
+ len = nla_len(na);
+ if (len > TASKSTATS_CPUMASK_MAXLEN)
+ return -E2BIG;
+ if (len < 1)
+ return -EINVAL;
+ data = kmalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ nla_strlcpy(data, na, len);
+ ret = cpulist_parse(data, *mask);
+ kfree(data);
+ return ret;
+}
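+
+/*
+ * The cpumask attributes carry a cpulist string as accepted by
+ * cpulist_parse(), e.g. "0-3,8" selects CPUs 0, 1, 2, 3 and 8.
+ * Illustrative flow (not normative):
+ *
+ *   TASKSTATS_CMD_ATTR_REGISTER_CPUMASK = "0-3,8"
+ *     -> parse() builds the cpumask
+ *     -> add_del_listener() adds the sender's pid to the per-cpu
+ *        listener lists of those CPUs
+ */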
+
+static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
+{
+ struct nlattr *na, *ret;
+ int aggr;
+
+ aggr = (type == TASKSTATS_TYPE_PID)
+ ? TASKSTATS_TYPE_AGGR_PID
+ : TASKSTATS_TYPE_AGGR_TGID;
+
+ na = nla_nest_start(skb, aggr);
+ if (!na)
+ goto err;
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+ goto err;
+ ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+ if (!ret)
+ goto err;
+ nla_nest_end(skb, na);
+
+ return nla_data(ret);
+err:
+ return NULL;
+}
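+
+/*
+ * Shape of the reply built by mk_reply() (sketch):
+ *
+ *   TASKSTATS_TYPE_AGGR_PID (or _TGID)    nested attribute
+ *     TASKSTATS_TYPE_PID (or _TGID)       u32 pid/tgid
+ *     TASKSTATS_TYPE_STATS                struct taskstats
+ *
+ * Callers fill the reserved TASKSTATS_TYPE_STATS payload through the
+ * returned pointer, typically via fill_pid() or fill_tgid().
+ */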
+
+static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+ struct sk_buff *rep_skb;
+ struct cgroupstats *stats;
+ struct nlattr *na;
+ size_t size;
+ u32 fd;
+ struct file *file;
+ int fput_needed;
+
+ na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
+ if (!na)
+ return -EINVAL;
+
+ fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return 0;
+
+ size = nla_total_size(sizeof(struct cgroupstats));
+
+ rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
+ size);
+ if (rc < 0)
+ goto err;
+
+ na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
+ sizeof(struct cgroupstats));
+ stats = nla_data(na);
+ memset(stats, 0, sizeof(*stats));
+
+ rc = cgroupstats_build(stats, file->f_dentry);
+ if (rc < 0) {
+ nlmsg_free(rep_skb);
+ goto err;
+ }
+
+ rc = send_reply(rep_skb, info->snd_pid);
+
+err:
+ fput_light(file, fput_needed);
+ return rc;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+ struct sk_buff *rep_skb;
+ struct taskstats *stats;
+ size_t size;
+ cpumask_t mask;
+
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return add_del_listener(info->snd_pid, &mask, DEREGISTER);
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return rc;
+
+ rc = -EINVAL;
+ if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+ if (!stats)
+ goto err;
+
+ rc = fill_pid(pid, NULL, stats);
+ if (rc < 0)
+ goto err;
+ } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+ u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ if (!stats)
+ goto err;
+
+ rc = fill_tgid(tgid, NULL, stats);
+ if (rc < 0)
+ goto err;
+ } else
+ goto err;
+
+ return send_reply(rep_skb, info->snd_pid);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct taskstats *stats;
+
+ if (sig->stats || thread_group_empty(tsk))
+ goto ret;
+
+ /* No problem if kmem_cache_zalloc() fails */
+ stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!sig->stats) {
+ sig->stats = stats;
+ stats = NULL;
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (stats)
+ kmem_cache_free(taskstats_cache, stats);
+ret:
+ return sig->stats;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit(struct task_struct *tsk, int group_dead)
+{
+ int rc;
+ struct listener_list *listeners;
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ int is_thread_group;
+
+ if (!family_registered)
+ return;
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ is_thread_group = !!taskstats_tgid_alloc(tsk);
+ if (is_thread_group) {
+ /* PID + STATS + TGID + STATS */
+ size = 2 * size;
+ /* fill the tsk->signal->stats structure */
+ fill_tgid_exit(tsk);
+ }
+
+ listeners = &__raw_get_cpu_var(listener_array);
+ if (list_empty(&listeners->list))
+ return;
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return;
+
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
+ if (!stats)
+ goto err;
+
+ rc = fill_pid(-1, tsk, stats);
+ if (rc < 0)
+ goto err;
+
+ /*
+ * Doesn't matter if tsk is the leader or the last group member leaving
+ */
+ if (!is_thread_group || !group_dead)
+ goto send;
+
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
+ if (!stats)
+ goto err;
+
+ memcpy(stats, tsk->signal->stats, sizeof(*stats));
+
+send:
+ send_cpu_listeners(rep_skb, listeners);
+ return;
+err:
+ nlmsg_free(rep_skb);
+}
+
+static struct genl_ops taskstats_ops = {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_user_cmd,
+ .policy = taskstats_cmd_get_policy,
+};
+
+static struct genl_ops cgroupstats_ops = {
+ .cmd = CGROUPSTATS_CMD_GET,
+ .doit = cgroupstats_user_cmd,
+ .policy = cgroupstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+ unsigned int i;
+
+ taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
+ for_each_possible_cpu(i) {
+ INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+ init_rwsem(&(per_cpu(listener_array, i).sem));
+ }
+}
+
+static int __init taskstats_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&family);
+ if (rc)
+ return rc;
+
+ rc = genl_register_ops(&family, &taskstats_ops);
+ if (rc < 0)
+ goto err;
+
+ rc = genl_register_ops(&family, &cgroupstats_ops);
+ if (rc < 0)
+ goto err_cgroup_ops;
+
+ family_registered = 1;
+ printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
+ return 0;
+err_cgroup_ops:
+ genl_unregister_ops(&family, &taskstats_ops);
+err:
+ genl_unregister_family(&family);
+ return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 0000000..06b6395
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,228 @@
+/*
+ * test_kprobes.c - simple sanity test for *probes
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+
+#define div_factor 3
+
+static u32 rand1, preh_val, posth_val, jph_val;
+static int errors, handler_errors, num_tests;
+
+static noinline u32 kprobe_target(u32 value)
+{
+ /*
+ * gcc ignores noinline on some architectures unless we stuff
+ * sufficient lard into the function. The get_kprobe() here is
+ * just for that.
+ *
+ * NOTE: We aren't concerned about the correctness of get_kprobe()
+ * here; hence, this call is neither under !preempt nor with the
+ * kprobe_mutex held. This is fine(tm)
+ */
+ if (get_kprobe((void *)0xdeadbeef))
+ printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
+
+ return (value / div_factor);
+}
+
+static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ preh_val = (rand1 / div_factor);
+ return 0;
+}
+
+static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+ if (preh_val != (rand1 / div_factor)) {
+ handler_errors++;
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "incorrect value in post_handler\n");
+ }
+ posth_val = preh_val + div_factor;
+}
+
+static struct kprobe kp = {
+ .symbol_name = "kprobe_target",
+ .pre_handler = kp_pre_handler,
+ .post_handler = kp_post_handler
+};
+
+static int test_kprobe(void)
+{
+ int ret;
+
+ ret = register_kprobe(&kp);
+ if (ret < 0) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "register_kprobe returned %d\n", ret);
+ return ret;
+ }
+
+ ret = kprobe_target(rand1);
+ unregister_kprobe(&kp);
+
+ if (preh_val == 0) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "kprobe pre_handler not called\n");
+ handler_errors++;
+ }
+
+ if (posth_val == 0) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "kprobe post_handler not called\n");
+ handler_errors++;
+ }
+
+ return 0;
+}
+
+static u32 j_kprobe_target(u32 value)
+{
+ if (value != rand1) {
+ handler_errors++;
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "incorrect value in jprobe handler\n");
+ }
+
+ jph_val = rand1;
+ jprobe_return();
+ return 0;
+}
+
+static struct jprobe jp = {
+ .entry = j_kprobe_target,
+ .kp.symbol_name = "kprobe_target"
+};
+
+static int test_jprobe(void)
+{
+ int ret;
+
+ ret = register_jprobe(&jp);
+ if (ret < 0) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "register_jprobe returned %d\n", ret);
+ return ret;
+ }
+
+ ret = kprobe_target(rand1);
+ unregister_jprobe(&jp);
+ if (jph_val == 0) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "jprobe handler not called\n");
+ handler_errors++;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_KRETPROBES
+static u32 krph_val;
+
+static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ krph_val = (rand1 / div_factor);
+ return 0;
+}
+
+static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ unsigned long ret = regs_return_value(regs);
+
+ if (ret != (rand1 / div_factor)) {
+ handler_errors++;
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "incorrect value in kretprobe handler\n");
+ }
+ if (krph_val == 0) {
+ handler_errors++;
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "call to kretprobe entry handler failed\n");
+ }
+
+ krph_val = rand1;
+ return 0;
+}
+
+static struct kretprobe rp = {
+ .handler = return_handler,
+ .entry_handler = entry_handler,
+ .kp.symbol_name = "kprobe_target"
+};
+
+static int test_kretprobe(void)
+{
+ int ret;
+
+ ret = register_kretprobe(&rp);
+ if (ret < 0) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "register_kretprobe returned %d\n", ret);
+ return ret;
+ }
+
+ ret = kprobe_target(rand1);
+ unregister_kretprobe(&rp);
+ if (krph_val != rand1) {
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "kretprobe handler not called\n");
+ handler_errors++;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+int init_test_probes(void)
+{
+ int ret;
+
+ do {
+ rand1 = random32();
+ } while (rand1 <= div_factor);
+
+ printk(KERN_INFO "Kprobe smoke test started\n");
+ num_tests++;
+ ret = test_kprobe();
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_jprobe();
+ if (ret < 0)
+ errors++;
+
+#ifdef CONFIG_KRETPROBES
+ num_tests++;
+ ret = test_kretprobe();
+ if (ret < 0)
+ errors++;
+#endif /* CONFIG_KRETPROBES */
+
+ if (errors)
+ printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
+ "%d tests failed\n", errors, num_tests);
+ else if (handler_errors)
+ printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
+ "running handlers\n", handler_errors);
+ else
+ printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+
+ return 0;
+}
diff --git a/kernel/time.c b/kernel/time.c
new file mode 100644
index 0000000..8a2693a
--- /dev/null
+++ b/kernel/time.c
@@ -0,0 +1,689 @@
+/*
+ * linux/kernel/time.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file contains the interface functions for the various
+ * time related system calls: time, stime, gettimeofday, settimeofday,
+ * adjtime
+ */
+/*
+ * Modification history kernel/time.c
+ *
+ * 1993-09-02 Philip Gladstone
+ * Created file with time related functions from sched.c and adjtimex()
+ * 1993-10-08 Torsten Duwe
+ * adjtime interface update and CMOS clock write code
+ * 1995-08-13 Torsten Duwe
+ * kernel PLL updated to 1994-12-13 specs (rfc-1589)
+ * 1999-01-16 Ulrich Windl
+ * Introduced error checking for many cases in adjtimex().
+ * Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
+ * (Even though the technical memorandum forbids it)
+ * 2004-07-14 Christoph Lameter
+ * Added getnstimeofday to allow the posix timer functions to return
+ * with nanosecond accuracy
+ */
+
+#include <linux/module.h>
+#include <linux/timex.h>
+#include <linux/capability.h>
+#include <linux/clocksource.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/security.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/math64.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#include "timeconst.h"
+
+/*
+ * The timezone where the local system is located. Used as a default by some
+ * programs that obtain this value by using gettimeofday.
+ */
+struct timezone sys_tz;
+
+EXPORT_SYMBOL(sys_tz);
+
+#ifdef __ARCH_WANT_SYS_TIME
+
+/*
+ * sys_time() can be implemented at user level using
+ * sys_gettimeofday(). Is this for backwards compatibility? If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+SYSCALL_DEFINE1(time, time_t __user *, tloc)
+{
+ time_t i = get_seconds();
+
+ if (tloc) {
+ if (put_user(i,tloc))
+ i = -EFAULT;
+ }
+ return i;
+}
+
+/*
+ * sys_stime() can be implemented at user level using
+ * sys_settimeofday(). Is this for backwards compatibility? If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+
+SYSCALL_DEFINE1(stime, time_t __user *, tptr)
+{
+ struct timespec tv;
+ int err;
+
+ if (get_user(tv.tv_sec, tptr))
+ return -EFAULT;
+
+ tv.tv_nsec = 0;
+
+ err = security_settime(&tv, NULL);
+ if (err)
+ return err;
+
+ do_settimeofday(&tv);
+ return 0;
+}
+
+#endif /* __ARCH_WANT_SYS_TIME */
+
+SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
+ struct timezone __user *, tz)
+{
+ if (likely(tv != NULL)) {
+ struct timeval ktv;
+ do_gettimeofday(&ktv);
+ if (copy_to_user(tv, &ktv, sizeof(ktv)))
+ return -EFAULT;
+ }
+ if (unlikely(tz != NULL)) {
+ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ *
+ * This is ugly, but preferable to the alternatives. Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be
+ * hard to make the program warp the clock precisely n hours) or
+ * compile in the timezone information into the kernel. Bad, bad....
+ *
+ * - TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+static inline void warp_clock(void)
+{
+ write_seqlock_irq(&xtime_lock);
+ wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
+ xtime.tv_sec += sys_tz.tz_minuteswest * 60;
+ update_xtime_cache(0);
+ write_sequnlock_irq(&xtime_lock);
+ clock_was_set();
+}
+
+/*
+ * If for some reason the CMOS clock is not already running in UTC
+ * but in some local time, then the first time we set the timezone
+ * we warp the clock so that it ticks UTC time instead of
+ * local time. Presumably, if someone is setting the timezone then we
+ * are running in an environment where the programs understand about
+ * timezones. This should be done at boot time in the /etc/rc script,
+ * as soon as possible, so that the clock can be set right. Otherwise,
+ * various programs will get confused when the clock gets warped.
+ */
+
+int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
+{
+ static int firsttime = 1;
+ int error = 0;
+
+ if (tv && !timespec_valid(tv))
+ return -EINVAL;
+
+ error = security_settime(tv, tz);
+ if (error)
+ return error;
+
+ if (tz) {
+ /* SMP safe, global irq locking makes it work. */
+ sys_tz = *tz;
+ update_vsyscall_tz();
+ if (firsttime) {
+ firsttime = 0;
+ if (!tv)
+ warp_clock();
+ }
+ }
+ if (tv)
+ {
+ /* SMP safe, again the code in arch/foo/time.c should
+ * globally block out interrupts when it runs.
+ */
+ return do_settimeofday(tv);
+ }
+ return 0;
+}
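+
+/*
+ * Example (illustrative): if the CMOS clock runs in local time for a
+ * zone five hours west of UTC (tz_minuteswest = 300), the first
+ * settimeofday() call that passes only a timezone makes warp_clock()
+ * add 300 * 60 = 18000 seconds to xtime, converting the local wall
+ * time the system booted with into UTC.
+ */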
+
+SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
+ struct timezone __user *, tz)
+{
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
+
+ if (tv) {
+ if (copy_from_user(&user_tv, tv, sizeof(*tv)))
+ return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ }
+ if (tz) {
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
+ return -EFAULT;
+ }
+
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+}
+
+SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+{
+ struct timex txc; /* Local copy of parameter */
+ int ret;
+
+ /* Copy the user-space data into the kernel copy of the
+ * structure. But bear in mind that the structures
+ * may change
+ */
+ if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
+ return -EFAULT;
+ ret = do_adjtimex(&txc);
+ return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+}
+
+/**
+ * current_fs_time - Return FS time
+ * @sb: Superblock.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs.
+ */
+struct timespec current_fs_time(struct super_block *sb)
+{
+ struct timespec now = current_kernel_time();
+ return timespec_trunc(now, sb->s_time_gran);
+}
+EXPORT_SYMBOL(current_fs_time);
+
+/*
+ * Convert jiffies to milliseconds and back.
+ *
+ * Avoid unnecessary multiplications/divisions in the
+ * two most common HZ cases:
+ */
+unsigned int inline jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+# if BITS_PER_LONG == 32
+ return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+# else
+ return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
+# endif
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_msecs);
+
+unsigned int inline jiffies_to_usecs(const unsigned long j)
+{
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
+#else
+# if BITS_PER_LONG == 32
+ return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
+# else
+ return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
+# endif
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs);
+
+/**
+ * timespec_trunc - Truncate timespec to a granularity
+ * @t: Timespec
+ * @gran: Granularity in ns.
+ *
+ * Truncate a timespec to a granularity. gran must not be greater than a second.
+ * Always rounds down.
+ *
+ * This function should be only used for timestamps returned by
+ * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
+ * it doesn't handle the better resolution of the latter.
+ */
+struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+ /*
+ * Division is pretty slow so avoid it for common cases.
+ * Currently current_kernel_time() never returns better than
+ * jiffies resolution. Exploit that.
+ */
+ if (gran <= jiffies_to_usecs(1) * 1000) {
+ /* nothing */
+ } else if (gran == 1000000000) {
+ t.tv_nsec = 0;
+ } else {
+ t.tv_nsec -= t.tv_nsec % gran;
+ }
+ return t;
+}
+EXPORT_SYMBOL(timespec_trunc);
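+
+/*
+ * Example: with a one second granularity (sb->s_time_gran ==
+ * 1000000000), timespec_trunc() turns { .tv_sec = 5, .tv_nsec =
+ * 123456789 } into { 5, 0 }.  Granularities at or below the jiffy
+ * resolution are left untouched, since current_kernel_time() cannot
+ * resolve them anyway.
+ */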
+
+#ifndef CONFIG_GENERIC_TIME
+/*
+ * Simulate gettimeofday using do_gettimeofday which only allows a timeval
+ * and therefore only yields usec accuracy
+ */
+void getnstimeofday(struct timespec *tv)
+{
+ struct timeval x;
+
+ do_gettimeofday(&x);
+ tv->tv_sec = x.tv_sec;
+ tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
+}
+EXPORT_SYMBOL_GPL(getnstimeofday);
+#endif
+
+/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+ * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
+ * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
+ *
+ * [For the Julian calendar (which was used in Russia before 1917,
+ * Britain & colonies before 1752, anywhere else before 1582,
+ * and is still in use by some communities) leave out the
+ * -year/100+year/400 terms, and add 10.]
+ *
+ * This algorithm was first published by Gauss (I think).
+ *
+ * WARNING: this function will overflow on 2106-02-07 06:28:16 on
+ * machines where long is 32-bit! (However, as time_t is signed, we
+ * will already get problems at other places on 2038-01-19 03:14:08)
+ */
+unsigned long
+mktime(const unsigned int year0, const unsigned int mon0,
+ const unsigned int day, const unsigned int hour,
+ const unsigned int min, const unsigned int sec)
+{
+ unsigned int mon = mon0, year = year0;
+
+ /* 1..12 -> 11,12,1..10 */
+ if (0 >= (int) (mon -= 2)) {
+ mon += 12; /* Puts Feb last since it has leap day */
+ year -= 1;
+ }
+
+ return ((((unsigned long)
+ (year/4 - year/100 + year/400 + 367*mon/12 + day) +
+ year*365 - 719499
+ )*24 + hour /* now have hours */
+ )*60 + min /* now have minutes */
+ )*60 + sec; /* finally seconds */
+}
+
+EXPORT_SYMBOL(mktime);
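+
+/*
+ * Worked example: mktime(2009, 2, 13, 23, 31, 30) returns 1234567890,
+ * i.e. 2009-02-13 23:31:30 UTC expressed as seconds since the epoch.
+ */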
+
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts: pointer to timespec variable to be set
+ * @sec: seconds to set
+ * @nsec: nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range
+ * 0 <= tv_nsec < NSEC_PER_SEC.
+ * For negative values only the tv_sec field is negative!
+ */
+void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+EXPORT_SYMBOL(set_normalized_timespec);
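+
+/*
+ * Examples:
+ *   set_normalized_timespec(&ts, 1, 1500000000)  ->  ts = { 2, 500000000 }
+ *   set_normalized_timespec(&ts, 0, -100)        ->  ts = { -1, 999999900 }
+ */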
+
+/**
+ * ns_to_timespec - Convert nanoseconds to timespec
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timespec representation of the nsec parameter.
+ */
+struct timespec ns_to_timespec(const s64 nsec)
+{
+ struct timespec ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
+EXPORT_SYMBOL(ns_to_timespec);
+
+/**
+ * ns_to_timeval - Convert nanoseconds to timeval
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timeval representation of the nsec parameter.
+ */
+struct timeval ns_to_timeval(const s64 nsec)
+{
+ struct timespec ts = ns_to_timespec(nsec);
+ struct timeval tv;
+
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
+
+ return tv;
+}
+EXPORT_SYMBOL(ns_to_timeval);
+
+/*
+ * When we convert to jiffies we interpret incoming values
+ * the following way:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ * the input value by a factor or dividing it with a factor
+ *
+ * We must also be careful about 32-bit overflows.
+ */
+unsigned long msecs_to_jiffies(const unsigned int m)
+{
+ /*
+ * A negative value means an infinite timeout:
+ */
+ if ((int)m < 0)
+ return MAX_JIFFY_OFFSET;
+
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ /*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice
+ * round multiple of HZ, so divide by the factor between them,
+ * but round upwards:
+ */
+ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ /*
+ * HZ is larger than 1000, and HZ is a nice round multiple of
+ * 1000 - simply multiply by the factor between them.
+ *
+ * But first make sure the multiplication result cannot
+ * overflow:
+ */
+ if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return m * (HZ / MSEC_PER_SEC);
+#else
+ /*
+ * Generic case - multiply, round and divide. But first,
+ * if we are doing a net multiplication, check that
+ * we won't overflow:
+ */
+ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+ >> MSEC_TO_HZ_SHR32;
+#endif
+}
+EXPORT_SYMBOL(msecs_to_jiffies);
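+
+/*
+ * Worked example (assuming HZ == 250, i.e. 4 ms per jiffy):
+ *   msecs_to_jiffies(10) = (10 + 4 - 1) / 4 = 3   (rounded up)
+ *   jiffies_to_msecs(3)  = 3 * 4 = 12
+ */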
+
+unsigned long usecs_to_jiffies(const unsigned int u)
+{
+ if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return u * (HZ / USEC_PER_SEC);
+#else
+ return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+ >> USEC_TO_HZ_SHR32;
+#endif
+}
+EXPORT_SYMBOL(usecs_to_jiffies);
+
+/*
+ * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
+ * that a remainder subtract here would not do the right thing as the
+ * resolution values don't fall on second boundaries. I.e. the line:
+ * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
+ *
+ * Rather, we just shift the bits off the right.
+ *
+ * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
+ * value to a scaled second value.
+ */
+unsigned long
+timespec_to_jiffies(const struct timespec *value)
+{
+ unsigned long sec = value->tv_sec;
+ long nsec = value->tv_nsec + TICK_NSEC - 1;
+
+ if (sec >= MAX_SEC_IN_JIFFIES){
+ sec = MAX_SEC_IN_JIFFIES;
+ nsec = 0;
+ }
+ return (((u64)sec * SEC_CONVERSION) +
+ (((u64)nsec * NSEC_CONVERSION) >>
+ (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+
+}
+EXPORT_SYMBOL(timespec_to_jiffies);
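+
+/*
+ * Up to the scaling precision, the computation above is equivalent to
+ *
+ *   jiffies ~= tv_sec * HZ + DIV_ROUND_UP(tv_nsec, TICK_NSEC)
+ *
+ * e.g. with HZ == 1000: { .tv_sec = 1, .tv_nsec = 500000 } -> 1001.
+ */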
+
+void
+jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+{
+ /*
+ * Convert jiffies to nanoseconds and separate with
+ * one divide.
+ */
+ u32 rem;
+ value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
+ NSEC_PER_SEC, &rem);
+ value->tv_nsec = rem;
+}
+EXPORT_SYMBOL(jiffies_to_timespec);
+
+/* Same for "timeval"
+ *
+ * Well, almost. The problem here is that the real system resolution is
+ * in nanoseconds and the value being converted is in micro seconds.
+ * Also for some machines (those that use HZ = 1024, in particular),
+ * there is a LARGE error in the tick size in microseconds.
+ *
+ * The solution we use is to do the rounding AFTER we convert the
+ * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
+ * Instruction-wise, this should cost only an additional add-with-carry
+ * instruction over the way it was done above.
+ */
+unsigned long
+timeval_to_jiffies(const struct timeval *value)
+{
+ unsigned long sec = value->tv_sec;
+ long usec = value->tv_usec;
+
+ if (sec >= MAX_SEC_IN_JIFFIES){
+ sec = MAX_SEC_IN_JIFFIES;
+ usec = 0;
+ }
+ return (((u64)sec * SEC_CONVERSION) +
+ (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
+ (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+}
+EXPORT_SYMBOL(timeval_to_jiffies);
+
+void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
+{
+ /*
+ * Convert jiffies to nanoseconds and separate with
+ * one divide.
+ */
+ u32 rem;
+
+ value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
+ NSEC_PER_SEC, &rem);
+ value->tv_usec = rem / NSEC_PER_USEC;
+}
+EXPORT_SYMBOL(jiffies_to_timeval);
+
+/*
+ * Convert jiffies/jiffies_64 to clock_t and back.
+ */
+clock_t jiffies_to_clock_t(long x)
+{
+#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+# if HZ < USER_HZ
+ return x * (USER_HZ / HZ);
+# else
+ return x / (HZ / USER_HZ);
+# endif
+#else
+ return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_clock_t);
+
+unsigned long clock_t_to_jiffies(unsigned long x)
+{
+#if (HZ % USER_HZ)==0
+ if (x >= ~0UL / (HZ / USER_HZ))
+ return ~0UL;
+ return x * (HZ / USER_HZ);
+#else
+ /* Don't worry about loss of precision here .. */
+ if (x >= ~0UL / HZ * USER_HZ)
+ return ~0UL;
+
+ /* .. but do try to contain it here */
+ return div_u64((u64)x * HZ, USER_HZ);
+#endif
+}
+EXPORT_SYMBOL(clock_t_to_jiffies);
+
+u64 jiffies_64_to_clock_t(u64 x)
+{
+#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+# if HZ < USER_HZ
+ x = div_u64(x * USER_HZ, HZ);
+# elif HZ > USER_HZ
+ x = div_u64(x, HZ / USER_HZ);
+# else
+ /* Nothing to do */
+# endif
+#else
+ /*
+ * There are better ways that don't overflow early,
+ * but even this doesn't overflow in hundreds of years
+ * in 64 bits, so..
+ */
+ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
+#endif
+ return x;
+}
+EXPORT_SYMBOL(jiffies_64_to_clock_t);
+
+u64 nsec_to_clock_t(u64 x)
+{
+#if (NSEC_PER_SEC % USER_HZ) == 0
+ return div_u64(x, NSEC_PER_SEC / USER_HZ);
+#elif (USER_HZ % 512) == 0
+ return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
+#else
+ /*
+ * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
+ * overflow after 64.99 years.
+ * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+ */
+ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
+#endif
+}
+
+#if (BITS_PER_LONG < 64)
+u64 get_jiffies_64(void)
+{
+ unsigned long seq;
+ u64 ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ ret = jiffies_64;
+ } while (read_seqretry(&xtime_lock, seq));
+ return ret;
+}
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0)
+ */
+struct timespec timespec_add_safe(const struct timespec lhs,
+ const struct timespec rhs)
+{
+ struct timespec res;
+
+ set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
+ res.tv_sec = TIME_T_MAX;
+
+ return res;
+}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 0000000..95ed429
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,29 @@
+#
+# Timer subsystem related configuration options
+#
+config TICK_ONESHOT
+ bool
+
+config NO_HZ
+ bool "Tickless System (Dynamic Ticks)"
+ depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables a tickless system: timer interrupts will
+ only trigger on an as-needed basis both when the system is
+ busy and when the system is idle.
+
+config HIGH_RES_TIMERS
+ bool "High Resolution Timer Support"
+ depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables high resolution timer support. If your
+ hardware is not capable of high resolution timers, this option
+ only increases the size of the kernel image.
+
+config GENERIC_CLOCKEVENTS_BUILD
+ bool
+ default y
+ depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
+
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 0000000..905b0b5
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1,8 @@
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
+
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 0000000..f8d9680
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,250 @@
+/*
+ * linux/kernel/time/clockevents.c
+ *
+ * This file contains functions which manage clock event devices.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/sysdev.h>
+
+/* The registered clock event devices */
+static LIST_HEAD(clockevent_devices);
+static LIST_HEAD(clockevents_released);
+
+/* Notification for clock events */
+static RAW_NOTIFIER_HEAD(clockevents_chain);
+
+/* Protection for the above */
+static DEFINE_SPINLOCK(clockevents_lock);
+
+/**
+ * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bounds checked)
+ */
+unsigned long clockevent_delta2ns(unsigned long latch,
+ struct clock_event_device *evt)
+{
+ u64 clc = ((u64) latch << evt->shift);
+
+ if (unlikely(!evt->mult)) {
+ evt->mult = 1;
+ WARN_ON(1);
+ }
+
+ do_div(clc, evt->mult);
+ if (clc < 1000)
+ clc = 1000;
+ if (clc > LONG_MAX)
+ clc = LONG_MAX;
+
+ return (unsigned long) clc;
+}
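+
+/*
+ * Worked example (hypothetical 1 MHz event device with shift = 32 and
+ * mult = (1 << 32) / 1000, so that cycles = ns * mult >> shift):
+ *
+ *   clockevent_delta2ns(0xffff, evt) = (0xffff << 32) / mult
+ *                                   ~= 65535 * 1000 ns ~= 65.5 ms
+ *
+ * i.e. a 16-bit counter on such a device can be programmed at most
+ * roughly 65 ms into the future.
+ */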
+
+/**
+ * clockevents_set_mode - set the operating mode of a clock event device
+ * @dev: device to modify
+ * @mode: new mode
+ *
+ * Must be called with interrupts disabled !
+ */
+void clockevents_set_mode(struct clock_event_device *dev,
+ enum clock_event_mode mode)
+{
+ if (dev->mode != mode) {
+ dev->set_mode(mode, dev);
+ dev->mode = mode;
+ }
+}
+
+/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev: device to shutdown
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ dev->next_event.tv64 = KTIME_MAX;
+}
+
+/**
+ * clockevents_program_event - Reprogram the clock event device.
+ * @dev: device to program
+ * @expires: absolute expiry time (monotonic clock)
+ * @now: current time
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
+ */
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+ ktime_t now)
+{
+ unsigned long long clc;
+ int64_t delta;
+
+ if (unlikely(expires.tv64 < 0)) {
+ WARN_ON_ONCE(1);
+ return -ETIME;
+ }
+
+ delta = ktime_to_ns(ktime_sub(expires, now));
+
+ if (delta <= 0)
+ return -ETIME;
+
+ dev->next_event = expires;
+
+ if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ return 0;
+
+ if (delta > dev->max_delta_ns)
+ delta = dev->max_delta_ns;
+ if (delta < dev->min_delta_ns)
+ delta = dev->min_delta_ns;
+
+ clc = delta * dev->mult;
+ clc >>= dev->shift;
+
+ return dev->set_next_event((unsigned long) clc, dev);
+}
+
+/**
+ * clockevents_register_notifier - register a clock events change listener
+ */
+int clockevents_register_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ spin_lock(&clockevents_lock);
+ ret = raw_notifier_chain_register(&clockevents_chain, nb);
+ spin_unlock(&clockevents_lock);
+
+ return ret;
+}
+
+/*
+ * Notify about a clock event change. Called with clockevents_lock
+ * held.
+ */
+static void clockevents_do_notify(unsigned long reason, void *dev)
+{
+ raw_notifier_call_chain(&clockevents_chain, reason, dev);
+}
+
+/*
+ * Called after a notify add to make devices available which were
+ * released from the notifier call.
+ */
+static void clockevents_notify_released(void)
+{
+ struct clock_event_device *dev;
+
+ while (!list_empty(&clockevents_released)) {
+ dev = list_entry(clockevents_released.next,
+ struct clock_event_device, list);
+ list_del(&dev->list);
+ list_add(&dev->list, &clockevent_devices);
+ clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ }
+}
+
+/**
+ * clockevents_register_device - register a clock event device
+ * @dev: device to register
+ */
+void clockevents_register_device(struct clock_event_device *dev)
+{
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ /*
+	 * A nsec2cyc multiplier of 0 is invalid and we'd crash
+ * on it, so fix it up and emit a warning:
+ */
+ if (unlikely(!dev->mult)) {
+ dev->mult = 1;
+ WARN_ON(1);
+ }
+
+ spin_lock(&clockevents_lock);
+
+ list_add(&dev->list, &clockevent_devices);
+ clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ clockevents_notify_released();
+
+ spin_unlock(&clockevents_lock);
+}
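+
+/*
+ * Sketch (not part of the original code) of how a timer driver would
+ * typically use the interface above; the frequency, shift value and
+ * callbacks are hypothetical:
+ *
+ *	evt->features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
+ *	evt->shift	= 20;
+ *	evt->mult	= div_sc(timer_freq_hz, NSEC_PER_SEC, evt->shift);
+ *	evt->max_delta_ns = clockevent_delta2ns(0x7fffffff, evt);
+ *	evt->min_delta_ns = clockevent_delta2ns(0xf, evt);
+ *	evt->set_mode	= example_set_mode;
+ *	evt->set_next_event = example_set_next_event;
+ *	evt->cpumask	= cpumask_of_cpu(smp_processor_id());
+ *	clockevents_register_device(evt);
+ */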
+
+/*
+ * Noop handler when we shut down an event device
+ */
+void clockevents_handle_noop(struct clock_event_device *dev)
+{
+}
+
+/**
+ * clockevents_exchange_device - release and request clock devices
+ * @old: device to release (can be NULL)
+ * @new: device to request (can be NULL)
+ *
+ * Called from the notifier chain. clockevents_lock is held already
+ */
+void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /*
+ * Caller releases a clock event device. We queue it into the
+ * released list and do a notify add later.
+ */
+ if (old) {
+ clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+ list_del(&old->list);
+ list_add(&old->list, &clockevents_released);
+ }
+
+ if (new) {
+ BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+ clockevents_shutdown(new);
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+/**
+ * clockevents_notify - notification about relevant events
+ * @reason:	notification reason (CLOCK_EVT_NOTIFY_*)
+ * @arg:	reason dependent argument
+ */
+void clockevents_notify(unsigned long reason, void *arg)
+{
+ struct list_head *node, *tmp;
+
+ spin_lock(&clockevents_lock);
+ clockevents_do_notify(reason, arg);
+
+ switch (reason) {
+ case CLOCK_EVT_NOTIFY_CPU_DEAD:
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ list_for_each_safe(node, tmp, &clockevents_released)
+ list_del(node);
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&clockevents_lock);
+}
+EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 0000000..9ed2eec
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,554 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ * o Allow clocksource drivers to be unregistered
+ * o get rid of clocksource_jiffies extern
+ */
+
+#include <linux/clocksource.h>
+#include <linux/sysdev.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+
+/* XXX - Would like a better way for initializing curr_clocksource */
+extern struct clocksource clocksource_jiffies;
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ * currently selected clocksource. Initialized to clocksource_jiffies.
+ * next_clocksource:
+ * pending next selected clocksource.
+ * clocksource_list:
+ * linked list with the registered clocksources
+ * clocksource_lock:
+ *	protects manipulation of curr_clocksource, next_clocksource
+ *	and the clocksource_list
+ * override_name:
+ * Name of the user-specified clocksource.
+ */
+static struct clocksource *curr_clocksource = &clocksource_jiffies;
+static struct clocksource *next_clocksource;
+static struct clocksource *clocksource_override;
+static LIST_HEAD(clocksource_list);
+static DEFINE_SPINLOCK(clocksource_lock);
+static char override_name[32];
+static int finished_booting;
+
+/* clocksource_done_booting - Called near the end of core bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
+ */
+static int __init clocksource_done_booting(void)
+{
+ finished_booting = 1;
+ return 0;
+}
+fs_initcall(clocksource_done_booting);
+
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static LIST_HEAD(watchdog_list);
+static struct clocksource *watchdog;
+static struct timer_list watchdog_timer;
+static DEFINE_SPINLOCK(watchdog_lock);
+static cycle_t watchdog_last;
+static unsigned long watchdog_resumed;
+
+/*
+ * Interval: 0.5sec Threshold: 0.0625s
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
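+
+/*
+ * For illustration: at HZ=250 the interval is 125 jiffies (0.5 s) and
+ * the threshold is NSEC_PER_SEC >> 4 = 62500000 ns = 62.5 ms.
+ */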
+
+static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
+{
+ if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
+ return;
+
+ printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
+ cs->name, delta);
+ cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+ clocksource_change_rating(cs, 0);
+ list_del(&cs->wd_list);
+}
+
+static void clocksource_watchdog(unsigned long data)
+{
+ struct clocksource *cs, *tmp;
+ cycle_t csnow, wdnow;
+ int64_t wd_nsec, cs_nsec;
+ int resumed;
+
+ spin_lock(&watchdog_lock);
+
+ resumed = test_and_clear_bit(0, &watchdog_resumed);
+
+ wdnow = watchdog->read();
+ wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+ watchdog_last = wdnow;
+
+ list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
+ csnow = cs->read();
+
+ if (unlikely(resumed)) {
+ cs->wd_last = csnow;
+ continue;
+ }
+
+ /* Initialized ? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
+ if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+ (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ /*
+ * We just marked the clocksource as
+ * highres-capable, notify the rest of the
+ * system as well so that we transition
+ * into high-res mode:
+ */
+ tick_clock_notify();
+ }
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ cs->wd_last = csnow;
+ } else {
+ cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
+ cs->wd_last = csnow;
+ /* Check the delta. Might remove from the list ! */
+ clocksource_ratewd(cs, cs_nsec - wd_nsec);
+ }
+ }
+
+ if (!list_empty(&watchdog_list)) {
+ /*
+ * Cycle through CPUs to check if the CPUs stay
+ * synchronized to each other.
+ */
+ int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
+
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = first_cpu(cpu_online_map);
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
+ spin_unlock(&watchdog_lock);
+}
+static void clocksource_resume_watchdog(void)
+{
+ set_bit(0, &watchdog_resumed);
+}
+
+static void clocksource_check_watchdog(struct clocksource *cs)
+{
+ struct clocksource *cse;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+ int started = !list_empty(&watchdog_list);
+
+ list_add(&cs->wd_list, &watchdog_list);
+ if (!started && watchdog) {
+ watchdog_last = watchdog->read();
+ watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer,
+ first_cpu(cpu_online_map));
+ }
+ } else {
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+
+ if (!watchdog || cs->rating > watchdog->rating) {
+ if (watchdog)
+ del_timer(&watchdog_timer);
+ watchdog = cs;
+ init_timer(&watchdog_timer);
+ watchdog_timer.function = clocksource_watchdog;
+
+ /* Reset watchdog cycles */
+ list_for_each_entry(cse, &watchdog_list, wd_list)
+ cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
+ /* Start if list is not empty */
+ if (!list_empty(&watchdog_list)) {
+ watchdog_last = watchdog->read();
+ watchdog_timer.expires =
+ jiffies + WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer,
+ first_cpu(cpu_online_map));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+#else
+static void clocksource_check_watchdog(struct clocksource *cs)
+{
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+}
+
+static inline void clocksource_resume_watchdog(void) { }
+#endif
+
+/**
+ * clocksource_resume - resume the clocksource(s)
+ */
+void clocksource_resume(void)
+{
+ struct clocksource *cs;
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (cs->resume)
+ cs->resume();
+ }
+
+ clocksource_resume_watchdog();
+
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+/**
+ * clocksource_touch_watchdog - Update watchdog
+ *
+ * Update the watchdog after exception contexts such as kgdb so as not
+ * to incorrectly trip the watchdog.
+ *
+ */
+void clocksource_touch_watchdog(void)
+{
+ clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_get_next - Returns the selected clocksource
+ *
+ */
+struct clocksource *clocksource_get_next(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ if (next_clocksource && finished_booting) {
+ curr_clocksource = next_clocksource;
+ next_clocksource = NULL;
+ }
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+
+ return curr_clocksource;
+}
+
+/**
+ * select_clocksource - Selects the best registered clocksource.
+ *
+ * Private function. Must hold clocksource_lock when called.
+ *
+ * Select the clocksource with the best rating, or the clocksource
+ * which is selected by the userspace override.
+ */
+static struct clocksource *select_clocksource(void)
+{
+ struct clocksource *next;
+
+ if (list_empty(&clocksource_list))
+ return NULL;
+
+ if (clocksource_override)
+ next = clocksource_override;
+ else
+ next = list_entry(clocksource_list.next, struct clocksource,
+ list);
+
+ if (next == curr_clocksource)
+ return NULL;
+
+ return next;
+}
+
+/*
+ * Enqueue the clocksource sorted by rating
+ */
+static int clocksource_enqueue(struct clocksource *c)
+{
+ struct list_head *tmp, *entry = &clocksource_list;
+
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *cs;
+
+ cs = list_entry(tmp, struct clocksource, list);
+ if (cs == c)
+ return -EBUSY;
+		/* Keep track of the place where to insert */
+ if (cs->rating >= c->rating)
+ entry = tmp;
+ }
+ list_add(&c->list, entry);
+
+ if (strlen(c->name) == strlen(override_name) &&
+ !strcmp(c->name, override_name))
+ clocksource_override = c;
+
+ return 0;
+}
+
+/**
+ * clocksource_register - Used to install new clocksources
+ * @c:	clocksource to be registered
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ */
+int clocksource_register(struct clocksource *c)
+{
+ unsigned long flags;
+ int ret;
+
+ /* save mult_orig on registration */
+ c->mult_orig = c->mult;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ ret = clocksource_enqueue(c);
+ if (!ret)
+ next_clocksource = select_clocksource();
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+ if (!ret)
+ clocksource_check_watchdog(c);
+ return ret;
+}
+EXPORT_SYMBOL(clocksource_register);
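+
+/*
+ * Sketch (not part of the original code) of how a driver might register
+ * a clocksource with this interface; the name, counter read-out and
+ * 10 MHz rate are hypothetical:
+ *
+ *	static cycle_t example_read(void)
+ *	{
+ *		return (cycle_t) example_read_counter();
+ *	}
+ *
+ *	static struct clocksource example_cs = {
+ *		.name	= "example",
+ *		.rating	= 200,
+ *		.read	= example_read,
+ *		.mask	= CLOCKSOURCE_MASK(32),
+ *		.shift	= 20,
+ *		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
+ *	};
+ *
+ *	example_cs.mult = clocksource_hz2mult(10000000, example_cs.shift);
+ *	clocksource_register(&example_cs);
+ */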
+
+/**
+ * clocksource_change_rating - Change the rating of a registered clocksource
+ * @cs:	clocksource to be changed
+ * @rating:	new rating
+ *
+ */
+void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
+ next_clocksource = select_clocksource();
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+/**
+ * clocksource_unregister - remove a registered clocksource
+ */
+void clocksource_unregister(struct clocksource *cs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ list_del(&cs->list);
+ if (clocksource_override == cs)
+ clocksource_override = NULL;
+ next_clocksource = select_clocksource();
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+#ifdef CONFIG_SYSFS
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev: unused
+ * @buf:	char buffer to be filled with the current clocksource name
+ *
+ * Provides sysfs interface for showing the current clocksource.
+ */
+static ssize_t
+sysfs_show_current_clocksources(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
+{
+ ssize_t count = 0;
+
+ spin_lock_irq(&clocksource_lock);
+ count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+ spin_unlock_irq(&clocksource_lock);
+
+ return count;
+}
+
+/**
+ * sysfs_override_clocksource - interface for manually overriding clocksource
+ * @dev: unused
+ * @buf: name of override clocksource
+ * @count: length of buffer
+ *
+ * Takes input from sysfs interface for manually overriding the default
+ * clocksource selection.
+ */
+static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct clocksource *ovr = NULL;
+ size_t ret = count;
+ int len;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (count >= sizeof(override_name))
+ return -EINVAL;
+
+	/* strip off the trailing \n: */
+ if (buf[count-1] == '\n')
+ count--;
+
+ spin_lock_irq(&clocksource_lock);
+
+ if (count > 0)
+ memcpy(override_name, buf, count);
+ override_name[count] = 0;
+
+ len = strlen(override_name);
+ if (len) {
+ struct clocksource *cs;
+
+ ovr = clocksource_override;
+ /* try to select it: */
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (strlen(cs->name) == len &&
+ !strcmp(cs->name, override_name))
+ ovr = cs;
+ }
+ }
+
+	/* Reselect when the override name has changed */
+ if (ovr != clocksource_override) {
+ clocksource_override = ovr;
+ next_clocksource = select_clocksource();
+ }
+
+ spin_unlock_irq(&clocksource_lock);
+
+ return ret;
+}
+
+/**
+ * sysfs_show_available_clocksources - sysfs interface for listing clocksources
+ * @dev: unused
+ * @buf: char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing registered clocksources
+ */
+static ssize_t
+sysfs_show_available_clocksources(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ char *buf)
+{
+ struct clocksource *src;
+ ssize_t count = 0;
+
+ spin_lock_irq(&clocksource_lock);
+ list_for_each_entry(src, &clocksource_list, list) {
+ count += snprintf(buf + count,
+ max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
+ "%s ", src->name);
+ }
+ spin_unlock_irq(&clocksource_lock);
+
+ count += snprintf(buf + count,
+ max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
+
+ return count;
+}
+
+/*
+ * Sysfs setup bits:
+ */
+static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+ sysfs_override_clocksource);
+
+static SYSDEV_ATTR(available_clocksource, 0444,
+ sysfs_show_available_clocksources, NULL);
+
+static struct sysdev_class clocksource_sysclass = {
+ .name = "clocksource",
+};
+
+static struct sys_device device_clocksource = {
+ .id = 0,
+ .cls = &clocksource_sysclass,
+};
+
+static int __init init_clocksource_sysfs(void)
+{
+ int error = sysdev_class_register(&clocksource_sysclass);
+
+ if (!error)
+ error = sysdev_register(&device_clocksource);
+ if (!error)
+ error = sysdev_create_file(
+ &device_clocksource,
+ &attr_current_clocksource);
+ if (!error)
+ error = sysdev_create_file(
+ &device_clocksource,
+ &attr_available_clocksource);
+ return error;
+}
+
+device_initcall(init_clocksource_sysfs);
+#endif /* CONFIG_SYSFS */
+
+/**
+ * boot_override_clocksource - boot clock override
+ * @str: override name
+ *
+ * Takes a clocksource= boot argument and uses it
+ * as the clocksource override name.
+ */
+static int __init boot_override_clocksource(char* str)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&clocksource_lock, flags);
+ if (str)
+ strlcpy(override_name, str, sizeof(override_name));
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+ return 1;
+}
+
+__setup("clocksource=", boot_override_clocksource);
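+
+/*
+ * Example usage: booting with "clocksource=acpi_pm" (or the name of any
+ * other registered clocksource, e.g. "jiffies") stores the name in
+ * override_name; clocksource_enqueue() and sysfs_override_clocksource()
+ * then prefer the matching clocksource over the rating based selection.
+ */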
+
+/**
+ * boot_override_clock - Compatibility layer for deprecated boot option
+ * @str: override name
+ *
+ * DEPRECATED! Takes a clock= boot argument and uses it
+ * as the clocksource override name
+ */
+static int __init boot_override_clock(char* str)
+{
+ if (!strcmp(str, "pmtmr")) {
+ printk("Warning: clock=pmtmr is deprecated. "
+ "Use clocksource=acpi_pm.\n");
+ return boot_override_clocksource("acpi_pm");
+ }
+ printk("Warning! clock= boot option is deprecated. "
+ "Use clocksource=xyz\n");
+ return boot_override_clocksource(str);
+}
+
+__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 0000000..1ca9955
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
+/***********************************************************************
+* linux/kernel/time/jiffies.c
+*
+* This file contains the jiffies based clocksource.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+************************************************************************/
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+/* The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability of the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
+#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+
+/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the numerator and denominator by the same
+ * amount, giving NTP adjustments in units of 1/2^8.
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. NSEC_PER_JIFFY grows as
+ * HZ shrinks, so values greater than 8 overflow 32 bits when
+ * HZ=100.
+ */
+#define JIFFIES_SHIFT 8
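+
+/*
+ * Worked example (illustrative): with HZ=100, ACTHZ is roughly 100 << 8,
+ * so NSEC_PER_JIFFY is about 10000000 ns. Shifted left by JIFFIES_SHIFT
+ * that is ~2.56e9, which still fits the 32-bit mult below, while a shift
+ * of 9 (~5.12e9) would already overflow.
+ */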
+
+static cycle_t jiffies_read(void)
+{
+ return (cycle_t) jiffies;
+}
+
+struct clocksource clocksource_jiffies = {
+ .name = "jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .read = jiffies_read,
+ .mask = 0xffffffff, /*32bits*/
+ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
+ .shift = JIFFIES_SHIFT,
+};
+
+static int __init init_jiffies_clocksource(void)
+{
+ return clocksource_register(&clocksource_jiffies);
+}
+
+core_initcall(init_jiffies_clocksource);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 0000000..8ff15e5
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,453 @@
+/*
+ * linux/kernel/time/ntp.c
+ *
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/hrtimer.h>
+#include <linux/capability.h>
+#include <linux/math64.h>
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <asm/timex.h>
+
+/*
+ * Timekeeping variables
+ */
+unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
+unsigned long tick_nsec; /* ACTHZ period (nsec) */
+u64 tick_length;
+static u64 tick_length_base;
+
+static struct hrtimer leap_timer;
+
+#define MAX_TICKADJ 500 /* microsecs */
+#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
+ NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+static int time_state = TIME_OK; /* clock synchronization status */
+int time_status = STA_UNSYNC; /* clock status bits */
+static long time_tai; /* TAI offset (s) */
+static s64 time_offset; /* time adjustment (ns) */
+static long time_constant = 2; /* pll time constant */
+long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
+static s64 time_freq; /* frequency offset (scaled ns/s)*/
+static long time_reftime; /* time at last adjustment (s) */
+long time_adjust;
+static long ntp_tick_adj;
+
+static void ntp_update_frequency(void)
+{
+ u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+ << NTP_SCALE_SHIFT;
+ second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
+ second_length += time_freq;
+
+ tick_length_base = second_length;
+
+ tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
+ tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
+}
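+
+/*
+ * Illustrative numbers (ignoring ntp_tick_adj and time_freq): with
+ * USER_HZ=100, tick_usec is 10000, so second_length is
+ * 10000 * 1000 * 100 = 1e9 ns (kept left-shifted by NTP_SCALE_SHIFT).
+ * tick_nsec then becomes ~1e9/HZ and tick_length_base the scaled
+ * per-tick share 1e9/NTP_INTERVAL_FREQ.
+ */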
+
+static void ntp_update_offset(long offset)
+{
+ long mtemp;
+ s64 freq_adj;
+
+ if (!(time_status & STA_PLL))
+ return;
+
+ if (!(time_status & STA_NANO))
+ offset *= NSEC_PER_USEC;
+
+ /*
+ * Scale the phase adjustment and
+ * clamp to the operating range.
+ */
+ offset = min(offset, MAXPHASE);
+ offset = max(offset, -MAXPHASE);
+
+ /*
+ * Select how the frequency is to be controlled
+ * and in which mode (PLL or FLL).
+ */
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = xtime.tv_sec;
+ mtemp = xtime.tv_sec - time_reftime;
+ time_reftime = xtime.tv_sec;
+
+ freq_adj = (s64)offset * mtemp;
+ freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
+ time_status &= ~STA_MODE;
+ if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+ freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
+ mtemp);
+ time_status |= STA_MODE;
+ }
+ freq_adj += time_freq;
+ freq_adj = min(freq_adj, MAXFREQ_SCALED);
+ time_freq = max(freq_adj, -MAXFREQ_SCALED);
+
+ time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
+}
+
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+void ntp_clear(void)
+{
+ time_adjust = 0; /* stop active adjtime() */
+ time_status |= STA_UNSYNC;
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_esterror = NTP_PHASE_LIMIT;
+
+ ntp_update_frequency();
+
+ tick_length = tick_length_base;
+ time_offset = 0;
+}
+
+/*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
+static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+{
+ enum hrtimer_restart res = HRTIMER_NORESTART;
+
+ write_seqlock_irq(&xtime_lock);
+
+ switch (time_state) {
+ case TIME_OK:
+ break;
+ case TIME_INS:
+ xtime.tv_sec--;
+ wall_to_monotonic.tv_sec++;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE "Clock: "
+ "inserting leap second 23:59:60 UTC\n");
+ hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
+ res = HRTIMER_RESTART;
+ break;
+ case TIME_DEL:
+ xtime.tv_sec++;
+ time_tai--;
+ wall_to_monotonic.tv_sec--;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE "Clock: "
+ "deleting leap second 23:59:59 UTC\n");
+ break;
+ case TIME_OOP:
+ time_tai++;
+ time_state = TIME_WAIT;
+ /* fall through */
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ break;
+ }
+ update_vsyscall(&xtime, clock);
+
+ write_sequnlock_irq(&xtime_lock);
+
+ return res;
+}
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+ s64 time_adj;
+
+ /* Bump the maxerror field */
+ time_maxerror += MAXFREQ / NSEC_PER_USEC;
+ if (time_maxerror > NTP_PHASE_LIMIT) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /*
+ * Compute the phase adjustment for the next second. The offset is
+ * reduced by a fixed factor times the time constant.
+ */
+ tick_length = tick_length_base;
+ time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
+ time_offset -= time_adj;
+ tick_length += time_adj;
+
+ if (unlikely(time_adjust)) {
+ if (time_adjust > MAX_TICKADJ) {
+ time_adjust -= MAX_TICKADJ;
+ tick_length += MAX_TICKADJ_SCALED;
+ } else if (time_adjust < -MAX_TICKADJ) {
+ time_adjust += MAX_TICKADJ;
+ tick_length -= MAX_TICKADJ_SCALED;
+ } else {
+ tick_length += (s64)(time_adjust * NSEC_PER_USEC /
+ NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
+ time_adjust = 0;
+ }
+ }
+}
+
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+
+/* Disable the cmos update - used by virtualization and embedded */
+int no_sync_cmos_clock __read_mostly;
+
+static void sync_cmos_clock(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
+
+static void sync_cmos_clock(struct work_struct *work)
+{
+ struct timespec now, next;
+ int fail = 1;
+
+ /*
+ * If we have an externally synchronized Linux clock, then update
+ * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+ * called as close as possible to 500 ms before the new second starts.
+ * This code is run on a timer. If the clock is set, that timer
+ * may not expire at the correct time. Thus, we adjust...
+ */
+ if (!ntp_synced())
+ /*
+ * Not synced, exit, do not restart a timer (if one is
+ * running, let it run out).
+ */
+ return;
+
+ getnstimeofday(&now);
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
+ fail = update_persistent_clock(now);
+
+ next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
+ if (next.tv_nsec <= 0)
+ next.tv_nsec += NSEC_PER_SEC;
+
+ if (!fail)
+ next.tv_sec = 659;
+ else
+ next.tv_sec = 0;
+
+ if (next.tv_nsec >= NSEC_PER_SEC) {
+ next.tv_sec++;
+ next.tv_nsec -= NSEC_PER_SEC;
+ }
+ schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+}
+
+static void notify_cmos_timer(void)
+{
+ if (!no_sync_cmos_clock)
+ schedule_delayed_work(&sync_cmos_work, 0);
+}
+
+#else
+static inline void notify_cmos_timer(void) { }
+#endif
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. Used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+ struct timespec ts;
+ int result;
+
+ /* Validate the data before disabling interrupts */
+ if (txc->modes & ADJ_ADJTIME) {
+ /* singleshot must not be used with any other mode bits */
+ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
+ return -EINVAL;
+ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+ !capable(CAP_SYS_TIME))
+ return -EPERM;
+ } else {
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /* if the quartz is off by more than 10% something is VERY wrong! */
+ if (txc->modes & ADJ_TICK &&
+ (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ))
+ return -EINVAL;
+
+ if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+ hrtimer_cancel(&leap_timer);
+ }
+
+ getnstimeofday(&ts);
+
+ write_seqlock_irq(&xtime_lock);
+
+ /* If there are input parameters, then process them */
+ if (txc->modes & ADJ_ADJTIME) {
+ long save_adjust = time_adjust;
+
+ if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ ntp_update_frequency();
+ }
+ txc->offset = save_adjust;
+ goto adj_done;
+ }
+ if (txc->modes) {
+ long sec;
+
+ if (txc->modes & ADJ_STATUS) {
+ if ((time_status & STA_PLL) &&
+ !(txc->status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+ }
+ /* only set allowed bits */
+ time_status &= STA_RONLY;
+ time_status |= txc->status & ~STA_RONLY;
+
+ switch (time_state) {
+ case TIME_OK:
+ start_timer:
+ sec = ts.tv_sec;
+ if (time_status & STA_INS) {
+ time_state = TIME_INS;
+ sec += 86400 - sec % 86400;
+ hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
+ } else if (time_status & STA_DEL) {
+ time_state = TIME_DEL;
+ sec += 86400 - (sec + 1) % 86400;
+ hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
+ }
+ break;
+ case TIME_INS:
+ case TIME_DEL:
+ time_state = TIME_OK;
+ goto start_timer;
+ break;
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ break;
+ case TIME_OOP:
+ hrtimer_restart(&leap_timer);
+ break;
+ }
+ }
+
+ if (txc->modes & ADJ_NANO)
+ time_status |= STA_NANO;
+ if (txc->modes & ADJ_MICRO)
+ time_status &= ~STA_NANO;
+
+ if (txc->modes & ADJ_FREQUENCY) {
+ time_freq = (s64)txc->freq * PPM_SCALE;
+ time_freq = min(time_freq, MAXFREQ_SCALED);
+ time_freq = max(time_freq, -MAXFREQ_SCALED);
+ }
+
+ if (txc->modes & ADJ_MAXERROR)
+ time_maxerror = txc->maxerror;
+ if (txc->modes & ADJ_ESTERROR)
+ time_esterror = txc->esterror;
+
+ if (txc->modes & ADJ_TIMECONST) {
+ time_constant = txc->constant;
+ if (!(time_status & STA_NANO))
+ time_constant += 4;
+ time_constant = min(time_constant, (long)MAXTC);
+ time_constant = max(time_constant, 0l);
+ }
+
+ if (txc->modes & ADJ_TAI && txc->constant > 0)
+ time_tai = txc->constant;
+
+ if (txc->modes & ADJ_OFFSET)
+ ntp_update_offset(txc->offset);
+ if (txc->modes & ADJ_TICK)
+ tick_usec = txc->tick;
+
+ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+ ntp_update_frequency();
+ }
+
+ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+ NTP_SCALE_SHIFT);
+ if (!(time_status & STA_NANO))
+ txc->offset /= NSEC_PER_USEC;
+
+adj_done:
+ result = time_state; /* mostly `TIME_OK' */
+ if (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ result = TIME_ERROR;
+
+ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+ (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
+ txc->maxerror = time_maxerror;
+ txc->esterror = time_esterror;
+ txc->status = time_status;
+ txc->constant = time_constant;
+ txc->precision = 1;
+ txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
+ txc->tick = tick_usec;
+ txc->tai = time_tai;
+
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
+ write_sequnlock_irq(&xtime_lock);
+
+ txc->time.tv_sec = ts.tv_sec;
+ txc->time.tv_usec = ts.tv_nsec;
+ if (!(time_status & STA_NANO))
+ txc->time.tv_usec /= NSEC_PER_USEC;
+
+ notify_cmos_timer();
+
+ return result;
+}
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+ ntp_tick_adj = simple_strtol(str, NULL, 0);
+ return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
+
+void __init ntp_init(void)
+{
+ ntp_clear();
+ hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ leap_timer.function = ntp_leap_second;
+}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 0000000..f98a1b7
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,601 @@
+/*
+ * linux/kernel/time/tick-broadcast.c
+ *
+ * This file contains functions which emulate a local clock-event
+ * device via a broadcast event source.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licensed under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include "tick-internal.h"
+
+/*
+ * Broadcast support for broken x86 hardware, where the local apic
+ * timer stops in C3 state.
+ */
+
+struct tick_device tick_broadcast_device;
+static cpumask_t tick_broadcast_mask;
+static DEFINE_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_force;
+
+#ifdef CONFIG_TICK_ONESHOT
+static void tick_broadcast_clear_oneshot(int cpu);
+#else
+static inline void tick_broadcast_clear_oneshot(int cpu) { }
+#endif
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_broadcast_device(void)
+{
+ return &tick_broadcast_device;
+}
+
+cpumask_t *tick_get_broadcast_mask(void)
+{
+ return &tick_broadcast_mask;
+}
+
+/*
+ * Start the device in periodic mode
+ */
+static void tick_broadcast_start_periodic(struct clock_event_device *bc)
+{
+ if (bc)
+ tick_setup_periodic(bc, 1);
+}
+
+/*
+ * Check if the device can be utilized as the broadcast device:
+ */
+int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+ if ((tick_broadcast_device.evtdev &&
+ tick_broadcast_device.evtdev->rating >= dev->rating) ||
+ (dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 0;
+
+ clockevents_exchange_device(NULL, dev);
+ tick_broadcast_device.evtdev = dev;
+ if (!cpus_empty(tick_broadcast_mask))
+ tick_broadcast_start_periodic(dev);
+ return 1;
+}
+
+/*
+ * Check if the device is the broadcast device
+ */
+int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+ return (dev && tick_broadcast_device.evtdev == dev);
+}
+
+/*
+ * Check if the device is dysfunctional and merely a placeholder which
+ * needs to be handled by the broadcast device.
+ */
+int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Devices might be registered with both periodic and oneshot
+	 * mode disabled. This signals that the device needs to be
+ * operated from the broadcast device and is a placeholder for
+ * the cpu local device.
+ */
+ if (!tick_device_is_functional(dev)) {
+ dev->event_handler = tick_handle_periodic;
+ cpu_set(cpu, tick_broadcast_mask);
+ tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
+ ret = 1;
+ } else {
+ /*
+ * When the new device is not affected by the stop
+ * feature and the cpu is marked in the broadcast mask
+ * then clear the broadcast bit.
+ */
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
+ int cpu = smp_processor_id();
+
+ cpu_clear(cpu, tick_broadcast_mask);
+ tick_broadcast_clear_oneshot(cpu);
+ }
+ }
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ return ret;
+}
+
+/*
+ * Broadcast the event to the cpus, which are set in the mask
+ */
+static void tick_do_broadcast(cpumask_t mask)
+{
+ int cpu = smp_processor_id();
+ struct tick_device *td;
+
+ /*
+	 * Check if the current cpu is in the mask
+ */
+ if (cpu_isset(cpu, mask)) {
+ cpu_clear(cpu, mask);
+ td = &per_cpu(tick_cpu_device, cpu);
+ td->evtdev->event_handler(td->evtdev);
+ }
+
+ if (!cpus_empty(mask)) {
+ /*
+ * It might be necessary to actually check whether the devices
+		 * have different broadcast functions. For now, just use
+		 * that of the first device. This works as long as we have this
+ * misfeature only on x86 (lapic)
+ */
+ cpu = first_cpu(mask);
+ td = &per_cpu(tick_cpu_device, cpu);
+ td->evtdev->broadcast(mask);
+ }
+}
+
+/*
+ * Periodic broadcast:
+ * - invoke the broadcast handlers
+ */
+static void tick_do_periodic_broadcast(void)
+{
+ cpumask_t mask;
+
+ spin_lock(&tick_broadcast_lock);
+
+ cpus_and(mask, cpu_online_map, tick_broadcast_mask);
+ tick_do_broadcast(mask);
+
+ spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Event handler for periodic broadcast ticks
+ */
+static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
+{
+ ktime_t next;
+
+ tick_do_periodic_broadcast();
+
+ /*
+ * The device is in periodic mode. No reprogramming necessary:
+ */
+ if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ return;
+
+ /*
+	 * Set up the next period for devices which do not have
+	 * periodic mode. We read dev->next_event first and add to it
+	 * when the event already expired. clockevents_program_event()
+ * sets dev->next_event only when the event is really
+ * programmed to the device.
+ */
+ for (next = dev->next_event; ;) {
+ next = ktime_add(next, tick_period);
+
+ if (!clockevents_program_event(dev, next, ktime_get()))
+ return;
+ tick_do_periodic_broadcast();
+ }
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state where
+ * affected devices might stop
+ */
+static void tick_do_broadcast_on_off(void *why)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ unsigned long flags, *reason = why;
+ int cpu, bc_stopped;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ cpu = smp_processor_id();
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+ bc = tick_broadcast_device.evtdev;
+
+ /*
+ * Is the device not affected by the powerstate ?
+ */
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ goto out;
+
+ if (!tick_device_is_functional(dev))
+ goto out;
+
+ bc_stopped = cpus_empty(tick_broadcast_mask);
+
+ switch (*reason) {
+ case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+ case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ if (!cpu_isset(cpu, tick_broadcast_mask)) {
+ cpu_set(cpu, tick_broadcast_mask);
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ clockevents_shutdown(dev);
+ }
+ if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
+ tick_broadcast_force = 1;
+ break;
+ case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+ if (!tick_broadcast_force &&
+ cpu_isset(cpu, tick_broadcast_mask)) {
+ cpu_clear(cpu, tick_broadcast_mask);
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(dev, 0);
+ }
+ break;
+ }
+
+ if (cpus_empty(tick_broadcast_mask)) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
+out:
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state where
+ * affected devices might stop.
+ */
+void tick_broadcast_on_off(unsigned long reason, int *oncpu)
+{
+ if (!cpu_isset(*oncpu, cpu_online_map))
+ printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
+ "offline CPU #%d\n", *oncpu);
+ else
+ smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
+ &reason, 1);
+}
+
+/*
+ * Set the periodic handler depending on broadcast on/off
+ */
+void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ if (!broadcast)
+ dev->event_handler = tick_handle_periodic;
+ else
+ dev->event_handler = tick_handle_periodic_broadcast;
+}
+
+/*
+ * Remove a CPU from broadcasting
+ */
+void tick_shutdown_broadcast(unsigned int *cpup)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+ unsigned int cpu = *cpup;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+ cpu_clear(cpu, tick_broadcast_mask);
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+ if (bc && cpus_empty(tick_broadcast_mask))
+ clockevents_shutdown(bc);
+ }
+
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+void tick_suspend_broadcast(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+ if (bc)
+ clockevents_shutdown(bc);
+
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+int tick_resume_broadcast(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+ int broadcast = 0;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc) {
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+
+ switch (tick_broadcast_device.mode) {
+ case TICKDEV_MODE_PERIODIC:
+			if (!cpus_empty(tick_broadcast_mask))
+ tick_broadcast_start_periodic(bc);
+ broadcast = cpu_isset(smp_processor_id(),
+ tick_broadcast_mask);
+ break;
+ case TICKDEV_MODE_ONESHOT:
+ broadcast = tick_resume_broadcast_oneshot(bc);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+
+ return broadcast;
+}
+
+
+#ifdef CONFIG_TICK_ONESHOT
+
+static cpumask_t tick_broadcast_oneshot_mask;
+
+/*
+ * Debugging: see timer_list.c
+ */
+cpumask_t *tick_get_broadcast_oneshot_mask(void)
+{
+ return &tick_broadcast_oneshot_mask;
+}
+
+static int tick_broadcast_set_event(ktime_t expires, int force)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ return tick_dev_program_event(bc, expires, force);
+}
+
+int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ return 0;
+}
+
+/*
+ * Called from irq_enter() when idle was interrupted to reenable the
+ * per cpu device.
+ */
+void tick_check_oneshot_broadcast(int cpu)
+{
+ if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+
+ clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+ }
+}
+
+/*
+ * Handle oneshot mode broadcasting
+ */
+static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
+{
+ struct tick_device *td;
+ cpumask_t mask;
+ ktime_t now, next_event;
+ int cpu;
+
+ spin_lock(&tick_broadcast_lock);
+again:
+ dev->next_event.tv64 = KTIME_MAX;
+ next_event.tv64 = KTIME_MAX;
+ mask = CPU_MASK_NONE;
+ now = ktime_get();
+ /* Find all expired events */
+ for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev->next_event.tv64 <= now.tv64)
+ cpu_set(cpu, mask);
+ else if (td->evtdev->next_event.tv64 < next_event.tv64)
+ next_event.tv64 = td->evtdev->next_event.tv64;
+ }
+
+ /*
+ * Wakeup the cpus which have an expired event.
+ */
+ tick_do_broadcast(mask);
+
+ /*
+ * Two reasons for reprogram:
+ *
+ * - The global event did not expire any CPU local
+ * events. This happens in dyntick mode, as the maximum PIT
+ * delta is quite small.
+ *
+ * - There are pending events on sleeping CPUs which were not
+ * in the event mask
+ */
+ if (next_event.tv64 != KTIME_MAX) {
+ /*
+ * Rearm the broadcast device. If event expired,
+ * repeat the above
+ */
+ if (tick_broadcast_set_event(next_event, 0))
+ goto again;
+ }
+ spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state where
+ * affected devices might stop
+ */
+void tick_broadcast_oneshot_control(unsigned long reason)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ unsigned long flags;
+ int cpu;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Periodic mode does not care about the enter/exit of power
+ * states
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ goto out;
+
+ bc = tick_broadcast_device.evtdev;
+ cpu = smp_processor_id();
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ goto out;
+
+ if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+ cpu_set(cpu, tick_broadcast_oneshot_mask);
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ tick_broadcast_set_event(dev->next_event, 1);
+ }
+ } else {
+ if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+ cpu_clear(cpu, tick_broadcast_oneshot_mask);
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ if (dev->next_event.tv64 != KTIME_MAX)
+ tick_program_event(dev->next_event, 1);
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Reset the one shot broadcast for a cpu
+ *
+ * Called with tick_broadcast_lock held
+ */
+static void tick_broadcast_clear_oneshot(int cpu)
+{
+ cpu_clear(cpu, tick_broadcast_oneshot_mask);
+}
+
+static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+{
+ struct tick_device *td;
+ int cpu;
+
+ for_each_cpu_mask_nr(cpu, *mask) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev)
+ td->evtdev->next_event = expires;
+ }
+}
+
+/**
+ * tick_broadcast_setup_oneshot - setup the broadcast device
+ */
+void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ /* Set it up only once ! */
+ if (bc->event_handler != tick_handle_oneshot_broadcast) {
+ int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+ int cpu = smp_processor_id();
+ cpumask_t mask;
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+
+ /* Take the do_timer update */
+ tick_do_timer_cpu = cpu;
+
+ /*
+ * We must be careful here. There might be other CPUs
+ * waiting for periodic broadcast. We need to set the
+ * oneshot_mask bits for those and program the
+ * broadcast device to fire.
+ */
+ mask = tick_broadcast_mask;
+ cpu_clear(cpu, mask);
+ cpus_or(tick_broadcast_oneshot_mask,
+ tick_broadcast_oneshot_mask, mask);
+
+ if (was_periodic && !cpus_empty(mask)) {
+ tick_broadcast_init_next_event(&mask, tick_next_period);
+ tick_broadcast_set_event(tick_next_period, 1);
+ } else
+ bc->next_event.tv64 = KTIME_MAX;
+ }
+}
+
+/*
+ * Select oneshot operating mode for the broadcast device
+ */
+void tick_broadcast_switch_to_oneshot(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
+ bc = tick_broadcast_device.evtdev;
+ if (bc)
+ tick_broadcast_setup_oneshot(bc);
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+
+/*
+ * Remove a dead CPU from broadcasting
+ */
+void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+{
+ unsigned long flags;
+ unsigned int cpu = *cpup;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Clear the broadcast mask flag for the dead cpu, but do not
+ * stop the broadcast device!
+ */
+ cpu_clear(cpu, tick_broadcast_oneshot_mask);
+
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Check whether the broadcast device is in oneshot mode
+ */
+int tick_broadcast_oneshot_active(void)
+{
+ return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
+
+#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 0000000..df12434
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,394 @@
+/*
+ * linux/kernel/time/tick-common.c
+ *
+ * This file contains the base functions to manage periodic tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licensed under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include <asm/irq_regs.h>
+
+#include "tick-internal.h"
+
+/*
+ * Tick devices
+ */
+DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
+/*
+ * Tick next event: keeps track of the tick time
+ */
+ktime_t tick_next_period;
+ktime_t tick_period;
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+DEFINE_SPINLOCK(tick_device_lock);
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_device(int cpu)
+{
+ return &per_cpu(tick_cpu_device, cpu);
+}
+
+/**
+ * tick_is_oneshot_available - check for a oneshot capable event device
+ */
+int tick_is_oneshot_available(void)
+{
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+
+ return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
+}
+
+/*
+ * Periodic tick
+ */
+static void tick_periodic(int cpu)
+{
+ if (tick_do_timer_cpu == cpu) {
+ write_seqlock(&xtime_lock);
+
+ /* Keep track of the next tick event */
+ tick_next_period = ktime_add(tick_next_period, tick_period);
+
+ do_timer(1);
+ write_sequnlock(&xtime_lock);
+ }
+
+ update_process_times(user_mode(get_irq_regs()));
+ profile_tick(CPU_PROFILING);
+}
+
+/*
+ * Event handler for periodic ticks
+ */
+void tick_handle_periodic(struct clock_event_device *dev)
+{
+ int cpu = smp_processor_id();
+ ktime_t next;
+
+ tick_periodic(cpu);
+
+ if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ return;
+ /*
+	 * Set up the next period for devices which do not have
+	 * periodic mode:
+ */
+ next = ktime_add(dev->next_event, tick_period);
+ for (;;) {
+ if (!clockevents_program_event(dev, next, ktime_get()))
+ return;
+ tick_periodic(cpu);
+ next = ktime_add(next, tick_period);
+ }
+}
+
+/*
+ * Setup the device for a periodic tick
+ */
+void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
+{
+ tick_set_periodic_handler(dev, broadcast);
+
+ /* Broadcast setup ? */
+ if (!tick_device_is_functional(dev))
+ return;
+
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !tick_broadcast_oneshot_active()) {
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ } else {
+ unsigned long seq;
+ ktime_t next;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ next = tick_next_period;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+
+ for (;;) {
+ if (!clockevents_program_event(dev, next, ktime_get()))
+ return;
+ next = ktime_add(next, tick_period);
+ }
+ }
+}
+
+/*
+ * Setup the tick device
+ */
+static void tick_setup_device(struct tick_device *td,
+ struct clock_event_device *newdev, int cpu,
+ const cpumask_t *cpumask)
+{
+ ktime_t next_event;
+ void (*handler)(struct clock_event_device *) = NULL;
+
+ /*
+ * First device setup ?
+ */
+ if (!td->evtdev) {
+ /*
+ * If no cpu took the do_timer update, assign it to
+ * this cpu:
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ tick_do_timer_cpu = cpu;
+ tick_next_period = ktime_get();
+ tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
+ }
+
+ /*
+ * Startup in periodic mode first.
+ */
+ td->mode = TICKDEV_MODE_PERIODIC;
+ } else {
+ handler = td->evtdev->event_handler;
+ next_event = td->evtdev->next_event;
+ td->evtdev->event_handler = clockevents_handle_noop;
+ }
+
+ td->evtdev = newdev;
+
+ /*
+ * When the device is not per cpu, pin the interrupt to the
+ * current cpu:
+ */
+ if (!cpus_equal(newdev->cpumask, *cpumask))
+ irq_set_affinity(newdev->irq, *cpumask);
+
+ /*
+ * When global broadcasting is active, check if the current
+ * device is registered as a placeholder for broadcast mode.
+ * This allows us to handle this x86 misfeature in a generic
+ * way.
+ */
+ if (tick_device_uses_broadcast(newdev, cpu))
+ return;
+
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(newdev, 0);
+ else
+ tick_setup_oneshot(newdev, handler, next_event);
+}
+
+/*
+ * Check if the newly registered device should be used.
+ */
+static int tick_check_new_device(struct clock_event_device *newdev)
+{
+ struct clock_event_device *curdev;
+ struct tick_device *td;
+ int cpu, ret = NOTIFY_OK;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_device_lock, flags);
+
+ cpu = smp_processor_id();
+ if (!cpu_isset(cpu, newdev->cpumask))
+ goto out_bc;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ curdev = td->evtdev;
+
+ /* cpu local device ? */
+ if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
+
+ /*
+		 * If the cpu affinity of the device interrupt cannot
+ * be set, ignore it.
+ */
+ if (!irq_can_set_affinity(newdev->irq))
+ goto out_bc;
+
+ /*
+ * If we have a cpu local device already, do not replace it
+ * by a non cpu local device
+ */
+ if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
+ goto out_bc;
+ }
+
+ /*
+ * If we have an active device, then check the rating and the oneshot
+ * feature.
+ */
+ if (curdev) {
+ /*
+ * Prefer one shot capable devices !
+ */
+ if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ goto out_bc;
+ /*
+ * Check the rating
+ */
+ if (curdev->rating >= newdev->rating)
+ goto out_bc;
+ }
+
+ /*
+	 * Replace the possibly existing device with the new
+	 * device. If the current device is the broadcast device, do
+	 * not give it back to the clockevents layer!
+ */
+ if (tick_is_broadcast_device(curdev)) {
+ clockevents_shutdown(curdev);
+ curdev = NULL;
+ }
+ clockevents_exchange_device(curdev, newdev);
+ tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
+ if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_oneshot_notify();
+
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+ return NOTIFY_STOP;
+
+out_bc:
+ /*
+ * Can the new device be used as a broadcast device ?
+ */
+ if (tick_check_broadcast_device(newdev))
+ ret = NOTIFY_STOP;
+
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Shutdown an event device on a given cpu:
+ *
+ * This is called on a live CPU when another CPU is dead, so we cannot
+ * access the hardware device itself.
+ * We just set the mode and remove it from the lists.
+ */
+static void tick_shutdown(unsigned int *cpup)
+{
+ struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+ struct clock_event_device *dev = td->evtdev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_device_lock, flags);
+ td->mode = TICKDEV_MODE_PERIODIC;
+ if (dev) {
+ /*
+		 * Prevent the clock events layer from trying to call
+		 * the set mode function!
+ */
+ dev->mode = CLOCK_EVT_MODE_UNUSED;
+ clockevents_exchange_device(dev, NULL);
+ td->evtdev = NULL;
+ }
+ /* Transfer the do_timer job away from this cpu */
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = first_cpu(cpu_online_map);
+
+ tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+ TICK_DO_TIMER_NONE;
+ }
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+static void tick_suspend(void)
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_device_lock, flags);
+ clockevents_shutdown(td->evtdev);
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+static void tick_resume(void)
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ unsigned long flags;
+ int broadcast = tick_resume_broadcast();
+
+ spin_lock_irqsave(&tick_device_lock, flags);
+ clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+
+ if (!broadcast) {
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(td->evtdev, 0);
+ else
+ tick_resume_oneshot();
+ }
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+/*
+ * Notification about clock event devices
+ */
+static int tick_notify(struct notifier_block *nb, unsigned long reason,
+ void *dev)
+{
+ switch (reason) {
+
+ case CLOCK_EVT_NOTIFY_ADD:
+ return tick_check_new_device(dev);
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+ case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+ case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ tick_broadcast_on_off(reason, dev);
+ break;
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
+ case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
+ tick_broadcast_oneshot_control(reason);
+ break;
+
+ case CLOCK_EVT_NOTIFY_CPU_DEAD:
+ tick_shutdown_broadcast_oneshot(dev);
+ tick_shutdown_broadcast(dev);
+ tick_shutdown(dev);
+ break;
+
+ case CLOCK_EVT_NOTIFY_SUSPEND:
+ tick_suspend();
+ tick_suspend_broadcast();
+ break;
+
+ case CLOCK_EVT_NOTIFY_RESUME:
+ tick_resume();
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block tick_notifier = {
+ .notifier_call = tick_notify,
+};
+
+/**
+ * tick_init - initialize the tick control
+ *
+ * Register the notifier with the clockevents framework
+ */
+void __init tick_init(void)
+{
+ clockevents_register_notifier(&tick_notifier);
+}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 0000000..b1c05bf
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,135 @@
+/*
+ * tick internal variable and functions used by low/high res code
+ */
+
+#define TICK_DO_TIMER_NONE -1
+#define TICK_DO_TIMER_BOOT -2
+
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern spinlock_t tick_device_lock;
+extern ktime_t tick_next_period;
+extern ktime_t tick_period;
+extern int tick_do_timer_cpu __read_mostly;
+
+extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
+extern void tick_handle_periodic(struct clock_event_device *dev);
+
+extern void clockevents_shutdown(struct clock_event_device *dev);
+
+/*
+ * NO_HZ / high resolution timer shared code
+ */
+#ifdef CONFIG_TICK_ONESHOT
+extern void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt);
+extern int tick_dev_program_event(struct clock_event_device *dev,
+ ktime_t expires, int force);
+extern int tick_program_event(ktime_t expires, int force);
+extern void tick_oneshot_notify(void);
+extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
+extern void tick_resume_oneshot(void);
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern void tick_broadcast_switch_to_oneshot(void);
+extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
+extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern int tick_broadcast_oneshot_active(void);
+extern void tick_check_oneshot_broadcast(int cpu);
+# else /* BROADCAST */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ BUG();
+}
+static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline void tick_broadcast_switch_to_oneshot(void) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline void tick_check_oneshot_broadcast(int cpu) { }
+# endif /* !BROADCAST */
+
+#else /* !ONESHOT */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt)
+{
+ BUG();
+}
+static inline void tick_resume_oneshot(void)
+{
+ BUG();
+}
+static inline int tick_program_event(ktime_t expires, int force)
+{
+ return 0;
+}
+static inline void tick_oneshot_notify(void) { }
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ BUG();
+}
+static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+ return 0;
+}
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
+#endif /* !TICK_ONESHOT */
+
+/*
+ * Broadcasting support
+ */
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern int tick_check_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
+extern void tick_shutdown_broadcast(unsigned int *cpup);
+extern void tick_suspend_broadcast(void);
+extern int tick_resume_broadcast(void);
+
+extern void
+tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+
+#else /* !BROADCAST */
+
+static inline int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+ return 0;
+}
+
+static inline int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+ return 0;
+}
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
+ int cpu)
+{
+ return 0;
+}
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
+static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline int tick_resume_broadcast(void) { return 0; }
+
+/*
+ * Set the periodic handler in non-broadcast mode
+ */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev,
+ int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+#endif /* !BROADCAST */
+
+/*
+ * Check whether the device is functional or a dummy for broadcast
+ */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
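tick-internal.h follows the usual kernel convention of pairing each conditional facility with static inline stubs (or BUG() traps for calls that must never occur) so that callers need no #ifdef of their own. The same pattern in isolation, using a made-up CONFIG_MY_FEATURE purely for illustration:

    #ifdef CONFIG_MY_FEATURE
    extern void my_feature_kick(int cpu);
    #else
    static inline void my_feature_kick(int cpu) { }
    #endif

With this in place, code such as tick_check_idle() above can call the helper unconditionally and the compiler discards the call when the feature is configured out.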
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 0000000..2e8de67
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,141 @@
+/*
+ * linux/kernel/time/tick-oneshot.c
+ *
+ * This file contains functions which manage high resolution tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include "tick-internal.h"
+
+/**
+ * tick_program_event internal worker function
+ */
+int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
+ int force)
+{
+ ktime_t now = ktime_get();
+ int i;
+
+ for (i = 0;;) {
+ int ret = clockevents_program_event(dev, expires, now);
+
+ if (!ret || !force)
+ return ret;
+
+ /*
+ * We tried 2 times to program the device with the given
+ * min_delta_ns. If that does not work, we increase it and
+ * emit a warning.
+ */
+ if (++i > 2) {
+ /* Increase the min. delta and try again */
+ if (!dev->min_delta_ns)
+ dev->min_delta_ns = 5000;
+ else
+ dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+ printk(KERN_WARNING
+ "CE: %s increasing min_delta_ns to %lu nsec\n",
+ dev->name ? dev->name : "?",
+ dev->min_delta_ns << 1);
+
+ i = 0;
+ }
+
+ now = ktime_get();
+ expires = ktime_add_ns(now, dev->min_delta_ns);
+ }
+}
+
+/**
+ * tick_program_event
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+
+ return tick_dev_program_event(dev, expires, force);
+}
+
+/**
+ * tick_resume_oneshot - resume oneshot mode
+ */
+void tick_resume_oneshot(void)
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct clock_event_device *dev = td->evtdev;
+
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ tick_program_event(ktime_get(), 1);
+}
+
+/**
+ * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ */
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t next_event)
+{
+ newdev->event_handler = handler;
+ clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ tick_dev_program_event(newdev, next_event, 1);
+}
+
+/**
+ * tick_switch_to_oneshot - switch to oneshot mode
+ */
+int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct clock_event_device *dev = td->evtdev;
+
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
+ !tick_device_is_functional(dev)) {
+
+ printk(KERN_INFO "Clockevents: "
+ "could not switch to one-shot mode:");
+ if (!dev) {
+ printk(" no tick device\n");
+ } else {
+ if (!tick_device_is_functional(dev))
+ printk(" %s is not functional.\n", dev->name);
+ else
+ printk(" %s does not support one-shot mode.\n",
+ dev->name);
+ }
+ return -EINVAL;
+ }
+
+ td->mode = TICKDEV_MODE_ONESHOT;
+ dev->event_handler = handler;
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ tick_broadcast_switch_to_oneshot();
+ return 0;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * tick_init_highres - switch to high resolution mode
+ *
+ * Called with interrupts disabled.
+ */
+int tick_init_highres(void)
+{
+ return tick_switch_to_oneshot(hrtimer_interrupt);
+}
+#endif
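The retry loop in tick_dev_program_event() above starts min_delta_ns at 5000 ns and then grows it by 50% after every third consecutive programming failure. A standalone user-space sketch (illustrative only, not kernel code) of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
            unsigned long min_delta_ns = 0;
            int round;

            for (round = 1; round <= 5; round++) {
                    if (!min_delta_ns)
                            min_delta_ns = 5000;
                    else
                            min_delta_ns += min_delta_ns >> 1;
                    printf("after failure round %d: %lu ns\n",
                           round, min_delta_ns);
            }
            return 0;       /* prints 5000, 7500, 11250, 16875, 25312 */
    }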
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 0000000..342fc9c
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,770 @@
+/*
+ * linux/kernel/time/tick-sched.c
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
+ *
+ * No idle tick implementation for low and high resolution timers
+ *
+ * Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * Distribute under GPLv2.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/module.h>
+
+#include <asm/irq_regs.h>
+
+#include "tick-internal.h"
+
+/*
+ * Per cpu nohz control structure
+ */
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+
+/*
+ * The time, when the last jiffy update happened. Protected by xtime_lock.
+ */
+static ktime_t last_jiffies_update;
+
+struct tick_sched *tick_get_tick_sched(int cpu)
+{
+ return &per_cpu(tick_cpu_sched, cpu);
+}
+
+/*
+ * Must be called with interrupts disabled!
+ */
+static void tick_do_update_jiffies64(ktime_t now)
+{
+ unsigned long ticks = 0;
+ ktime_t delta;
+
+ /*
+ * Do a quick check without holding xtime_lock:
+ */
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 < tick_period.tv64)
+ return;
+
+ /* Reevaluate with xtime_lock held */
+ write_seqlock(&xtime_lock);
+
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 >= tick_period.tv64) {
+
+ delta = ktime_sub(delta, tick_period);
+ last_jiffies_update = ktime_add(last_jiffies_update,
+ tick_period);
+
+ /* Slow path for long timeouts */
+ if (unlikely(delta.tv64 >= tick_period.tv64)) {
+ s64 incr = ktime_to_ns(tick_period);
+
+ ticks = ktime_divns(delta, incr);
+
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
+ }
+ do_timer(++ticks);
+
+ /* Keep the tick_next_period variable up to date */
+ tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ }
+ write_sequnlock(&xtime_lock);
+}
+
+/*
+ * Initialize and return the time of the last jiffies update.
+ */
+static ktime_t tick_init_jiffy_update(void)
+{
+ ktime_t period;
+
+ write_seqlock(&xtime_lock);
+ /* Did we start the jiffies update yet? */
+ if (last_jiffies_update.tv64 == 0)
+ last_jiffies_update = tick_next_period;
+ period = last_jiffies_update;
+ write_sequnlock(&xtime_lock);
+ return period;
+}
+
+/*
+ * NOHZ - aka dynamic tick functionality
+ */
+#ifdef CONFIG_NO_HZ
+/*
+ * NO HZ enabled ?
+ */
+static int tick_nohz_enabled __read_mostly = 1;
+
+/*
+ * Enable / Disable tickless mode
+ */
+static int __init setup_tick_nohz(char *str)
+{
+ if (!strcmp(str, "off"))
+ tick_nohz_enabled = 0;
+ else if (!strcmp(str, "on"))
+ tick_nohz_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("nohz=", setup_tick_nohz);
+
+/**
+ * tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ *
+ * Called from interrupt entry when the CPU was idle
+ *
+ * In case the sched_tick was stopped on this CPU, we have to check if jiffies
+ * must be updated. Otherwise an interrupt handler could use a stale jiffy
+ * value. We do this unconditionally on any CPU, as we don't know whether the
+ * CPU which has the jiffies update assigned is in a long sleep.
+ */
+void tick_nohz_update_jiffies(void)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long flags;
+ ktime_t now;
+
+ if (!ts->tick_stopped)
+ return;
+
+ cpu_clear(cpu, nohz_cpu_mask);
+ now = ktime_get();
+ ts->idle_waketime = now;
+
+ local_irq_save(flags);
+ tick_do_update_jiffies64(now);
+ local_irq_restore(flags);
+
+ touch_softlockup_watchdog();
+}
+
+static void tick_nohz_stop_idle(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (ts->idle_active) {
+ ktime_t now, delta;
+ now = ktime_get();
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_lastupdate = now;
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ ts->idle_active = 0;
+
+ sched_clock_idle_wakeup_event(0);
+ }
+}
+
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+{
+ ktime_t now, delta;
+
+ now = ktime_get();
+ if (ts->idle_active) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_lastupdate = now;
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ }
+ ts->idle_entrytime = now;
+ ts->idle_active = 1;
+ sched_clock_idle_sleep_event();
+ return now;
+}
+
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (!tick_nohz_enabled)
+ return -1;
+
+ if (ts->idle_active)
+ *last_update_time = ktime_to_us(ts->idle_lastupdate);
+ else
+ *last_update_time = ktime_to_us(ktime_get());
+
+ return ktime_to_us(ts->idle_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
+/**
+ * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick.
+ * Called either from the idle loop or from irq_exit() when an idle period was
+ * just interrupted by an interrupt which did not cause a reschedule.
+ */
+void tick_nohz_stop_sched_tick(int inidle)
+{
+ unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+ struct tick_sched *ts;
+ ktime_t last_update, expires, now;
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ int cpu;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ ts = &per_cpu(tick_cpu_sched, cpu);
+ now = tick_nohz_start_idle(ts);
+
+ /*
+ * If this cpu is offline and it is the one which updates
+ * jiffies, then give up the assignment and let it be taken by
+ * the CPU which runs the tick timer next. If we don't drop
+ * this here, jiffies might go stale and do_timer() might never
+ * be invoked.
+ */
+ if (unlikely(!cpu_online(cpu))) {
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ }
+
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ goto end;
+
+ if (!inidle && !ts->inidle)
+ goto end;
+
+ ts->inidle = 1;
+
+ if (need_resched())
+ goto end;
+
+ if (unlikely(local_softirq_pending())) {
+ static int ratelimit;
+
+ if (ratelimit < 10) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ local_softirq_pending());
+ ratelimit++;
+ }
+ goto end;
+ }
+
+ ts->idle_calls++;
+ /* Read jiffies and the time when jiffies were updated last */
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ last_update = last_jiffies_update;
+ last_jiffies = jiffies;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ /* Get the next timer wheel timer */
+ next_jiffies = get_next_timer_interrupt(last_jiffies);
+ delta_jiffies = next_jiffies - last_jiffies;
+
+ if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
+ delta_jiffies = 1;
+ /*
+ * Do not stop the tick if we are only one jiffy off,
+ * or if this CPU is required for RCU.
+ */
+ if (!ts->tick_stopped && delta_jiffies == 1)
+ goto out;
+
+ /* Schedule the tick if we are at least one jiffy off */
+ if ((long)delta_jiffies >= 1) {
+
+ if (delta_jiffies > 1)
+ cpu_set(cpu, nohz_cpu_mask);
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ if (select_nohz_load_balancer(1)) {
+ /*
+ * sched tick not stopped!
+ */
+ cpu_clear(cpu, nohz_cpu_mask);
+ goto out;
+ }
+
+ ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->tick_stopped = 1;
+ ts->idle_jiffies = last_jiffies;
+ rcu_enter_nohz();
+ }
+
+ /*
+ * If this CPU is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * CPU which runs the tick timer next, which might be
+ * this CPU as well. If we don't drop this here,
+ * jiffies might go stale and do_timer() might never
+ * be invoked.
+ */
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+
+ ts->idle_sleeps++;
+
+ /*
+ * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+ * there is no timer pending or at least extremely far
+ * into the future (12 days for HZ=1000). In this case
+ * we simply stop the tick timer:
+ */
+ if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
+ ts->idle_expires.tv64 = KTIME_MAX;
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_cancel(&ts->sched_timer);
+ goto out;
+ }
+
+ /*
+ * calculate the expiry time for the next timer wheel
+ * timer
+ */
+ expires = ktime_add_ns(last_update, tick_period.tv64 *
+ delta_jiffies);
+ ts->idle_expires = expires;
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer, expires,
+ HRTIMER_MODE_ABS);
+ /* Check if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ goto out;
+ } else if (!tick_program_event(expires, 0))
+ goto out;
+ /*
+ * We are past the event already, so we crossed a
+ * jiffy boundary. Update jiffies and raise the
+ * softirq.
+ */
+ tick_do_update_jiffies64(ktime_get());
+ cpu_clear(cpu, nohz_cpu_mask);
+ }
+ raise_softirq_irqoff(TIMER_SOFTIRQ);
+out:
+ ts->next_jiffies = next_jiffies;
+ ts->last_jiffies = last_jiffies;
+ ts->sleep_length = ktime_sub(dev->next_event, now);
+end:
+ local_irq_restore(flags);
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the length of the current sleep
+ *
+ * Called from power state control code with interrupts disabled
+ */
+ktime_t tick_nohz_get_sleep_length(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ return ts->sleep_length;
+}
+
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
+
+ while (1) {
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS);
+ /* Check if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ } else {
+ if (!tick_program_event(
+ hrtimer_get_expires(&ts->sched_timer), 0))
+ break;
+ }
+ /* Update jiffies and reread time */
+ tick_do_update_jiffies64(now);
+ now = ktime_get();
+ }
+}
+
+/**
+ * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
+ *
+ * Restart the idle tick when the CPU is woken up from idle
+ */
+void tick_nohz_restart_sched_tick(void)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long ticks;
+ ktime_t now;
+
+ local_irq_disable();
+ tick_nohz_stop_idle(cpu);
+
+ if (!ts->inidle || !ts->tick_stopped) {
+ ts->inidle = 0;
+ local_irq_enable();
+ return;
+ }
+
+ ts->inidle = 0;
+
+ rcu_exit_nohz();
+
+ /* Update jiffies first */
+ select_nohz_load_balancer(0);
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ cpu_clear(cpu, nohz_cpu_mask);
+
+ /*
+ * We stopped the tick in idle. update_process_times() would miss the
+ * time we slept, as it only does single-tick accounting.
+ * Enforce that this time is accounted to idle!
+ */
+ ticks = jiffies - ts->idle_jiffies;
+ /*
+ * We might be one tick off. Do not randomly account a huge number of ticks!
+ */
+ if (ticks && ticks < LONG_MAX) {
+ add_preempt_count(HARDIRQ_OFFSET);
+ account_system_time(current, HARDIRQ_OFFSET,
+ jiffies_to_cputime(ticks));
+ sub_preempt_count(HARDIRQ_OFFSET);
+ }
+
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
+
+ local_irq_enable();
+}
+
+static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
+}
+
+/*
+ * The nohz low res interrupt handler
+ */
+static void tick_nohz_handler(struct clock_event_device *dev)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct pt_regs *regs = get_irq_regs();
+ int cpu = smp_processor_id();
+ ktime_t now = ktime_get();
+
+ dev->next_event.tv64 = KTIME_MAX;
+
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two CPUs happen to assign themselves to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ tick_do_timer_cpu = cpu;
+
+ /* Check if jiffies needs an update */
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
+
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on completely idle SMP systems while
+ * waiting on the login prompt. We also increment the "start
+ * of idle" jiffy stamp so the idle accounting adjustment we
+ * do when we go busy again does not account too many ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ ts->idle_jiffies++;
+ }
+
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+
+ while (tick_nohz_reprogram(ts, now)) {
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ }
+}
+
+/**
+ * tick_nohz_switch_to_nohz - switch to nohz mode
+ */
+static void tick_nohz_switch_to_nohz(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t next;
+
+ if (!tick_nohz_enabled)
+ return;
+
+ local_irq_disable();
+ if (tick_switch_to_oneshot(tick_nohz_handler)) {
+ local_irq_enable();
+ return;
+ }
+
+ ts->nohz_mode = NOHZ_MODE_LOWRES;
+
+ /*
+ * Recycle the hrtimer in ts, so we can share the
+ * hrtimer_forward with the highres code.
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ /* Get the next period */
+ next = tick_init_jiffy_update();
+
+ for (;;) {
+ hrtimer_set_expires(&ts->sched_timer, next);
+ if (!tick_program_event(next, 0))
+ break;
+ next = ktime_add(next, tick_period);
+ }
+ local_irq_enable();
+
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
+ smp_processor_id());
+}
+
+/*
+ * When NOHZ is enabled and the tick is stopped, we need to kick the
+ * tick timer from irq_enter() so that the jiffies update is kept
+ * alive during long running softirqs. That's ugly as hell, but
+ * correctness is key even if we need to fix the offending softirq in
+ * the first place.
+ *
+ * Note, this is different from tick_nohz_restart. We just kick the
+ * timer and do not touch the other magic bits which need to be done
+ * when idle is left.
+ */
+static void tick_nohz_kick_tick(int cpu)
+{
+#if 0
+ /* Switch back to 2.6.27 behaviour */
+
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t delta, now;
+
+ if (!ts->tick_stopped)
+ return;
+
+ /*
+ * Do not touch the tick device when the next expiry is either
+ * already reached or less than or equal to the tick period away.
+ */
+ now = ktime_get();
+ delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
+ if (delta.tv64 <= tick_period.tv64)
+ return;
+
+ tick_nohz_restart(ts, now);
+#endif
+}
+
+#else
+
+static inline void tick_nohz_switch_to_nohz(void) { }
+
+#endif /* NO_HZ */
+
+/*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ */
+void tick_check_idle(int cpu)
+{
+ tick_check_oneshot_broadcast(cpu);
+#ifdef CONFIG_NO_HZ
+ tick_nohz_stop_idle(cpu);
+ tick_nohz_update_jiffies();
+ tick_nohz_kick_tick(cpu);
+#endif
+}
+
+/*
+ * High resolution timer specific code
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+/*
+ * We rearm the timer until we get disabled by the idle code.
+ * Called with interrupts disabled and timer->base->cpu_base->lock held.
+ */
+static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+{
+ struct tick_sched *ts =
+ container_of(timer, struct tick_sched, sched_timer);
+ struct pt_regs *regs = get_irq_regs();
+ ktime_t now = ktime_get();
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two CPUs happen to assign themselves to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ tick_do_timer_cpu = cpu;
+#endif
+
+ /* Check if jiffies needs an update */
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
+
+ /*
+ * Do not call update_process_times() or profile_tick() when
+ * we are not in IRQ context and have no valid regs pointer.
+ */
+ if (regs) {
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on completely idle SMP systems while
+ * waiting on the login prompt. We also increment the "start of
+ * idle" jiffy stamp so the idle accounting adjustment we do
+ * when we go busy again does not account too much ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ ts->idle_jiffies++;
+ }
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+ }
+
+ hrtimer_forward(timer, now, tick_period);
+
+ return HRTIMER_RESTART;
+}
+
+/**
+ * tick_setup_sched_timer - setup the tick emulation timer
+ */
+void tick_setup_sched_timer(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t now = ktime_get();
+ u64 offset;
+
+ /*
+ * Emulate tick processing via per-CPU hrtimers:
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ts->sched_timer.function = tick_sched_timer;
+ ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
+
+ /* Get the next period (per cpu) */
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+ offset = ktime_to_ns(tick_period) >> 1;
+ do_div(offset, num_possible_cpus());
+ offset *= smp_processor_id();
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+
+ for (;;) {
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+ /* Check if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ now = ktime_get();
+ }
+
+#ifdef CONFIG_NO_HZ
+ if (tick_nohz_enabled)
+ ts->nohz_mode = NOHZ_MODE_HIGHRES;
+#endif
+}
+#endif /* HIGH_RES_TIMERS */
+
+#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+void tick_cancel_sched_timer(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+# ifdef CONFIG_HIGH_RES_TIMERS
+ if (ts->sched_timer.base)
+ hrtimer_cancel(&ts->sched_timer);
+# endif
+
+ ts->nohz_mode = NOHZ_MODE_INACTIVE;
+}
+#endif
+
+/**
+ * Async notification about clocksource changes
+ */
+void tick_clock_notify(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
+}
+
+/*
+ * Async notification about clock event changes
+ */
+void tick_oneshot_notify(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ set_bit(0, &ts->check_clocks);
+}
+
+/**
+ * Check whether a change happened which makes oneshot mode possible.
+ *
+ * Called cyclically from the hrtimer softirq (driven by the timer
+ * softirq). allow_nohz signals that we can switch into low-res NOHZ
+ * mode, because high resolution timers are disabled (either at
+ * compile time or at runtime).
+ */
+int tick_check_oneshot_change(int allow_nohz)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (!test_and_clear_bit(0, &ts->check_clocks))
+ return 0;
+
+ if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
+ return 0;
+
+ if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
+ return 0;
+
+ if (!allow_nohz)
+ return 1;
+
+ tick_nohz_switch_to_nohz();
+ return 0;
+}
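Several routines above, for example tick_do_update_jiffies64() and tick_nohz_stop_sched_tick(), sample jiffies state with the seqlock reader protocol on xtime_lock: read the values, then retry if a writer interfered. A minimal sketch of that reader loop, mirroring the snapshot taken in tick_nohz_stop_sched_tick():

    static void snapshot_jiffies_state(unsigned long *last, ktime_t *last_update)
    {
            unsigned long seq;

            do {
                    seq = read_seqbegin(&xtime_lock);
                    *last = jiffies;
                    *last_update = last_jiffies_update;
            } while (read_seqretry(&xtime_lock, seq));
    }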
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 0000000..900f1b6
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,607 @@
+/*
+ * linux/kernel/time/timekeeping.c
+ *
+ * Kernel timekeeping code and accessor functions
+ *
+ * This code was moved from linux/kernel/timer.c.
+ * Please see that file for copyright and history logs.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sysdev.h>
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/tick.h>
+
+
+/*
+ * This read-write spinlock protects us from races in SMP while
+ * playing with xtime and avenrun.
+ */
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+
+
+/*
+ * The current time
+ * wall_to_monotonic is what we need to add to xtime (or xtime corrected
+ * for sub-jiffy times) to get to monotonic time. Monotonic is pegged
+ * at zero at system boot time, so wall_to_monotonic will be negative,
+ * however, we will ALWAYS keep the tv_nsec part positive so we can use
+ * the usual normalization.
+ *
+ * wall_to_monotonic is moved after resume from suspend for the monotonic
+ * time not to jump. We need to add total_sleep_time to wall_to_monotonic
+ * to get the real boot based time offset.
+ *
+ * - wall_to_monotonic is no longer the boot time, getboottime must be
+ * used instead.
+ */
+struct timespec xtime __attribute__ ((aligned (16)));
+struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+static unsigned long total_sleep_time; /* seconds */
+
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
+static struct timespec xtime_cache __attribute__ ((aligned (16)));
+void update_xtime_cache(u64 nsec)
+{
+ xtime_cache = xtime;
+ timespec_add_ns(&xtime_cache, nsec);
+}
+
+struct clocksource *clock;
+
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * clocksource_forward_now - update clock to the current time
+ *
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
+ */
+static void clocksource_forward_now(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 nsec;
+
+ cycle_now = clocksource_read(clock);
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ clock->cycle_last = cycle_now;
+
+ nsec = cyc2ns(clock, cycle_delta);
+ timespec_add_ns(&xtime, nsec);
+
+ nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+ clock->raw_time.tv_nsec += nsec;
+}
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ cycle_t cycle_now, cycle_delta;
+ unsigned long seq;
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts = xtime;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ nsecs = cyc2ns(clock, cycle_delta);
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv: pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using getnstimeofday()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+ struct timespec now;
+
+ getnstimeofday(&now);
+ tv->tv_sec = now.tv_sec;
+ tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv: pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time, updates NTP and notifies hrtimers
+ */
+int do_settimeofday(struct timespec *tv)
+{
+ struct timespec ts_delta;
+ unsigned long flags;
+
+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ clocksource_forward_now();
+
+ ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
+ ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
+ wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
+
+ xtime = *tv;
+
+ update_xtime_cache(0);
+
+ clock->error = 0;
+ ntp_clear();
+
+ update_vsyscall(&xtime, clock);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static void change_clocksource(void)
+{
+ struct clocksource *new;
+
+ new = clocksource_get_next();
+
+ if (clock == new)
+ return;
+
+ clocksource_forward_now();
+
+ new->raw_time = clock->raw_time;
+
+ clock = new;
+ clock->cycle_last = 0;
+ clock->cycle_last = clocksource_read(new);
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+ tick_clock_notify();
+
+ /*
+ * We're holding xtime lock and waking up klogd would deadlock
+ * us on enqueue. So no printing!
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
+ */
+}
+#else
+static inline void clocksource_forward_now(void) { }
+static inline void change_clocksource(void) { }
+#endif
+
+/**
+ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the raw monotonic time (completely unmodified by NTP)
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+ unsigned long seq;
+ s64 nsecs;
+ cycle_t cycle_now, cycle_delta;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+
+ *ts = clock->raw_time;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+
+
+/**
+ * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
+ */
+int timekeeping_valid_for_hres(void)
+{
+ unsigned long seq;
+ int ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return ret;
+}
+
+/**
+ * read_persistent_clock - Return time in seconds from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Returns seconds from epoch using the battery backed persistent clock.
+ * Returns zero if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+unsigned long __attribute__((weak)) read_persistent_clock(void)
+{
+ return 0;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+ unsigned long flags;
+ unsigned long sec = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ ntp_init();
+
+ clock = clocksource_get_next();
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ clock->cycle_last = clocksource_read(clock);
+
+ xtime.tv_sec = sec;
+ xtime.tv_nsec = 0;
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+ update_xtime_cache(0);
+ total_sleep_time = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+/* time in seconds when suspend began */
+static unsigned long timekeeping_suspend_time;
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ * @dev: unused
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static int timekeeping_resume(struct sys_device *dev)
+{
+ unsigned long flags;
+ unsigned long now = read_persistent_clock();
+
+ clocksource_resume();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ if (now && (now > timekeeping_suspend_time)) {
+ unsigned long sleep_length = now - timekeeping_suspend_time;
+
+ xtime.tv_sec += sleep_length;
+ wall_to_monotonic.tv_sec -= sleep_length;
+ total_sleep_time += sleep_length;
+ }
+ update_xtime_cache(0);
+ /* re-base the last cycle value */
+ clock->cycle_last = 0;
+ clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ timekeeping_suspended = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ touch_softlockup_watchdog();
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+
+ /* Resume hrtimers */
+ hres_timers_resume();
+
+ return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+
+ timekeeping_suspend_time = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ clocksource_forward_now();
+ timekeeping_suspended = 1;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+
+ return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct sysdev_class timekeeping_sysclass = {
+ .name = "timekeeping",
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timekeeping_sysclass,
+};
+
+static int __init timekeeping_init_device(void)
+{
+ int error = sysdev_class_register(&timekeeping_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(timekeeping_init_device);
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adjusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+static void clocksource_adjust(s64 offset)
+{
+ s64 error, interval = clock->cycle_interval;
+ int adj;
+
+ error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+ if (error > interval) {
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else
+ return;
+
+ clock->mult += adj;
+ clock->xtime_interval += interval;
+ clock->xtime_nsec -= offset;
+ clock->error -= (interval - offset) <<
+ (NTP_SCALE_SHIFT - clock->shift);
+}
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Called from the timer interrupt, must hold a write on xtime_lock.
+ */
+void update_wall_time(void)
+{
+ cycle_t offset;
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ return;
+
+#ifdef CONFIG_GENERIC_TIME
+ offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+#else
+ offset = clock->cycle_interval;
+#endif
+ clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
+
+ /* Normally this loop will run just once; however, in the
+ * case of lost or late ticks, it will accumulate correctly.
+ */
+ while (offset >= clock->cycle_interval) {
+ /* accumulate one interval */
+ offset -= clock->cycle_interval;
+ clock->cycle_last += clock->cycle_interval;
+
+ clock->xtime_nsec += clock->xtime_interval;
+ if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+ clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ clock->raw_time.tv_nsec += clock->raw_interval;
+ if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+ clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+ clock->raw_time.tv_sec++;
+ }
+
+ /* accumulate error between NTP and clock interval */
+ clock->error += tick_length;
+ clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+ }
+
+ /* correct the clock when NTP error is too big */
+ clocksource_adjust(offset);
+
+ /*
+ * Since in the loop above, we accumulate any amount of time
+ * in xtime_nsec over a second into xtime.tv_sec, it's possible for
+ * xtime_nsec to be fairly small after the loop. Further, if we're
+ * slightly speeding the clocksource up in clocksource_adjust(),
+ * it's possible the required corrective factor to xtime_nsec could
+ * cause it to underflow.
+ *
+ * Now, we cannot simply roll the accumulated second back, since
+ * the NTP subsystem has been notified via second_overflow. So
+ * instead we push xtime_nsec forward by the amount we underflowed,
+ * and add that amount into the error.
+ *
+ * We'll correct this error next time through this function, when
+ * xtime_nsec is not as small.
+ */
+ if (unlikely((s64)clock->xtime_nsec < 0)) {
+ s64 neg = -(s64)clock->xtime_nsec;
+ clock->xtime_nsec = 0;
+ clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+ }
+
+ /* store full nanoseconds into xtime after rounding it up and
+ * add the remainder to the error difference.
+ */
+ xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
+ clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+ clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
+
+ update_xtime_cache(cyc2ns(clock, offset));
+
+ /* check to see if there is a new clocksource to use */
+ change_clocksource();
+ update_vsyscall(&xtime, clock);
+}
+
+/**
+ * getboottime - Return the real time of system boot.
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the real time of system boot in a timespec.
+ *
+ * This is based on the wall_to_monotonic offset and the total suspend
+ * time. Calls to settimeofday will affect the value returned (which
+ * basically means that however wrong your real time clock is at boot time,
+ * you get the right time here).
+ */
+void getboottime(struct timespec *ts)
+{
+ set_normalized_timespec(ts,
+ - (wall_to_monotonic.tv_sec + total_sleep_time),
+ - wall_to_monotonic.tv_nsec);
+}
+
+/**
+ * monotonic_to_bootbased - Convert the monotonic time to boot based.
+ * @ts: pointer to the timespec to be converted
+ */
+void monotonic_to_bootbased(struct timespec *ts)
+{
+ ts->tv_sec += total_sleep_time;
+}
+
+unsigned long get_seconds(void)
+{
+ return xtime_cache.tv_sec;
+}
+EXPORT_SYMBOL(get_seconds);
+
+
+struct timespec current_kernel_time(void)
+{
+ struct timespec now;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ now = xtime_cache;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return now;
+}
+EXPORT_SYMBOL(current_kernel_time);
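The conversions in clocksource_forward_now(), getnstimeofday() and getrawmonotonic() all reduce to ns = (cycle_delta * mult) >> shift. A worked example with hypothetical numbers: a 1 MHz clocksource with shift = 10 would use mult = 1000 << 10 = 1024000, so a delta of 5 cycles converts to (5 * 1024000) >> 10 = 5000 ns, i.e. 1000 ns per cycle:

    static inline s64 example_cyc2ns(u64 cycle_delta)
    {
            const u32 mult = 1000 << 10;    /* hypothetical 1 MHz clocksource */
            const u32 shift = 10;

            return ((s64)cycle_delta * mult) >> shift;      /* 5 -> 5000 ns */
    }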
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 0000000..a999b92
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,294 @@
+/*
+ * kernel/time/timer_list.c
+ *
+ * List pending timers
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/tick.h>
+
+#include <asm/uaccess.h>
+
+typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+/*
+ * This allows printing both to /proc/timer_list and
+ * to the console (on SysRq-Q):
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+static void print_name_offset(struct seq_file *m, void *sym)
+{
+ char symname[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name((unsigned long)sym, symname) < 0)
+ SEQ_printf(m, "<%p>", sym);
+ else
+ SEQ_printf(m, "%s", symname);
+}
+
+static void
+print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
+ int idx, u64 now)
+{
+#ifdef CONFIG_TIMER_STATS
+ char tmp[TASK_COMM_LEN + 1];
+#endif
+ SEQ_printf(m, " #%d: ", idx);
+ print_name_offset(m, taddr);
+ SEQ_printf(m, ", ");
+ print_name_offset(m, timer->function);
+ SEQ_printf(m, ", S:%02lx", timer->state);
+#ifdef CONFIG_TIMER_STATS
+ SEQ_printf(m, ", ");
+ print_name_offset(m, timer->start_site);
+ memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
+ tmp[TASK_COMM_LEN] = 0;
+ SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
+#endif
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+ (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
+ (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+ (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
+ (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
+}
+
+static void
+print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
+ u64 now)
+{
+ struct hrtimer *timer, tmp;
+ unsigned long next = 0, i;
+ struct rb_node *curr;
+ unsigned long flags;
+
+next_one:
+ i = 0;
+ spin_lock_irqsave(&base->cpu_base->lock, flags);
+
+ curr = base->first;
+ /*
+ * Crude but we have to do this O(N*N) thing, because
+ * we have to unlock the base when printing:
+ */
+ while (curr && i < next) {
+ curr = rb_next(curr);
+ i++;
+ }
+
+ if (curr) {
+
+ timer = rb_entry(curr, struct hrtimer, node);
+ tmp = *timer;
+ spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+
+ print_timer(m, timer, &tmp, i, now);
+ next++;
+ goto next_one;
+ }
+ spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+}
+
+static void
+print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
+{
+ SEQ_printf(m, " .base: %p\n", base);
+ SEQ_printf(m, " .index: %d\n",
+ base->index);
+ SEQ_printf(m, " .resolution: %Lu nsecs\n",
+ (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .get_time: ");
+ print_name_offset(m, base->get_time);
+ SEQ_printf(m, "\n");
+#ifdef CONFIG_HIGH_RES_TIMERS
+ SEQ_printf(m, " .offset: %Lu nsecs\n",
+ (unsigned long long) ktime_to_ns(base->offset));
+#endif
+ SEQ_printf(m, "active timers:\n");
+ print_active_timers(m, base, now);
+}
+
+static void print_cpu(struct seq_file *m, int cpu, u64 now)
+{
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+ int i;
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cpu: %d\n", cpu);
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ SEQ_printf(m, " clock %d:\n", i);
+ print_base(m, cpu_base->clock_base + i, now);
+ }
+#define P(x) \
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(cpu_base->x))
+#define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(cpu_base->x)))
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ P_ns(expires_next);
+ P(hres_active);
+ P(nr_events);
+#endif
+#undef P
+#undef P_ns
+
+#ifdef CONFIG_TICK_ONESHOT
+# define P(x) \
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(ts->x))
+# define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(ts->x)))
+ {
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+ P(nohz_mode);
+ P_ns(idle_tick);
+ P(tick_stopped);
+ P(idle_jiffies);
+ P(idle_calls);
+ P(idle_sleeps);
+ P_ns(idle_entrytime);
+ P_ns(idle_waketime);
+ P_ns(idle_exittime);
+ P_ns(idle_sleeptime);
+ P(last_jiffies);
+ P(next_jiffies);
+ P_ns(idle_expires);
+ SEQ_printf(m, "jiffies: %Lu\n",
+ (unsigned long long)jiffies);
+ }
+#endif
+
+#undef P
+#undef P_ns
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+static void
+print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
+{
+ struct clock_event_device *dev = td->evtdev;
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
+ if (cpu < 0)
+ SEQ_printf(m, "Broadcast device\n");
+ else
+ SEQ_printf(m, "Per CPU device: %d\n", cpu);
+
+ SEQ_printf(m, "Clock Event Device: ");
+ if (!dev) {
+ SEQ_printf(m, "<NULL>\n");
+ return;
+ }
+ SEQ_printf(m, "%s\n", dev->name);
+ SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns);
+ SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns);
+ SEQ_printf(m, " mult: %lu\n", dev->mult);
+ SEQ_printf(m, " shift: %d\n", dev->shift);
+ SEQ_printf(m, " mode: %d\n", dev->mode);
+ SEQ_printf(m, " next_event: %Ld nsecs\n",
+ (unsigned long long) ktime_to_ns(dev->next_event));
+
+ SEQ_printf(m, " set_next_event: ");
+ print_name_offset(m, dev->set_next_event);
+ SEQ_printf(m, "\n");
+
+ SEQ_printf(m, " set_mode: ");
+ print_name_offset(m, dev->set_mode);
+ SEQ_printf(m, "\n");
+
+ SEQ_printf(m, " event_handler: ");
+ print_name_offset(m, dev->event_handler);
+ SEQ_printf(m, "\n");
+}
+
+static void timer_list_show_tickdevices(struct seq_file *m)
+{
+ int cpu;
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+ print_tickdevice(m, tick_get_broadcast_device(), -1);
+ SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
+ tick_get_broadcast_mask()->bits[0]);
+#ifdef CONFIG_TICK_ONESHOT
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
+ tick_get_broadcast_oneshot_mask()->bits[0]);
+#endif
+ SEQ_printf(m, "\n");
+#endif
+ for_each_online_cpu(cpu)
+ print_tickdevice(m, tick_get_device(cpu), cpu);
+ SEQ_printf(m, "\n");
+}
+#else
+static void timer_list_show_tickdevices(struct seq_file *m) { }
+#endif
+
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ SEQ_printf(m, "Timer List Version: v0.4\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+
+ for_each_online_cpu(cpu)
+ print_cpu(m, cpu, now);
+
+ SEQ_printf(m, "\n");
+ timer_list_show_tickdevices(m);
+
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
+ timer_list_show(NULL, NULL);
+}
+
+static int timer_list_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timer_list_show, NULL);
+}
+
+static struct file_operations timer_list_fops = {
+ .open = timer_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_timer_list_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
+ if (!pe)
+ return -ENOMEM;
+ return 0;
+}
+__initcall(init_timer_list_procfs);
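The proc wiring at the end of timer_list.c (a show() routine shared between the seq_file path and the SysRq console path, exposed via single_open()) is a standard pattern. A minimal sketch of a hypothetical read-only /proc file using the same approach; the example_* names are illustrative only:

    #include <linux/init.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    static int example_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "hello from /proc/example\n");
            return 0;
    }

    static int example_open(struct inode *inode, struct file *filp)
    {
            return single_open(filp, example_show, NULL);
    }

    static const struct file_operations example_fops = {
            .open           = example_open,
            .read           = seq_read,
            .llseek         = seq_lseek,
            .release        = single_release,
    };

    static int __init example_init(void)
    {
            return proc_create("example", 0444, NULL, &example_fops) ? 0 : -ENOMEM;
    }
    __initcall(example_init);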
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 0000000..c994530
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,423 @@
+/*
+ * kernel/time/timer_stats.c
+ *
+ * Collect timer usage statistics.
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * timer_stats is based on timer_top, a similar functionality which was part of
+ * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
+ * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
+ * on dynamic allocation of the statistics entries and linear search based
+ * lookup combined with a global lock, rather than the static array, hash
+ * and per-CPU locking which is used by timer_stats. It was written for the
+ * pre hrtimer kernel code and therefore did not take hrtimers into account.
+ * Nevertheless it provided the base for the timer_stats implementation and
+ * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
+ * for this effort.
+ *
+ * timer_top.c is
+ * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
+ * Written by Daniel Petrini <d.pensator@gmail.com>
+ * timer_top.c was released under the GNU General Public License version 2
+ *
+ * We export the addresses and counting of timer functions being called,
+ * the pid and cmdline from the owner process if applicable.
+ *
+ * Start/stop data collection:
+ * # echo [1|0] >/proc/timer_stats
+ *
+ * Display the information collected so far:
+ * # cat /proc/timer_stats
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * This is our basic unit of interest: a timer expiry event identified
+ * by the timer, its start/expire functions and the PID of the task that
+ * started the timer. We count the number of times an event happens:
+ */
+struct entry {
+ /*
+ * Hash list:
+ */
+ struct entry *next;
+
+ /*
+ * Hash keys:
+ */
+ void *timer;
+ void *start_func;
+ void *expire_func;
+ pid_t pid;
+
+ /*
+ * Number of timeout events:
+ */
+ unsigned long count;
+ unsigned int timer_flag;
+
+ /*
+ * We save the command-line string to preserve
+ * this information past task exit:
+ */
+ char comm[TASK_COMM_LEN + 1];
+
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Spinlock protecting the tables - not taken during lookup:
+ */
+static DEFINE_SPINLOCK(table_lock);
+
+/*
+ * Per-CPU lookup locks for fast hash lookup:
+ */
+static DEFINE_PER_CPU(spinlock_t, lookup_lock);
+
+/*
+ * Mutex to serialize state changes with show-stats activities:
+ */
+static DEFINE_MUTEX(show_mutex);
+
+/*
+ * Collection status, active/inactive:
+ */
+static int __read_mostly active;
+
+/*
+ * Beginning/end timestamps of measurement:
+ */
+static ktime_t time_start, time_stop;
+
+/*
+ * tstat entry structs only get allocated while collection is
+ * active and never freed during that time - this simplifies
+ * things quite a bit.
+ *
+ * They get freed when a new collection period is started.
+ */
+#define MAX_ENTRIES_BITS 10
+#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
+
+static unsigned long nr_entries;
+static struct entry entries[MAX_ENTRIES];
+
+static atomic_t overflow_count;
+
+/*
+ * The entries are in a hash-table, for fast lookup:
+ */
+#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
+#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
+#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
+
+#define __tstat_hashfn(entry) \
+ (((unsigned long)(entry)->timer ^ \
+ (unsigned long)(entry)->start_func ^ \
+ (unsigned long)(entry)->expire_func ^ \
+ (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
+
+#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
+
+static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
+
+static void reset_entries(void)
+{
+ nr_entries = 0;
+ memset(entries, 0, sizeof(entries));
+ memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
+ atomic_set(&overflow_count, 0);
+}
+
+static struct entry *alloc_entry(void)
+{
+ if (nr_entries >= MAX_ENTRIES)
+ return NULL;
+
+ return entries + nr_entries++;
+}
+
+static int match_entries(struct entry *entry1, struct entry *entry2)
+{
+ return entry1->timer == entry2->timer &&
+ entry1->start_func == entry2->start_func &&
+ entry1->expire_func == entry2->expire_func &&
+ entry1->pid == entry2->pid;
+}
+
+/*
+ * Look up whether an entry matching this item is present
+ * in the hash already. Must be called with irqs off and the
+ * lookup lock held:
+ */
+static struct entry *tstat_lookup(struct entry *entry, char *comm)
+{
+ struct entry **head, *curr, *prev;
+
+ head = tstat_hashentry(entry);
+ curr = *head;
+
+ /*
+ * The fastpath is when the entry is already hashed,
+ * we do this with the lookup lock held, but with the
+ * table lock not held:
+ */
+ while (curr) {
+ if (match_entries(curr, entry))
+ return curr;
+
+ curr = curr->next;
+ }
+ /*
+ * Slowpath: allocate, set up and link a new hash entry:
+ */
+ prev = NULL;
+ curr = *head;
+
+ spin_lock(&table_lock);
+ /*
+ * Make sure we have not raced with another CPU:
+ */
+ while (curr) {
+ if (match_entries(curr, entry))
+ goto out_unlock;
+
+ prev = curr;
+ curr = curr->next;
+ }
+
+ curr = alloc_entry();
+ if (curr) {
+ *curr = *entry;
+ curr->count = 0;
+ curr->next = NULL;
+ memcpy(curr->comm, comm, TASK_COMM_LEN);
+
+ smp_mb(); /* Ensure that curr is initialized before insert */
+
+ if (prev)
+ prev->next = curr;
+ else
+ *head = curr;
+ }
+ out_unlock:
+ spin_unlock(&table_lock);
+
+ return curr;
+}
+
+/**
+ * timer_stats_update_stats - Update the statistics for a timer.
+ * @timer: pointer to either a timer_list or a hrtimer
+ * @pid: the pid of the task which set up the timer
+ * @startf: pointer to the function which did the timer setup
+ * @timerf: pointer to the timer callback function of the timer
+ * @comm: name of the process which set up the timer
+ *
+ * If the timer is already registered, the event counter is
+ * incremented. Otherwise the timer is registered in a free slot.
+ */
+void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+ void *timerf, char *comm,
+ unsigned int timer_flag)
+{
+ /*
+ * It doesn't matter which lock we take:
+ */
+ spinlock_t *lock;
+ struct entry *entry, input;
+ unsigned long flags;
+
+ if (likely(!active))
+ return;
+
+ lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+
+ input.timer = timer;
+ input.start_func = startf;
+ input.expire_func = timerf;
+ input.pid = pid;
+ input.timer_flag = timer_flag;
+
+ spin_lock_irqsave(lock, flags);
+ if (!active)
+ goto out_unlock;
+
+ entry = tstat_lookup(&input, comm);
+ if (likely(entry))
+ entry->count++;
+ else
+ atomic_inc(&overflow_count);
+
+ out_unlock:
+ spin_unlock_irqrestore(lock, flags);
+}
+
+static void print_name_offset(struct seq_file *m, unsigned long addr)
+{
+ char symname[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symname) < 0)
+ seq_printf(m, "<%p>", (void *)addr);
+ else
+ seq_printf(m, "%s", symname);
+}
+
+static int tstats_show(struct seq_file *m, void *v)
+{
+ struct timespec period;
+ struct entry *entry;
+ unsigned long ms;
+ long events = 0;
+ ktime_t time;
+ int i;
+
+ mutex_lock(&show_mutex);
+ /*
+ * If still active then calculate up to now:
+ */
+ if (active)
+ time_stop = ktime_get();
+
+ time = ktime_sub(time_stop, time_start);
+
+ period = ktime_to_timespec(time);
+ ms = period.tv_nsec / 1000000;
+
+ seq_puts(m, "Timer Stats Version: v0.2\n");
+ seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+ if (atomic_read(&overflow_count))
+ seq_printf(m, "Overflow: %d entries\n",
+ atomic_read(&overflow_count));
+
+ for (i = 0; i < nr_entries; i++) {
+ entry = entries + i;
+ if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ seq_printf(m, "%4luD, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+ } else {
+ seq_printf(m, " %4lu, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+ }
+
+ print_name_offset(m, (unsigned long)entry->start_func);
+ seq_puts(m, " (");
+ print_name_offset(m, (unsigned long)entry->expire_func);
+ seq_puts(m, ")\n");
+
+ events += entry->count;
+ }
+
+ ms += period.tv_sec * 1000;
+ if (!ms)
+ ms = 1;
+
+ if (events && period.tv_sec)
+ seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
+ events, events * 1000 / ms,
+ (events * 1000000 / ms) % 1000);
+ else
+ seq_printf(m, "%ld total events\n", events);
+
+ mutex_unlock(&show_mutex);
+
+ return 0;
+}
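+
+/*
+ * Illustrative sample of the output produced above (the counts, pids
+ * and the "do_stuff" symbols are made up; deferrable timers carry a
+ * trailing 'D' after the event count):
+ *
+ *	Timer Stats Version: v0.2
+ *	Sample period: 3.888 s
+ *	   12,     1 init             schedule_timeout (process_timeout)
+ *	    5D,  2301 hypothetical-cmd do_stuff_start (do_stuff_callback)
+ *	17 total events, 4.372 events/sec
+ */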
+
+/*
+ * After a state change, make sure all concurrent lookup/update
+ * activities have stopped:
+ */
+static void sync_access(void)
+{
+ unsigned long flags;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
+ /* nothing */
+ spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
+ }
+}
+
+static ssize_t tstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ char ctl[2];
+
+ if (count != 2 || *offs)
+ return -EINVAL;
+
+ if (copy_from_user(ctl, buf, count))
+ return -EFAULT;
+
+ mutex_lock(&show_mutex);
+ switch (ctl[0]) {
+ case '0':
+ if (active) {
+ active = 0;
+ time_stop = ktime_get();
+ sync_access();
+ }
+ break;
+ case '1':
+ if (!active) {
+ reset_entries();
+ time_start = ktime_get();
+ smp_mb();
+ active = 1;
+ }
+ break;
+ default:
+ count = -EINVAL;
+ }
+ mutex_unlock(&show_mutex);
+
+ return count;
+}
+
+static int tstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, tstats_show, NULL);
+}
+
+static struct file_operations tstats_fops = {
+ .open = tstats_open,
+ .read = seq_read,
+ .write = tstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void __init init_timer_stats(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ spin_lock_init(&per_cpu(lookup_lock, cpu));
+}
+
+static int __init init_tstats_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
+ if (!pe)
+ return -ENOMEM;
+ return 0;
+}
+__initcall(init_tstats_procfs);
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
new file mode 100644
index 0000000..eb51d76
--- /dev/null
+++ b/kernel/timeconst.pl
@@ -0,0 +1,378 @@
+#!/usr/bin/perl
+# -----------------------------------------------------------------------
+#
+# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
+#
+# This file is part of the Linux kernel, and is made available under
+# the terms of the GNU General Public License version 2 or (at your
+# option) any later version; incorporated herein by reference.
+#
+# -----------------------------------------------------------------------
+#
+
+#
+# Usage: timeconst.pl HZ > timeconst.h
+#
+
+# Precomputed values for systems without Math::BigInt
+# Generated by:
+# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
+%canned_values = (
+ 24 => [
+ '0xa6aaaaab','0x2aaaaaa',26,
+ 125,3,
+ '0xc49ba5e4','0x1fbe76c8b4',37,
+ 3,125,
+ '0xa2c2aaab','0xaaaa',16,
+ 125000,3,
+ '0xc9539b89','0x7fffbce4217d',47,
+ 3,125000,
+ ], 32 => [
+ '0xfa000000','0x6000000',27,
+ 125,4,
+ '0x83126e98','0xfdf3b645a',36,
+ 4,125,
+ '0xf4240000','0x0',17,
+ 31250,1,
+ '0x8637bd06','0x3fff79c842fa',46,
+ 1,31250,
+ ], 48 => [
+ '0xa6aaaaab','0x6aaaaaa',27,
+ 125,6,
+ '0xc49ba5e4','0xfdf3b645a',36,
+ 6,125,
+ '0xa2c2aaab','0x15555',17,
+ 62500,3,
+ '0xc9539b89','0x3fffbce4217d',46,
+ 3,62500,
+ ], 64 => [
+ '0xfa000000','0xe000000',28,
+ 125,8,
+ '0x83126e98','0x7ef9db22d',35,
+ 8,125,
+ '0xf4240000','0x0',18,
+ 15625,1,
+ '0x8637bd06','0x1fff79c842fa',45,
+ 1,15625,
+ ], 100 => [
+ '0xa0000000','0x0',28,
+ 10,1,
+ '0xcccccccd','0x733333333',35,
+ 1,10,
+ '0x9c400000','0x0',18,
+ 10000,1,
+ '0xd1b71759','0x1fff2e48e8a7',45,
+ 1,10000,
+ ], 122 => [
+ '0x8325c53f','0xfbcda3a',28,
+ 500,61,
+ '0xf9db22d1','0x7fbe76c8b',35,
+ 61,500,
+ '0x8012e2a0','0x3ef36',18,
+ 500000,61,
+ '0xffda4053','0x1ffffbce4217',45,
+ 61,500000,
+ ], 128 => [
+ '0xfa000000','0x1e000000',29,
+ 125,16,
+ '0x83126e98','0x3f7ced916',34,
+ 16,125,
+ '0xf4240000','0x40000',19,
+ 15625,2,
+ '0x8637bd06','0xfffbce4217d',44,
+ 2,15625,
+ ], 200 => [
+ '0xa0000000','0x0',29,
+ 5,1,
+ '0xcccccccd','0x333333333',34,
+ 1,5,
+ '0x9c400000','0x0',19,
+ 5000,1,
+ '0xd1b71759','0xfff2e48e8a7',44,
+ 1,5000,
+ ], 250 => [
+ '0x80000000','0x0',29,
+ 4,1,
+ '0x80000000','0x180000000',33,
+ 1,4,
+ '0xfa000000','0x0',20,
+ 4000,1,
+ '0x83126e98','0x7ff7ced9168',43,
+ 1,4000,
+ ], 256 => [
+ '0xfa000000','0x3e000000',30,
+ 125,32,
+ '0x83126e98','0x1fbe76c8b',33,
+ 32,125,
+ '0xf4240000','0xc0000',20,
+ 15625,4,
+ '0x8637bd06','0x7ffde7210be',43,
+ 4,15625,
+ ], 300 => [
+ '0xd5555556','0x2aaaaaaa',30,
+ 10,3,
+ '0x9999999a','0x1cccccccc',33,
+ 3,10,
+ '0xd0555556','0xaaaaa',20,
+ 10000,3,
+ '0x9d495183','0x7ffcb923a29',43,
+ 3,10000,
+ ], 512 => [
+ '0xfa000000','0x7e000000',31,
+ 125,64,
+ '0x83126e98','0xfdf3b645',32,
+ 64,125,
+ '0xf4240000','0x1c0000',21,
+ 15625,8,
+ '0x8637bd06','0x3ffef39085f',42,
+ 8,15625,
+ ], 1000 => [
+ '0x80000000','0x0',31,
+ 1,1,
+ '0x80000000','0x0',31,
+ 1,1,
+ '0xfa000000','0x0',22,
+ 1000,1,
+ '0x83126e98','0x1ff7ced9168',41,
+ 1,1000,
+ ], 1024 => [
+ '0xfa000000','0xfe000000',32,
+ 125,128,
+ '0x83126e98','0x7ef9db22',31,
+ 128,125,
+ '0xf4240000','0x3c0000',22,
+ 15625,16,
+ '0x8637bd06','0x1fff79c842f',41,
+ 16,15625,
+ ], 1200 => [
+ '0xd5555556','0xd5555555',32,
+ 5,6,
+ '0x9999999a','0x66666666',31,
+ 6,5,
+ '0xd0555556','0x2aaaaa',22,
+ 2500,3,
+ '0x9d495183','0x1ffcb923a29',41,
+ 3,2500,
+ ]
+);
+
+$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
+
+sub bint($)
+{
+ my($x) = @_;
+ return Math::BigInt->new($x);
+}
+
+#
+# Constants for division by reciprocal multiplication.
+# (bits, numerator, denominator)
+#
+sub fmul($$$)
+{
+ my ($b,$n,$d) = @_;
+
+ $n = bint($n);
+ $d = bint($d);
+
+ return scalar (($n << $b)+$d-bint(1))/$d;
+}
+
+sub fadj($$$)
+{
+ my($b,$n,$d) = @_;
+
+ $n = bint($n);
+ $d = bint($d);
+
+ $d = $d/bgcd($n, $d);
+ return scalar (($d-bint(1)) << $b)/$d;
+}
+
+sub fmuls($$$) {
+ my($b,$n,$d) = @_;
+ my($s,$m);
+ my($thres) = bint(1) << ($b-1);
+
+ $n = bint($n);
+ $d = bint($d);
+
+ for ($s = 0; 1; $s++) {
+ $m = fmul($s,$n,$d);
+ return $s if ($m >= $thres);
+ }
+ return 0;
+}
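+
+#
+# Worked example (illustrative): for HZ == 1000 the HZ_TO_USEC ratio is
+# 1000000/1000.  fmuls(32, 1000000, 1000) returns the smallest shift s
+# at which the scaled multiplier reaches the threshold 1 << 31, i.e.
+# s == 22, and then fmul(22, 1000000, 1000) == 1000 << 22 == 0xfa000000
+# while fadj() == 0.  The conversion therefore becomes roughly
+#
+#	usec = (jiffies * 0xfa000000 + 0x0) >> 22
+#
+# which matches the HZ == 1000 entry in %canned_values above.
+#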
+
+# Generate a hex value if the result fits in 64 bits;
+# otherwise skip.
+sub bignum_hex($) {
+ my($x) = @_;
+ my $s = $x->as_hex();
+
+ return (length($s) > 18) ? undef : $s;
+}
+
+# Provides mul, adj, and shr factors for a specific
+# (bit, time, hz) combination
+sub muladj($$$) {
+ my($b, $t, $hz) = @_;
+ my $s = fmuls($b, $t, $hz);
+ my $m = fmul($s, $t, $hz);
+ my $a = fadj($s, $t, $hz);
+ return (bignum_hex($m), bignum_hex($a), $s);
+}
+
+# Provides numerator, denominator values
+sub numden($$) {
+ my($n, $d) = @_;
+ my $g = bgcd($n, $d);
+ return ($n/$g, $d/$g);
+}
+
+# All values for a specific (time, hz) combo
+sub conversions($$) {
+ my ($t, $hz) = @_;
+ my @val = ();
+
+ # HZ_TO_xx
+ push(@val, muladj(32, $t, $hz));
+ push(@val, numden($t, $hz));
+
+ # xx_TO_HZ
+ push(@val, muladj(32, $hz, $t));
+ push(@val, numden($hz, $t));
+
+ return @val;
+}
+
+sub compute_values($) {
+ my($hz) = @_;
+ my @val = ();
+ my $s, $m, $a, $g;
+
+ if (!$has_bigint) {
+ die "$0: HZ == $hz not canned and ".
+ "Math::BigInt not available\n";
+ }
+
+ # MSEC conversions
+ push(@val, conversions(1000, $hz));
+
+ # USEC conversions
+ push(@val, conversions(1000000, $hz));
+
+ return @val;
+}
+
+sub outputval($$)
+{
+ my($name, $val) = @_;
+ my $csuf;
+
+ if (defined($val)) {
+ if ($name !~ /SHR/) {
+ $val = "U64_C($val)";
+ }
+ printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
+ }
+}
+
+sub output($@)
+{
+ my($hz, @val) = @_;
+ my $pfx, $bit, $suf, $s, $m, $a;
+
+ print "/* Automatically generated by kernel/timeconst.pl */\n";
+ print "/* Conversion constants for HZ == $hz */\n";
+ print "\n";
+ print "#ifndef KERNEL_TIMECONST_H\n";
+ print "#define KERNEL_TIMECONST_H\n";
+ print "\n";
+
+ print "#include <linux/param.h>\n";
+ print "#include <linux/types.h>\n";
+
+ print "\n";
+ print "#if HZ != $hz\n";
+ print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
+ print "#endif\n";
+ print "\n";
+
+ foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
+ 'HZ_TO_USEC','USEC_TO_HZ') {
+ foreach $bit (32) {
+ foreach $suf ('MUL', 'ADJ', 'SHR') {
+ outputval("${pfx}_$suf$bit", shift(@val));
+ }
+ }
+ foreach $suf ('NUM', 'DEN') {
+ outputval("${pfx}_$suf", shift(@val));
+ }
+ }
+
+ print "\n";
+ print "#endif /* KERNEL_TIMECONST_H */\n";
+}
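+
+# As an example (matching the HZ == 1000 entry in %canned_values),
+# "perl timeconst.pl 1000" emits, among the header boilerplate, lines
+# such as:
+#
+#	#define HZ_TO_MSEC_MUL32        U64_C(0x80000000)
+#	#define HZ_TO_MSEC_ADJ32        U64_C(0x0)
+#	#define HZ_TO_MSEC_SHR32        31
+#	#define HZ_TO_MSEC_NUM          1
+#	#define HZ_TO_MSEC_DEN          1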
+
+# Pretty-print Perl values
+sub perlvals(@) {
+ my $v;
+ my @l = ();
+
+ foreach $v (@_) {
+ if (!defined($v)) {
+ push(@l, 'undef');
+ } elsif ($v =~ /^0x/) {
+ push(@l, "\'".$v."\'");
+ } else {
+ push(@l, $v.'');
+ }
+ }
+ return join(',', @l);
+}
+
+($hz) = @ARGV;
+
+# Use this to generate the %canned_values structure
+if ($hz eq '--can') {
+ shift(@ARGV);
+ @hzlist = sort {$a <=> $b} (@ARGV);
+
+ print "# Precomputed values for systems without Math::BigInt\n";
+ print "# Generated by:\n";
+ print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
+ print "\%canned_values = (\n";
+ my $pf = "\t";
+ foreach $hz (@hzlist) {
+ my @values = compute_values($hz);
+ print "$pf$hz => [\n";
+ while (scalar(@values)) {
+ my $bit;
+ foreach $bit (32) {
+ my $m = shift(@values);
+ my $a = shift(@values);
+ my $s = shift(@values);
+ print "\t\t", perlvals($m,$a,$s), ",\n";
+ }
+ my $n = shift(@values);
+ my $d = shift(@values);
+ print "\t\t", perlvals($n,$d), ",\n";
+ }
+ print "\t]";
+ $pf = ', ';
+ }
+ print "\n);\n";
+} else {
+ $hz += 0; # Force to number
+ if ($hz < 1) {
+ die "Usage: $0 HZ\n";
+ }
+
+ @val = @{$canned_values{$hz}};
+ if (!defined(@val)) {
+ @val = compute_values($hz);
+ }
+ output($hz, @val);
+}
+exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
new file mode 100644
index 0000000..a5eaea2
--- /dev/null
+++ b/kernel/timer.c
@@ -0,0 +1,1598 @@
+/*
+ * linux/kernel/timer.c
+ *
+ * Kernel internal timers, basic process system calls
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ *
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
+ * 1999-03-10 Improved NTP compatibility by Ulrich Windl
+ * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
+ * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
+ * Copyright (C) 2000, 2001, 2002 Ingo Molnar
+ * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pid_namespace.h>
+#include <linux/notifier.h>
+#include <linux/thread_info.h>
+#include <linux/time.h>
+#include <linux/jiffies.h>
+#include <linux/posix-timers.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+#include <linux/delay.h>
+#include <linux/tick.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/div64.h>
+#include <asm/timex.h>
+#include <asm/io.h>
+
+u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
+/*
+ * per-CPU timer vector definitions:
+ */
+#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
+#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
+struct tvec {
+ struct list_head vec[TVN_SIZE];
+};
+
+struct tvec_root {
+ struct list_head vec[TVR_SIZE];
+};
+
+struct tvec_base {
+ spinlock_t lock;
+ struct timer_list *running_timer;
+ unsigned long timer_jiffies;
+ struct tvec_root tv1;
+ struct tvec tv2;
+ struct tvec tv3;
+ struct tvec tv4;
+ struct tvec tv5;
+} ____cacheline_aligned;
+
+struct tvec_base boot_tvec_bases;
+EXPORT_SYMBOL(boot_tvec_bases);
+static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+
+/*
+ * Note that all tvec_bases are 2-byte aligned and the lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB as
+ * the new flag to indicate whether the timer is deferrable.
+ */
+#define TBASE_DEFERRABLE_FLAG (0x1)
+
+/* Functions below help us manage 'deferrable' flag */
+static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
+{
+ return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+}
+
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+{
+ return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+}
+
+static inline void timer_set_deferrable(struct timer_list *timer)
+{
+ timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
+ TBASE_DEFERRABLE_FLAG));
+}
+
+static inline void
+timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
+{
+ timer->base = (struct tvec_base *)((unsigned long)(new_base) |
+ tbase_get_deferrable(timer->base));
+}
+
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+ bool force_up)
+{
+ int rem;
+ unsigned long original = j;
+
+ /*
+ * We don't want all cpus firing their timers at once hitting the
+ * same lock or cachelines, so we skew each extra cpu with an extra
+ * 3 jiffies. This 3 jiffies came originally from the mm/ code which
+ * already did this.
+ * The skew is done by adding 3*cpunr, then rounding, then subtracting
+ * this extra offset again.
+ */
+ j += cpu * 3;
+
+ rem = j % HZ;
+
+ /*
+ * If the target jiffy is just after a whole second (which can happen
+ * due to delays of the timer irq, long irq-off times, etc.) then
+ * we should round down to the whole second, not up. Use 1/4th second
+ * as the cutoff for this rounding, as an extreme upper bound.
+ * But never round down if @force_up is set.
+ */
+ if (rem < HZ/4 && !force_up) /* round down */
+ j = j - rem;
+ else /* round up */
+ j = j - rem + HZ;
+
+ /* now that we have rounded, subtract the extra skew again */
+ j -= cpu * 3;
+
+ if (j <= jiffies) /* rounding ate our timeout entirely; */
+ return original;
+ return j;
+}
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, false);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies);
+
+/**
+ * __round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, false) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_relative);
+
+/**
+ * round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long round_jiffies(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), false);
+}
+EXPORT_SYMBOL_GPL(round_jiffies);
+
+/**
+ * round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * round_jiffies_relative() rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long round_jiffies_relative(unsigned long j)
+{
+ return __round_jiffies_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_relative);
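+
+/*
+ * Usage sketch (illustrative; "my_timer" is a hypothetical, already
+ * initialized timer): a timer that fires roughly once per second and
+ * does not care about exact timing can batch its wakeup with other
+ * such timers:
+ *
+ *	mod_timer(&my_timer, round_jiffies(jiffies + HZ));
+ *
+ * or, using a relative delta:
+ *
+ *	mod_timer(&my_timer, jiffies + round_jiffies_relative(HZ));
+ */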
+
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+ return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
+
+static inline void set_running_timer(struct tvec_base *base,
+ struct timer_list *timer)
+{
+#ifdef CONFIG_SMP
+ base->running_timer = timer;
+#endif
+}
+
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+{
+ unsigned long expires = timer->expires;
+ unsigned long idx = expires - base->timer_jiffies;
+ struct list_head *vec;
+
+ if (idx < TVR_SIZE) {
+ int i = expires & TVR_MASK;
+ vec = base->tv1.vec + i;
+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+ int i = (expires >> TVR_BITS) & TVN_MASK;
+ vec = base->tv2.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+ vec = base->tv3.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+ vec = base->tv4.vec + i;
+ } else if ((signed long) idx < 0) {
+ /*
+ * Can happen if you add a timer with expires == jiffies,
+ * or you set a timer to go off in the past
+ */
+ vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+ } else {
+ int i;
+ /* If the timeout is larger than 0xffffffff on 64-bit
+ * architectures then we use the maximum timeout:
+ */
+ if (idx > 0xffffffffUL) {
+ idx = 0xffffffffUL;
+ expires = idx + base->timer_jiffies;
+ }
+ i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+ vec = base->tv5.vec + i;
+ }
+ /*
+ * Timers are FIFO:
+ */
+ list_add_tail(&timer->entry, vec);
+}
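+
+/*
+ * Worked example (assuming CONFIG_BASE_SMALL=0, i.e. TVR_BITS=8 and
+ * TVN_BITS=6): a timer due in 100 jiffies has idx < TVR_SIZE (256) and
+ * goes into tv1 at slot (expires & 255); one due in 10000 jiffies has
+ * idx < 1 << 14 and goes into tv2 at slot ((expires >> 8) & 63), from
+ * where it is cascaded back towards tv1 as its expiry approaches.
+ */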
+
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
+{
+ if (timer->start_site)
+ return;
+
+ timer->start_site = addr;
+ memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+ timer->start_pid = current->pid;
+}
+
+static void timer_stats_account_timer(struct timer_list *timer)
+{
+ unsigned int flag = 0;
+
+ if (unlikely(tbase_get_deferrable(timer->base)))
+ flag |= TIMER_STATS_FLAG_DEFERRABLE;
+
+ timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+ timer->function, timer->start_comm, flag);
+}
+
+#else
+static void timer_stats_account_timer(struct timer_list *timer) {}
+#endif
+
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+
+static struct debug_obj_descr timer_debug_descr;
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int timer_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ del_timer_sync(timer);
+ debug_object_init(timer, &timer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ /*
+ * This is not really a fixup. The timer was
+ * statically initialized. We just make sure that it
+ * is tracked in the object tracker.
+ */
+ if (timer->entry.next == NULL &&
+ timer->entry.prev == TIMER_ENTRY_STATIC) {
+ debug_object_init(timer, &timer_debug_descr);
+ debug_object_activate(timer, &timer_debug_descr);
+ return 0;
+ } else {
+ WARN_ON_ONCE(1);
+ }
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ WARN_ON(1);
+
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int timer_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ del_timer_sync(timer);
+ debug_object_free(timer, &timer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static struct debug_obj_descr timer_debug_descr = {
+ .name = "timer_list",
+ .fixup_init = timer_fixup_init,
+ .fixup_activate = timer_fixup_activate,
+ .fixup_free = timer_fixup_free,
+};
+
+static inline void debug_timer_init(struct timer_list *timer)
+{
+ debug_object_init(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_activate(struct timer_list *timer)
+{
+ debug_object_activate(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_deactivate(struct timer_list *timer)
+{
+ debug_object_deactivate(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_free(struct timer_list *timer)
+{
+ debug_object_free(timer, &timer_debug_descr);
+}
+
+static void __init_timer(struct timer_list *timer);
+
+void init_timer_on_stack(struct timer_list *timer)
+{
+ debug_object_init_on_stack(timer, &timer_debug_descr);
+ __init_timer(timer);
+}
+EXPORT_SYMBOL_GPL(init_timer_on_stack);
+
+void destroy_timer_on_stack(struct timer_list *timer)
+{
+ debug_object_free(timer, &timer_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+
+#else
+static inline void debug_timer_init(struct timer_list *timer) { }
+static inline void debug_timer_activate(struct timer_list *timer) { }
+static inline void debug_timer_deactivate(struct timer_list *timer) { }
+#endif
+
+static void __init_timer(struct timer_list *timer)
+{
+ timer->entry.next = NULL;
+ timer->base = __raw_get_cpu_var(tvec_bases);
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+ timer->start_pid = -1;
+ memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
+}
+
+/**
+ * init_timer - initialize a timer.
+ * @timer: the timer to be initialized
+ *
+ * init_timer() must be called on a timer prior to calling *any* of the
+ * other timer functions.
+ */
+void init_timer(struct timer_list *timer)
+{
+ debug_timer_init(timer);
+ __init_timer(timer);
+}
+EXPORT_SYMBOL(init_timer);
+
+void init_timer_deferrable(struct timer_list *timer)
+{
+ init_timer(timer);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL(init_timer_deferrable);
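+
+/*
+ * Minimal usage sketch (the "my_dev"/"my_timeout" names are
+ * hypothetical, not taken from this file):
+ *
+ *	static void my_timeout(unsigned long data)
+ *	{
+ *		struct my_dev *dev = (struct my_dev *)data;
+ *		...
+ *	}
+ *
+ *	init_timer(&dev->timer);
+ *	dev->timer.function = my_timeout;
+ *	dev->timer.data = (unsigned long)dev;
+ *	dev->timer.expires = jiffies + HZ;
+ *	add_timer(&dev->timer);
+ *
+ * The setup_timer() helper in <linux/timer.h> combines the init_timer()
+ * call with the function/data assignments.
+ */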
+
+static inline void detach_timer(struct timer_list *timer,
+ int clear_pending)
+{
+ struct list_head *entry = &timer->entry;
+
+ debug_timer_deactivate(timer);
+
+ __list_del(entry->prev, entry->next);
+ if (clear_pending)
+ entry->next = NULL;
+ entry->prev = LIST_POISON2;
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(tvec_bases).lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on ->tvX lists.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static struct tvec_base *lock_timer_base(struct timer_list *timer,
+ unsigned long *flags)
+ __acquires(timer->base->lock)
+{
+ struct tvec_base *base;
+
+ for (;;) {
+ struct tvec_base *prelock_base = timer->base;
+ base = tbase_get_base(prelock_base);
+ if (likely(base != NULL)) {
+ spin_lock_irqsave(&base->lock, *flags);
+ if (likely(prelock_base == timer->base))
+ return base;
+ /* The timer has migrated to another CPU */
+ spin_unlock_irqrestore(&base->lock, *flags);
+ }
+ cpu_relax();
+ }
+}
+
+int __mod_timer(struct timer_list *timer, unsigned long expires)
+{
+ struct tvec_base *base, *new_base;
+ unsigned long flags;
+ int ret = 0;
+
+ timer_stats_timer_set_start_info(timer);
+ BUG_ON(!timer->function);
+
+ base = lock_timer_base(timer, &flags);
+
+ if (timer_pending(timer)) {
+ detach_timer(timer, 0);
+ ret = 1;
+ }
+
+ debug_timer_activate(timer);
+
+ new_base = __get_cpu_var(tvec_bases);
+
+ if (base != new_base) {
+ /*
+ * We are trying to schedule the timer on the local CPU.
+ * However we can't change timer's base while it is running,
+ * otherwise del_timer_sync() can't detect that the timer's
+ * handler yet has not finished. This also guarantees that
+ * handler has not yet finished. This also guarantees that
+ */
+ if (likely(base->running_timer != timer)) {
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ timer_set_base(timer, base);
+ }
+ }
+
+ timer->expires = expires;
+ internal_add_timer(base, timer);
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(__mod_timer);
+
+/**
+ * add_timer_on - start a timer on a particular CPU
+ * @timer: the timer to be added
+ * @cpu: the CPU to start it on
+ *
+ * This is not very scalable on SMP. Double adds are not possible.
+ */
+void add_timer_on(struct timer_list *timer, int cpu)
+{
+ struct tvec_base *base = per_cpu(tvec_bases, cpu);
+ unsigned long flags;
+
+ timer_stats_timer_set_start_info(timer);
+ BUG_ON(timer_pending(timer) || !timer->function);
+ spin_lock_irqsave(&base->lock, flags);
+ timer_set_base(timer, base);
+ debug_timer_activate(timer);
+ internal_add_timer(base, timer);
+ /*
+ * Check whether the other CPU is idle and needs to be
+ * triggered to reevaluate the timer wheel when nohz is
+ * active. We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to idle can not evaluate
+ * the timer wheel.
+ */
+ wake_up_idle_cpu(cpu);
+ spin_unlock_irqrestore(&base->lock, flags);
+}
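+
+/*
+ * Usage sketch (illustrative; the names are hypothetical): arming a
+ * timer on a specific CPU from per-CPU setup code:
+ *
+ *	setup_timer(&percpu_timer, percpu_poll, 0);
+ *	percpu_timer.expires = jiffies + HZ;
+ *	add_timer_on(&percpu_timer, cpu);
+ */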
+
+/**
+ * mod_timer - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer() is a more efficient way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ *
+ * mod_timer(timer, expires) is equivalent to:
+ *
+ * del_timer(timer); timer->expires = expires; add_timer(timer);
+ *
+ * Note that if there are multiple unserialized concurrent users of the
+ * same timer, then mod_timer() is the only safe way to modify the timeout,
+ * since add_timer() cannot modify an already running timer.
+ *
+ * The function returns whether it has modified a pending timer or not.
+ * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
+ * active timer returns 1.)
+ */
+int mod_timer(struct timer_list *timer, unsigned long expires)
+{
+ BUG_ON(!timer->function);
+
+ timer_stats_timer_set_start_info(timer);
+ /*
+ * This is a common optimization triggered by the
+ * networking code - if the timer is re-modified
+ * to be the same thing then just return:
+ */
+ if (timer->expires == expires && timer_pending(timer))
+ return 1;
+
+ return __mod_timer(timer, expires);
+}
+
+EXPORT_SYMBOL(mod_timer);
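+
+/*
+ * Usage sketch (illustrative; "poll_timer" and the 100ms period are
+ * hypothetical): a self-rearming polling callback typically calls
+ * mod_timer() on its own timer:
+ *
+ *	static void poll_hw(unsigned long data)
+ *	{
+ *		... inspect the hardware ...
+ *		mod_timer(&poll_timer, jiffies + msecs_to_jiffies(100));
+ *	}
+ */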
+
+/**
+ * del_timer - deactivate a timer.
+ * @timer: the timer to be deactivated
+ *
+ * del_timer() deactivates a timer - this works on both active and inactive
+ * timers.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ */
+int del_timer(struct timer_list *timer)
+{
+ struct tvec_base *base;
+ unsigned long flags;
+ int ret = 0;
+
+ timer_stats_timer_clear_start_info(timer);
+ if (timer_pending(timer)) {
+ base = lock_timer_base(timer, &flags);
+ if (timer_pending(timer)) {
+ detach_timer(timer, 1);
+ ret = 1;
+ }
+ spin_unlock_irqrestore(&base->lock, flags);
+ }
+
+ return ret;
+}
+
+EXPORT_SYMBOL(del_timer);
+
+#ifdef CONFIG_SMP
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer to deactivate
+ *
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ *
+ * It must not be called from interrupt contexts.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+ struct tvec_base *base;
+ unsigned long flags;
+ int ret = -1;
+
+ base = lock_timer_base(timer, &flags);
+
+ if (base->running_timer == timer)
+ goto out;
+
+ ret = 0;
+ if (timer_pending(timer)) {
+ detach_timer(timer, 1);
+ ret = 1;
+ }
+out:
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(try_to_del_timer_sync);
+
+/**
+ * del_timer_sync - deactivate a timer and wait for the handler to finish.
+ * @timer: the timer to be deactivated
+ *
+ * This function only differs from del_timer() on SMP: besides deactivating
+ * the timer it also makes sure the handler has finished executing on other
+ * CPUs.
+ *
+ * Synchronization rules: Callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts. The caller must not hold locks which would prevent
+ * completion of the timer's handler. The timer's handler must not call
+ * add_timer_on(). Upon exit the timer is not queued and the handler is
+ * not running on any CPU.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ */
+int del_timer_sync(struct timer_list *timer)
+{
+ for (;;) {
+ int ret = try_to_del_timer_sync(timer);
+ if (ret >= 0)
+ return ret;
+ cpu_relax();
+ }
+}
+
+EXPORT_SYMBOL(del_timer_sync);
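+
+/*
+ * Typical teardown sketch (illustrative, names hypothetical): stop
+ * whatever re-arms the timer first, then wait for a running handler:
+ *
+ *	dev->shutting_down = 1;
+ *	del_timer_sync(&dev->poll_timer);
+ *
+ * The flag keeps the (hypothetical) handler from re-arming the timer;
+ * after del_timer_sync() returns it is neither queued nor running.
+ */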
+#endif
+
+static int cascade(struct tvec_base *base, struct tvec *tv, int index)
+{
+ /* cascade all the timers from tv up one level */
+ struct timer_list *timer, *tmp;
+ struct list_head tv_list;
+
+ list_replace_init(tv->vec + index, &tv_list);
+
+ /*
+ * We are removing _all_ timers from the list, so we
+ * don't have to detach them individually.
+ */
+ list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
+ BUG_ON(tbase_get_base(timer->base) != base);
+ internal_add_timer(base, timer);
+ }
+
+ return index;
+}
+
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ *
+ * This function cascades all vectors and executes all expired timer
+ * vectors.
+ */
+static inline void __run_timers(struct tvec_base *base)
+{
+ struct timer_list *timer;
+
+ spin_lock_irq(&base->lock);
+ while (time_after_eq(jiffies, base->timer_jiffies)) {
+ struct list_head work_list;
+ struct list_head *head = &work_list;
+ int index = base->timer_jiffies & TVR_MASK;
+
+ /*
+ * Cascade timers:
+ */
+ if (!index &&
+ (!cascade(base, &base->tv2, INDEX(0))) &&
+ (!cascade(base, &base->tv3, INDEX(1))) &&
+ !cascade(base, &base->tv4, INDEX(2)))
+ cascade(base, &base->tv5, INDEX(3));
+ ++base->timer_jiffies;
+ list_replace_init(base->tv1.vec + index, &work_list);
+ while (!list_empty(head)) {
+ void (*fn)(unsigned long);
+ unsigned long data;
+
+ timer = list_first_entry(head, struct timer_list,entry);
+ fn = timer->function;
+ data = timer->data;
+
+ timer_stats_account_timer(timer);
+
+ set_running_timer(base, timer);
+ detach_timer(timer, 1);
+ spin_unlock_irq(&base->lock);
+ {
+ int preempt_count = preempt_count();
+ fn(data);
+ if (preempt_count != preempt_count()) {
+ printk(KERN_ERR "huh, entered %p "
+ "with preempt_count %08x, exited"
+ " with %08x?\n",
+ fn, preempt_count,
+ preempt_count());
+ BUG();
+ }
+ }
+ spin_lock_irq(&base->lock);
+ }
+ }
+ set_running_timer(base, NULL);
+ spin_unlock_irq(&base->lock);
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a CPU is idle.
+ * This function needs to be called with interrupts disabled.
+ */
+static unsigned long __next_timer_interrupt(struct tvec_base *base)
+{
+ unsigned long timer_jiffies = base->timer_jiffies;
+ unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
+ int index, slot, array, found = 0;
+ struct timer_list *nte;
+ struct tvec *varray[4];
+
+ /* Look for timer events in tv1. */
+ index = slot = timer_jiffies & TVR_MASK;
+ do {
+ list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
+ found = 1;
+ expires = nte->expires;
+ /* Look at the cascade bucket(s)? */
+ if (!index || slot < index)
+ goto cascade;
+ return expires;
+ }
+ slot = (slot + 1) & TVR_MASK;
+ } while (slot != index);
+
+cascade:
+ /* Calculate the next cascade event */
+ if (index)
+ timer_jiffies += TVR_SIZE - index;
+ timer_jiffies >>= TVR_BITS;
+
+ /* Check tv2-tv5. */
+ varray[0] = &base->tv2;
+ varray[1] = &base->tv3;
+ varray[2] = &base->tv4;
+ varray[3] = &base->tv5;
+
+ for (array = 0; array < 4; array++) {
+ struct tvec *varp = varray[array];
+
+ index = slot = timer_jiffies & TVN_MASK;
+ do {
+ list_for_each_entry(nte, varp->vec + slot, entry) {
+ found = 1;
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ }
+ /*
+ * Are we still searching for the first timer, or are
+ * we looking at the cascade buckets?
+ */
+ if (found) {
+ /* Look at the cascade bucket(s)? */
+ if (!index || slot < index)
+ break;
+ return expires;
+ }
+ slot = (slot + 1) & TVN_MASK;
+ } while (slot != index);
+
+ if (index)
+ timer_jiffies += TVN_SIZE - index;
+ timer_jiffies >>= TVN_BITS;
+ }
+ return expires;
+}
+
+/*
+ * Check whether the next hrtimer event is before the next timer wheel
+ * event:
+ */
+static unsigned long cmp_next_hrtimer_event(unsigned long now,
+ unsigned long expires)
+{
+ ktime_t hr_delta = hrtimer_get_next_event();
+ struct timespec tsdelta;
+ unsigned long delta;
+
+ if (hr_delta.tv64 == KTIME_MAX)
+ return expires;
+
+ /*
+ * Expired timer available, let it expire in the next tick
+ */
+ if (hr_delta.tv64 <= 0)
+ return now + 1;
+
+ tsdelta = ktime_to_timespec(hr_delta);
+ delta = timespec_to_jiffies(&tsdelta);
+
+ /*
+ * Limit the delta to the max value, which is checked in
+ * tick_nohz_stop_sched_tick():
+ */
+ if (delta > NEXT_TIMER_MAX_DELTA)
+ delta = NEXT_TIMER_MAX_DELTA;
+
+ /*
+ * Take rounding errors into account and make sure that it
+ * expires in the next tick. Otherwise we go into an endless
+ * ping-pong due to tick_nohz_stop_sched_tick() retriggering
+ * the timer softirq.
+ */
+ if (delta < 1)
+ delta = 1;
+ now += delta;
+ if (time_before(now, expires))
+ return now;
+ return expires;
+}
+
+/**
+ * get_next_timer_interrupt - return the jiffy of the next pending timer
+ * @now: current time (in jiffies)
+ */
+unsigned long get_next_timer_interrupt(unsigned long now)
+{
+ struct tvec_base *base = __get_cpu_var(tvec_bases);
+ unsigned long expires;
+
+ spin_lock(&base->lock);
+ expires = __next_timer_interrupt(base);
+ spin_unlock(&base->lock);
+
+ if (time_before_eq(expires, now))
+ return now;
+
+ return cmp_next_hrtimer_event(now, expires);
+}
+#endif
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+ cputime_t one_jiffy = jiffies_to_cputime(1);
+
+ if (user_tick) {
+ account_user_time(p, one_jiffy);
+ account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
+ } else {
+ account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
+ account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
+ }
+}
+#endif
+
+/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+ int cpu = smp_processor_id();
+
+ /* Note: this timer irq context must be accounted for as well. */
+ account_process_tick(p, user_tick);
+ run_local_timers();
+ if (rcu_pending(cpu))
+ rcu_check_callbacks(cpu, user_tick);
+ printk_tick();
+ scheduler_tick();
+ run_posix_cpu_timers(p);
+}
+
+/*
+ * Nr of active tasks - counted in fixed-point numbers
+ */
+static unsigned long count_active_tasks(void)
+{
+ return nr_active() * FIXED_1;
+}
+
+/*
+ * Hmm.. Changed this, as the GNU make sources (load.c) seem to
+ * imply that avenrun[] is the standard name for this kind of thing.
+ * Nothing else seems to be standardized: the fractional size etc
+ * all seem to differ on different machines.
+ *
+ * Requires xtime_lock to access.
+ */
+unsigned long avenrun[3];
+
+EXPORT_SYMBOL(avenrun);
+
+/*
+ * calc_load - given tick count, update the avenrun load estimates.
+ * This is called while holding a write_lock on xtime_lock.
+ */
+static inline void calc_load(unsigned long ticks)
+{
+ unsigned long active_tasks; /* fixed-point */
+ static int count = LOAD_FREQ;
+
+ count -= ticks;
+ if (unlikely(count < 0)) {
+ active_tasks = count_active_tasks();
+ do {
+ CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ count += LOAD_FREQ;
+ } while (count < 0);
+ }
+}
+
+/*
+ * This function runs timers and the timer-tq in bottom half context.
+ */
+static void run_timer_softirq(struct softirq_action *h)
+{
+ struct tvec_base *base = __get_cpu_var(tvec_bases);
+
+ hrtimer_run_pending();
+
+ if (time_after_eq(jiffies, base->timer_jiffies))
+ __run_timers(base);
+}
+
+/*
+ * Called by the local, per-CPU timer interrupt on SMP.
+ */
+void run_local_timers(void)
+{
+ hrtimer_run_queues();
+ raise_softirq(TIMER_SOFTIRQ);
+ softlockup_tick();
+}
+
+/*
+ * Called by the timer interrupt. xtime_lock must already be taken
+ * by the timer IRQ!
+ */
+static inline void update_times(unsigned long ticks)
+{
+ update_wall_time();
+ calc_load(ticks);
+}
+
+/*
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+
+void do_timer(unsigned long ticks)
+{
+ jiffies_64 += ticks;
+ update_times(ticks);
+}
+
+#ifdef __ARCH_WANT_SYS_ALARM
+
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+SYSCALL_DEFINE1(alarm, unsigned int, seconds)
+{
+ return alarm_setitimer(seconds);
+}
+
+#endif
+
+#ifndef __alpha__
+
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
+ * should be moved into arch/i386 instead?
+ */
+
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid. The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+SYSCALL_DEFINE0(getpid)
+{
+ return task_tgid_vnr(current);
+}
+
+/*
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
+ */
+SYSCALL_DEFINE0(getppid)
+{
+ int pid;
+
+ rcu_read_lock();
+ pid = task_tgid_vnr(current->real_parent);
+ rcu_read_unlock();
+
+ return pid;
+}
+
+SYSCALL_DEFINE0(getuid)
+{
+ /* Only we change this so SMP safe */
+ return current->uid;
+}
+
+SYSCALL_DEFINE0(geteuid)
+{
+ /* Only we change this so SMP safe */
+ return current->euid;
+}
+
+SYSCALL_DEFINE0(getgid)
+{
+ /* Only we change this so SMP safe */
+ return current->gid;
+}
+
+SYSCALL_DEFINE0(getegid)
+{
+ /* Only we change this so SMP safe */
+ return current->egid;
+}
+
+#endif
+
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((struct task_struct *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+signed long __sched schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful for the caller's
+ * convenience. Nothing more. We could take
+ * MAX_SCHEDULE_TIMEOUT from one of the negative values,
+ * but I'd like to return a valid offset (>=0) to allow
+ * the caller to do everything it wants with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of paranoia. Note that the retval will be
+ * 0 since no piece of the kernel is supposed to check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happen anyway). You just have the printk()
+ * that will tell you if something has gone wrong, and where.
+ */
+ if (timeout < 0) {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx\n", timeout);
+ dump_stack();
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
+ __mod_timer(&timer, expire);
+ schedule();
+ del_singleshot_timer_sync(&timer);
+
+ /* Remove the timer from the object tracker */
+ destroy_timer_on_stack(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+EXPORT_SYMBOL(schedule_timeout);
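+
+/*
+ * Usage sketch (illustrative): sleep for up to two seconds, but wake
+ * up early if a signal arrives:
+ *
+ *	set_current_state(TASK_INTERRUPTIBLE);
+ *	remaining = schedule_timeout(2 * HZ);
+ *	if (remaining)
+ *		pr_debug("woken early, %ld jiffies left\n", remaining);
+ */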
+
+/*
+ * We can use __set_current_state() here because schedule_timeout() calls
+ * schedule() unconditionally.
+ */
+signed long __sched schedule_timeout_interruptible(signed long timeout)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_interruptible);
+
+signed long __sched schedule_timeout_killable(signed long timeout)
+{
+ __set_current_state(TASK_KILLABLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_killable);
+
+signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+
+/* Thread ID - the internal kernel "pid" */
+SYSCALL_DEFINE0(gettid)
+{
+ return task_pid_vnr(current);
+}
+
+/**
+ * do_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
+ */
+int do_sysinfo(struct sysinfo *info)
+{
+ unsigned long mem_total, sav_total;
+ unsigned int mem_unit, bitcount;
+ unsigned long seq;
+
+ memset(info, 0, sizeof(struct sysinfo));
+
+ do {
+ struct timespec tp;
+ seq = read_seqbegin(&xtime_lock);
+
+ /*
+ * This is annoying. The code below does the same thing that
+ * posix_get_clock_monotonic() does, but that function wants to
+ * take the lock itself, while here we want the lock to also
+ * cover the load-average reads.
+ */
+
+ getnstimeofday(&tp);
+ tp.tv_sec += wall_to_monotonic.tv_sec;
+ tp.tv_nsec += wall_to_monotonic.tv_nsec;
+ monotonic_to_bootbased(&tp);
+ if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
+ tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
+ tp.tv_sec++;
+ }
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+
+ info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+ info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+ info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+
+ info->procs = nr_threads;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ si_meminfo(info);
+ si_swapinfo(info);
+
+ /*
+ * If the sum of all the available memory (i.e. ram + swap)
+ * is less than can be stored in a 32 bit unsigned long then
+ * we can be binary compatible with 2.2.x kernels. If not,
+ * well, in that case 2.2.x was broken anyways...
+ *
+ * -Erik Andersen <andersee@debian.org>
+ */
+
+ mem_total = info->totalram + info->totalswap;
+ if (mem_total < info->totalram || mem_total < info->totalswap)
+ goto out;
+ bitcount = 0;
+ mem_unit = info->mem_unit;
+ while (mem_unit > 1) {
+ bitcount++;
+ mem_unit >>= 1;
+ sav_total = mem_total;
+ mem_total <<= 1;
+ if (mem_total < sav_total)
+ goto out;
+ }
+
+ /*
+ * If mem_total did not overflow, multiply all memory values by
+ * info->mem_unit and set it to 1. This leaves things compatible
+ * with 2.2.x, and also retains compatibility with earlier 2.4.x
+ * kernels...
+ */
+
+ info->mem_unit = 1;
+ info->totalram <<= bitcount;
+ info->freeram <<= bitcount;
+ info->sharedram <<= bitcount;
+ info->bufferram <<= bitcount;
+ info->totalswap <<= bitcount;
+ info->freeswap <<= bitcount;
+ info->totalhigh <<= bitcount;
+ info->freehigh <<= bitcount;
+
+out:
+ return 0;
+}
+
+SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
+{
+ struct sysinfo val;
+
+ do_sysinfo(&val);
+
+ if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __cpuinit init_timers_cpu(int cpu)
+{
+ int j;
+ struct tvec_base *base;
+ static char __cpuinitdata tvec_base_done[NR_CPUS];
+
+ if (!tvec_base_done[cpu]) {
+ static char boot_done;
+
+ if (boot_done) {
+ /*
+ * The APs use this path later in boot
+ */
+ base = kmalloc_node(sizeof(*base),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
+ if (!base)
+ return -ENOMEM;
+
+ /* Make sure that tvec_base is 2 byte aligned */
+ if (tbase_get_deferrable(base)) {
+ WARN_ON(1);
+ kfree(base);
+ return -ENOMEM;
+ }
+ per_cpu(tvec_bases, cpu) = base;
+ } else {
+ /*
+ * This is for the boot CPU - we use compile-time
+ * static initialisation because per-cpu memory isn't
+ * ready yet and because the memory allocators are not
+ * initialised either.
+ */
+ boot_done = 1;
+ base = &boot_tvec_bases;
+ }
+ tvec_base_done[cpu] = 1;
+ } else {
+ base = per_cpu(tvec_bases, cpu);
+ }
+
+ spin_lock_init(&base->lock);
+
+ for (j = 0; j < TVN_SIZE; j++) {
+ INIT_LIST_HEAD(base->tv5.vec + j);
+ INIT_LIST_HEAD(base->tv4.vec + j);
+ INIT_LIST_HEAD(base->tv3.vec + j);
+ INIT_LIST_HEAD(base->tv2.vec + j);
+ }
+ for (j = 0; j < TVR_SIZE; j++)
+ INIT_LIST_HEAD(base->tv1.vec + j);
+
+ base->timer_jiffies = jiffies;
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+{
+ struct timer_list *timer;
+
+ while (!list_empty(head)) {
+ timer = list_first_entry(head, struct timer_list, entry);
+ detach_timer(timer, 0);
+ timer_set_base(timer, new_base);
+ internal_add_timer(new_base, timer);
+ }
+}
+
+static void __cpuinit migrate_timers(int cpu)
+{
+ struct tvec_base *old_base;
+ struct tvec_base *new_base;
+ int i;
+
+ BUG_ON(cpu_online(cpu));
+ old_base = per_cpu(tvec_bases, cpu);
+ new_base = get_cpu_var(tvec_bases);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
+ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+ BUG_ON(old_base->running_timer);
+
+ for (i = 0; i < TVR_SIZE; i++)
+ migrate_timer_list(new_base, old_base->tv1.vec + i);
+ for (i = 0; i < TVN_SIZE; i++) {
+ migrate_timer_list(new_base, old_base->tv2.vec + i);
+ migrate_timer_list(new_base, old_base->tv3.vec + i);
+ migrate_timer_list(new_base, old_base->tv4.vec + i);
+ migrate_timer_list(new_base, old_base->tv5.vec + i);
+ }
+
+ spin_unlock(&old_base->lock);
+ spin_unlock_irq(&new_base->lock);
+ put_cpu_var(tvec_bases);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __cpuinit timer_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ switch(action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (init_timers_cpu(cpu) < 0)
+ return NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ migrate_timers(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata timers_nb = {
+ .notifier_call = timer_cpu_notify,
+};
+
+
+void __init init_timers(void)
+{
+ int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+
+ init_timer_stats();
+
+ BUG_ON(err == NOTIFY_BAD);
+ register_cpu_notifier(&timers_nb);
+ open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
+}
+
+/**
+ * msleep - sleep safely even with waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+void msleep(unsigned int msecs)
+{
+ unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+ while (timeout)
+ timeout = schedule_timeout_uninterruptible(timeout);
+}
+
+EXPORT_SYMBOL(msleep);
+
+/**
+ * msleep_interruptible - sleep waiting for signals
+ * @msecs: Time in milliseconds to sleep for
+ */
+unsigned long msleep_interruptible(unsigned int msecs)
+{
+ unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+ while (timeout && !signal_pending(current))
+ timeout = schedule_timeout_interruptible(timeout);
+ return jiffies_to_msecs(timeout);
+}
+
+EXPORT_SYMBOL(msleep_interruptible);
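+
+/*
+ * Usage sketch (illustrative; condition_met()/dev are hypothetical):
+ * poll a condition roughly every 50ms, giving up early on a signal:
+ *
+ *	while (!condition_met(dev)) {
+ *		if (msleep_interruptible(50))
+ *			return -EINTR;
+ *	}
+ */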
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 0000000..33dbefd
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,198 @@
+#
+# Architectures that offer a FUNCTION_TRACER implementation should
+# select HAVE_FUNCTION_TRACER:
+#
+
+config NOP_TRACER
+ bool
+
+config HAVE_FUNCTION_TRACER
+ bool
+
+config HAVE_DYNAMIC_FTRACE
+ bool
+
+config HAVE_FTRACE_MCOUNT_RECORD
+ bool
+
+config TRACER_MAX_TRACE
+ bool
+
+config RING_BUFFER
+ bool
+
+config TRACING
+ bool
+ select DEBUG_FS
+ select RING_BUFFER
+ select STACKTRACE if STACKTRACE_SUPPORT
+ select TRACEPOINTS
+ select NOP_TRACER
+
+menu "Tracers"
+
+config FUNCTION_TRACER
+ bool "Kernel Function Tracer"
+ depends on HAVE_FUNCTION_TRACER
+ depends on DEBUG_KERNEL
+ select FRAME_POINTER
+ select TRACING
+ select CONTEXT_SWITCH_TRACER
+ help
+	  Enable the kernel to trace every kernel function. This is done
+	  by using a compiler feature to insert a small, 5-byte No-Operation
+	  instruction at the beginning of every kernel function; that NOP
+	  sequence is then dynamically patched into a tracer call when
+	  tracing is enabled by the administrator. If tracing is disabled at
+	  runtime (the bootup default), the overhead of the instructions is
+	  very small and not measurable even in micro-benchmarks.
+
+config IRQSOFF_TRACER
+ bool "Interrupts-off Latency Tracer"
+ default n
+ depends on TRACE_IRQFLAGS_SUPPORT
+ depends on GENERIC_TIME
+ depends on DEBUG_KERNEL
+ select TRACE_IRQFLAGS
+ select TRACING
+ select TRACER_MAX_TRACE
+ help
+ This option measures the time spent in irqs-off critical
+ sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime
+	  via:
+
+ echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+ bool "Preemption-off Latency Tracer"
+ default n
+ depends on GENERIC_TIME
+ depends on PREEMPT
+ depends on DEBUG_KERNEL
+ select TRACING
+ select TRACER_MAX_TRACE
+ help
+	  This option measures the time spent in preemption-off critical
+ sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime
+	  via:
+
+ echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config SYSPROF_TRACER
+ bool "Sysprof Tracer"
+ depends on X86
+ select TRACING
+ help
+ This tracer provides the trace needed by the 'Sysprof' userspace
+ tool.
+
+config SCHED_TRACER
+ bool "Scheduling Latency Tracer"
+ depends on DEBUG_KERNEL
+ select TRACING
+ select CONTEXT_SWITCH_TRACER
+ select TRACER_MAX_TRACE
+ help
+ This tracer tracks the latency of the highest priority task
+ to be scheduled in, starting from the point it has woken up.
+
+config CONTEXT_SWITCH_TRACER
+ bool "Trace process context switches"
+ depends on DEBUG_KERNEL
+ select TRACING
+ select MARKERS
+ help
+ This tracer gets called from the context switch and records
+ all switching of tasks.
+
+config BOOT_TRACER
+ bool "Trace boot initcalls"
+ depends on DEBUG_KERNEL
+ select TRACING
+ select CONTEXT_SWITCH_TRACER
+ help
+ This tracer helps developers to optimize boot times: it records
+ the timings of the initcalls and traces key events and the identity
+ of tasks that can cause boot delays, such as context-switches.
+
+ Its aim is to be parsed by the /scripts/bootgraph.pl tool to
+ produce pretty graphics about boot inefficiencies, giving a visual
+ representation of the delays during initcalls - but the raw
+ /debug/tracing/trace text output is readable too.
+
+ ( Note that tracing self tests can't be enabled if this tracer is
+ selected, because the self-tests are an initcall as well and that
+ would invalidate the boot trace. )
+
+config STACK_TRACER
+ bool "Trace max stack"
+ depends on HAVE_FUNCTION_TRACER
+ depends on DEBUG_KERNEL
+ select FUNCTION_TRACER
+ select STACKTRACE
+ help
+ This special tracer records the maximum stack footprint of the
+ kernel and displays it in debugfs/tracing/stack_trace.
+
+ This tracer works by hooking into every function call that the
+ kernel executes, and keeping a maximum stack depth value and
+ stack-trace saved. Because this logic has to execute in every
+ kernel function, all the time, this option can slow down the
+ kernel measurably and is generally intended for kernel
+ developers only.
+
+ Say N if unsure.
+
+config DYNAMIC_FTRACE
+ bool "enable/disable ftrace tracepoints dynamically"
+ depends on FUNCTION_TRACER
+ depends on HAVE_DYNAMIC_FTRACE
+ depends on DEBUG_KERNEL
+ default y
+ help
+	  This option will modify all the calls to ftrace dynamically
+	  (will patch them out of the binary image and replace them
+	  with a No-Op instruction) as they are called. A table is
+	  created to dynamically enable them again.
+
+ This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
+ has native performance as long as no tracing is active.
+
+ The changes to the code are done by a kernel thread that
+ wakes up once a second and checks to see if any ftrace calls
+	  were made. If so, it runs stop_machine (stops all CPUs)
+ and modifies the code to jump over the call to ftrace.
+
+config FTRACE_MCOUNT_RECORD
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_FTRACE_MCOUNT_RECORD
+
+config FTRACE_SELFTEST
+ bool
+
+config FTRACE_STARTUP_TEST
+ bool "Perform a startup test on ftrace"
+ depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
+ select FTRACE_SELFTEST
+ help
+	  This option performs a series of startup tests on ftrace. On
+	  bootup, a series of tests is run to verify that the tracer is
+	  functioning properly. Tests are run on all the configured
+	  tracers of ftrace.
+
+endmenu
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 0000000..c8228b1
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,28 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FUNCTION_TRACER
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
+endif
+
+obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_NOP_TRACER) += trace_nop.o
+obj-$(CONFIG_STACK_TRACER) += trace_stack.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 0000000..78db083
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,1451 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+
+#include <asm/ftrace.h>
+
+#include "trace.h"
+
+#define FTRACE_WARN_ON(cond) \
+ do { \
+ if (WARN_ON(cond)) \
+ ftrace_kill(); \
+ } while (0)
+
+#define FTRACE_WARN_ON_ONCE(cond) \
+ do { \
+ if (WARN_ON_ONCE(cond)) \
+ ftrace_kill(); \
+ } while (0)
+
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
+static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+ .func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+ struct ftrace_ops *op = ftrace_list;
+
+ /* in case someone actually ports this to alpha! */
+ read_barrier_depends();
+
+ while (op != &ftrace_list_end) {
+ /* silly alpha */
+ read_barrier_depends();
+ op->func(ip, parent_ip);
+ op = op->next;
+ };
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing. There may be a brief lag before all CPUs stop
+ * calling into the old trace function.
+ */
+void clear_ftrace_function(void)
+{
+ ftrace_trace_function = ftrace_stub;
+}
+
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+
+ ops->next = ftrace_list;
+ /*
+ * We are entering ops into the ftrace_list but another
+ * CPU might be walking that list. We need to make sure
+ * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer inserted into the ftrace_list.
+ */
+ smp_wmb();
+ ftrace_list = ops;
+
+ if (ftrace_enabled) {
+ /*
+ * For one func, simply call it directly.
+ * For more than one func, call the chain.
+ */
+ if (ops->next == &ftrace_list_end)
+ ftrace_trace_function = ops->func;
+ else
+ ftrace_trace_function = ftrace_list_func;
+ }
+
+ spin_unlock(&ftrace_lock);
+
+ return 0;
+}
+
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+ struct ftrace_ops **p;
+ int ret = 0;
+
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+
+ /*
+ * If we are removing the last function, then simply point
+ * to the ftrace_stub.
+ */
+ if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+ ftrace_trace_function = ftrace_stub;
+ ftrace_list = &ftrace_list_end;
+ goto out;
+ }
+
+ for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+ if (*p == ops)
+ break;
+
+ if (*p != ops) {
+ ret = -1;
+ goto out;
+ }
+
+ *p = (*p)->next;
+
+ if (ftrace_enabled) {
+ /* If we only have one func left, then call that directly */
+ if (ftrace_list == &ftrace_list_end ||
+ ftrace_list->next == &ftrace_list_end)
+ ftrace_trace_function = ftrace_list->func;
+ }
+
+ out:
+ spin_unlock(&ftrace_lock);
+
+ return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+# error Dynamic ftrace depends on MCOUNT_RECORD
+#endif
+
+/*
+ * Since MCOUNT_ADDR may point to mcount itself, we do not want
+ * to get it confused by reading a reference in the code as we
+ * are parsing the objcopy output of the text section. Use a
+ * variable for it instead.
+ */
+static unsigned long mcount_addr = MCOUNT_ADDR;
+
+enum {
+ FTRACE_ENABLE_CALLS = (1 << 0),
+ FTRACE_DISABLE_CALLS = (1 << 1),
+ FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
+ FTRACE_ENABLE_MCOUNT = (1 << 3),
+ FTRACE_DISABLE_MCOUNT = (1 << 4),
+};
+
+static int ftrace_filtered;
+
+static LIST_HEAD(ftrace_new_addrs);
+
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+ struct ftrace_page *next;
+ unsigned long index;
+ struct dyn_ftrace records[];
+};
+
+#define ENTRIES_PER_PAGE \
+ ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT 10000
+
+static struct ftrace_page *ftrace_pages_start;
+static struct ftrace_page *ftrace_pages;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+
+#ifdef CONFIG_KPROBES
+
+static int frozen_record_count;
+
+static inline void freeze_record(struct dyn_ftrace *rec)
+{
+ if (!(rec->flags & FTRACE_FL_FROZEN)) {
+ rec->flags |= FTRACE_FL_FROZEN;
+ frozen_record_count++;
+ }
+}
+
+static inline void unfreeze_record(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_FROZEN) {
+ rec->flags &= ~FTRACE_FL_FROZEN;
+ frozen_record_count--;
+ }
+}
+
+static inline int record_frozen(struct dyn_ftrace *rec)
+{
+ return rec->flags & FTRACE_FL_FROZEN;
+}
+#else
+# define freeze_record(rec) ({ 0; })
+# define unfreeze_record(rec) ({ 0; })
+# define record_frozen(rec) ({ 0; })
+#endif /* CONFIG_KPROBES */
+
+static void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+ rec->ip = (unsigned long)ftrace_free_records;
+ ftrace_free_records = rec;
+ rec->flags |= FTRACE_FL_FREE;
+}
+
+void ftrace_release(void *start, unsigned long size)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+ unsigned long s = (unsigned long)start;
+ unsigned long e = s + size;
+ int i;
+
+ if (ftrace_disabled || !start)
+ return;
+
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+
+ if ((rec->ip >= s) && (rec->ip < e))
+ ftrace_free_rec(rec);
+ }
+ }
+ spin_unlock(&ftrace_lock);
+}
+
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+{
+ struct dyn_ftrace *rec;
+
+ /* First check for freed records */
+ if (ftrace_free_records) {
+ rec = ftrace_free_records;
+
+ if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+ FTRACE_WARN_ON_ONCE(1);
+ ftrace_free_records = NULL;
+ return NULL;
+ }
+
+ ftrace_free_records = (void *)rec->ip;
+ memset(rec, 0, sizeof(*rec));
+ return rec;
+ }
+
+ if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+ if (!ftrace_pages->next) {
+ /* allocate another page */
+ ftrace_pages->next =
+ (void *)get_zeroed_page(GFP_KERNEL);
+ if (!ftrace_pages->next)
+ return NULL;
+ }
+ ftrace_pages = ftrace_pages->next;
+ }
+
+ return &ftrace_pages->records[ftrace_pages->index++];
+}
+
+static struct dyn_ftrace *
+ftrace_record_ip(unsigned long ip)
+{
+ struct dyn_ftrace *rec;
+
+ if (!ftrace_enabled || ftrace_disabled)
+ return NULL;
+
+ rec = ftrace_alloc_dyn_node(ip);
+ if (!rec)
+ return NULL;
+
+ rec->ip = ip;
+
+ list_add(&rec->list, &ftrace_new_addrs);
+
+ return rec;
+}
+
+#define FTRACE_ADDR ((long)(ftrace_caller))
+
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec,
+ unsigned char *nop, int enable)
+{
+ unsigned long ip, fl;
+ unsigned char *call, *old, *new;
+
+ ip = rec->ip;
+
+ /*
+ * If this record is not to be traced and
+ * it is not enabled then do nothing.
+ *
+ * If this record is not to be traced and
+	 * it is enabled then disable it.
+ *
+ */
+ if (rec->flags & FTRACE_FL_NOTRACE) {
+ if (rec->flags & FTRACE_FL_ENABLED)
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ else
+ return 0;
+
+ } else if (ftrace_filtered && enable) {
+ /*
+ * Filtering is on:
+ */
+
+ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
+
+ /* Record is filtered and enabled, do nothing */
+ if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
+ return 0;
+
+		/* Record is not filtered and is not enabled, do nothing */
+ if (!fl)
+ return 0;
+
+ /* Record is not filtered but enabled, disable it */
+ if (fl == FTRACE_FL_ENABLED)
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ else
+ /* Otherwise record is filtered but not enabled, enable it */
+ rec->flags |= FTRACE_FL_ENABLED;
+ } else {
+		/* We are disabling, or no filter is in effect */
+
+ if (enable) {
+ /* if record is enabled, do nothing */
+ if (rec->flags & FTRACE_FL_ENABLED)
+ return 0;
+
+ rec->flags |= FTRACE_FL_ENABLED;
+
+ } else {
+
+ /* if record is not enabled do nothing */
+ if (!(rec->flags & FTRACE_FL_ENABLED))
+ return 0;
+
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ }
+ }
+
+ call = ftrace_call_replace(ip, FTRACE_ADDR);
+
+ if (rec->flags & FTRACE_FL_ENABLED) {
+ old = nop;
+ new = call;
+ } else {
+ old = call;
+ new = nop;
+ }
+
+ return ftrace_modify_code(ip, old, new);
+}
+
+static void ftrace_replace_code(int enable)
+{
+ int i, failed;
+ unsigned char *nop = NULL;
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+
+ nop = ftrace_nop_replace();
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+
+ /* don't modify code that has already faulted */
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+
+ /* ignore updates to this record's mcount site */
+ if (get_kprobe((void *)rec->ip)) {
+ freeze_record(rec);
+ continue;
+ } else {
+ unfreeze_record(rec);
+ }
+
+ failed = __ftrace_replace_code(rec, nop, enable);
+ if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+ rec->flags |= FTRACE_FL_FAILED;
+ if ((system_state == SYSTEM_BOOTING) ||
+ !core_kernel_text(rec->ip)) {
+ ftrace_free_rec(rec);
+ }
+ }
+ }
+ }
+}
+
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+ int i;
+
+ printk(KERN_CONT "%s", fmt);
+
+ for (i = 0; i < MCOUNT_INSN_SIZE; i++)
+ printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+}
+
+static int
+ftrace_code_disable(struct dyn_ftrace *rec)
+{
+ unsigned long ip;
+ unsigned char *nop, *call;
+ int ret;
+
+ ip = rec->ip;
+
+ nop = ftrace_nop_replace();
+ call = ftrace_call_replace(ip, mcount_addr);
+
+ ret = ftrace_modify_code(ip, call, nop);
+ if (ret) {
+ switch (ret) {
+ case -EFAULT:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on modifying ");
+ print_ip_sym(ip);
+ break;
+ case -EINVAL:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace failed to modify ");
+ print_ip_sym(ip);
+ print_ip_ins(" expected: ", call);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" replace: ", nop);
+ printk(KERN_CONT "\n");
+ break;
+ case -EPERM:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on writing ");
+ print_ip_sym(ip);
+ break;
+ default:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on unknown error ");
+ print_ip_sym(ip);
+ }
+
+ rec->flags |= FTRACE_FL_FAILED;
+ return 0;
+ }
+ return 1;
+}
+
+static int __ftrace_modify_code(void *data)
+{
+ int *command = data;
+
+ if (*command & FTRACE_ENABLE_CALLS)
+ ftrace_replace_code(1);
+ else if (*command & FTRACE_DISABLE_CALLS)
+ ftrace_replace_code(0);
+
+ if (*command & FTRACE_UPDATE_TRACE_FUNC)
+ ftrace_update_ftrace_func(ftrace_trace_function);
+
+ return 0;
+}
+
+static void ftrace_run_update_code(int command)
+{
+ stop_machine(__ftrace_modify_code, &command, NULL);
+}
+
+static ftrace_func_t saved_ftrace_func;
+static int ftrace_start;
+static DEFINE_MUTEX(ftrace_start_lock);
+
+static void ftrace_startup(void)
+{
+ int command = 0;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_start_lock);
+ ftrace_start++;
+ command |= FTRACE_ENABLE_CALLS;
+
+ if (saved_ftrace_func != ftrace_trace_function) {
+ saved_ftrace_func = ftrace_trace_function;
+ command |= FTRACE_UPDATE_TRACE_FUNC;
+ }
+
+ if (!command || !ftrace_enabled)
+ goto out;
+
+ ftrace_run_update_code(command);
+ out:
+ mutex_unlock(&ftrace_start_lock);
+}
+
+static void ftrace_shutdown(void)
+{
+ int command = 0;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_start_lock);
+ ftrace_start--;
+ if (!ftrace_start)
+ command |= FTRACE_DISABLE_CALLS;
+
+ if (saved_ftrace_func != ftrace_trace_function) {
+ saved_ftrace_func = ftrace_trace_function;
+ command |= FTRACE_UPDATE_TRACE_FUNC;
+ }
+
+ if (!command || !ftrace_enabled)
+ goto out;
+
+ ftrace_run_update_code(command);
+ out:
+ mutex_unlock(&ftrace_start_lock);
+}
+
+static void ftrace_startup_sysctl(void)
+{
+ int command = FTRACE_ENABLE_MCOUNT;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_start_lock);
+ /* Force update next time */
+ saved_ftrace_func = NULL;
+ /* ftrace_start is true if we want ftrace running */
+ if (ftrace_start)
+ command |= FTRACE_ENABLE_CALLS;
+
+ ftrace_run_update_code(command);
+ mutex_unlock(&ftrace_start_lock);
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+ int command = FTRACE_DISABLE_MCOUNT;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_start_lock);
+ /* ftrace_start is true if ftrace is running */
+ if (ftrace_start)
+ command |= FTRACE_DISABLE_CALLS;
+
+ ftrace_run_update_code(command);
+ mutex_unlock(&ftrace_start_lock);
+}
+
+static cycle_t ftrace_update_time;
+static unsigned long ftrace_update_cnt;
+unsigned long ftrace_update_tot_cnt;
+
+static int ftrace_update_code(void)
+{
+ struct dyn_ftrace *p, *t;
+ cycle_t start, stop;
+
+ start = ftrace_now(raw_smp_processor_id());
+ ftrace_update_cnt = 0;
+
+ list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
+
+ /* If something went wrong, bail without enabling anything */
+ if (unlikely(ftrace_disabled))
+ return -1;
+
+ list_del_init(&p->list);
+
+		/* convert record (i.e., patch mcount-call with NOP) */
+ if (ftrace_code_disable(p)) {
+ p->flags |= FTRACE_FL_CONVERTED;
+ ftrace_update_cnt++;
+ } else
+ ftrace_free_rec(p);
+ }
+
+ stop = ftrace_now(raw_smp_processor_id());
+ ftrace_update_time = stop - start;
+ ftrace_update_tot_cnt += ftrace_update_cnt;
+
+ return 0;
+}
+
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+{
+ struct ftrace_page *pg;
+ int cnt;
+ int i;
+
+ /* allocate a few pages */
+ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!ftrace_pages_start)
+ return -1;
+
+ /*
+ * Allocate a few more pages.
+ *
+ * TODO: have some parser search vmlinux before
+ * final linking to find all calls to ftrace.
+ * Then we can:
+ * a) know how many pages to allocate.
+ * and/or
+ * b) set up the table then.
+ *
+ * The dynamic code is still necessary for
+ * modules.
+ */
+
+ pg = ftrace_pages = ftrace_pages_start;
+
+ cnt = num_to_init / ENTRIES_PER_PAGE;
+ pr_info("ftrace: allocating %ld entries in %d pages\n",
+ num_to_init, cnt + 1);
+
+ for (i = 0; i < cnt; i++) {
+ pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+ /* If we fail, we'll try later anyway */
+ if (!pg->next)
+ break;
+
+ pg = pg->next;
+ }
+
+ return 0;
+}
+
+enum {
+ FTRACE_ITER_FILTER = (1 << 0),
+ FTRACE_ITER_CONT = (1 << 1),
+ FTRACE_ITER_NOTRACE = (1 << 2),
+ FTRACE_ITER_FAILURES = (1 << 3),
+};
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+struct ftrace_iterator {
+ loff_t pos;
+ struct ftrace_page *pg;
+ unsigned idx;
+ unsigned flags;
+ unsigned char buffer[FTRACE_BUFF_MAX+1];
+ unsigned buffer_idx;
+ unsigned filtered;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct dyn_ftrace *rec = NULL;
+
+ (*pos)++;
+
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+ retry:
+ if (iter->idx >= iter->pg->index) {
+ if (iter->pg->next) {
+ iter->pg = iter->pg->next;
+ iter->idx = 0;
+ goto retry;
+ }
+ } else {
+ rec = &iter->pg->records[iter->idx++];
+ if ((rec->flags & FTRACE_FL_FREE) ||
+
+ (!(iter->flags & FTRACE_ITER_FAILURES) &&
+ (rec->flags & FTRACE_FL_FAILED)) ||
+
+ ((iter->flags & FTRACE_ITER_FAILURES) &&
+ !(rec->flags & FTRACE_FL_FAILED)) ||
+
+ ((iter->flags & FTRACE_ITER_FILTER) &&
+ !(rec->flags & FTRACE_FL_FILTER)) ||
+
+ ((iter->flags & FTRACE_ITER_NOTRACE) &&
+ !(rec->flags & FTRACE_FL_NOTRACE))) {
+ rec = NULL;
+ goto retry;
+ }
+ }
+ spin_unlock(&ftrace_lock);
+
+ iter->pos = *pos;
+
+ return rec;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l = -1;
+
+ if (*pos > iter->pos)
+ *pos = iter->pos;
+
+ l = *pos;
+ p = t_next(m, p, &l);
+
+ return p;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct dyn_ftrace *rec = v;
+ char str[KSYM_SYMBOL_LEN];
+ int ret = 0;
+
+ if (!rec)
+ return 0;
+
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+ ret = seq_printf(m, "%s\n", str);
+ if (ret < 0) {
+ iter->pos--;
+ iter->idx--;
+ }
+
+ return 0;
+}
+
+static struct seq_operations show_ftrace_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->pos = 0;
+
+ ret = seq_open(file, &show_ftrace_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+
+ m->private = iter;
+ } else {
+ kfree(iter);
+ }
+
+ return ret;
+}
+
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ struct ftrace_iterator *iter = m->private;
+
+ seq_release(inode, file);
+ kfree(iter);
+
+ return 0;
+}
+
+static int
+ftrace_failures_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ struct ftrace_iterator *iter;
+
+ ret = ftrace_avail_open(inode, file);
+ if (!ret) {
+ m = (struct seq_file *)file->private_data;
+ iter = (struct ftrace_iterator *)m->private;
+ iter->flags = FTRACE_ITER_FAILURES;
+ }
+
+ return ret;
+}
+
+
+static void ftrace_filter_reset(int enable)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+ unsigned i;
+
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+ if (enable)
+ ftrace_filtered = 0;
+ pg = ftrace_pages_start;
+ while (pg) {
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+ rec->flags &= ~type;
+ }
+ pg = pg->next;
+ }
+ spin_unlock(&ftrace_lock);
+}
+
+static int
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+{
+ struct ftrace_iterator *iter;
+ int ret = 0;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ mutex_lock(&ftrace_regex_lock);
+ if ((file->f_mode & FMODE_WRITE) &&
+ !(file->f_flags & O_APPEND))
+ ftrace_filter_reset(enable);
+
+ if (file->f_mode & FMODE_READ) {
+ iter->pg = ftrace_pages_start;
+ iter->pos = 0;
+ iter->flags = enable ? FTRACE_ITER_FILTER :
+ FTRACE_ITER_NOTRACE;
+
+ ret = seq_open(file, &show_ftrace_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = iter;
+ } else
+ kfree(iter);
+ } else
+ file->private_data = iter;
+ mutex_unlock(&ftrace_regex_lock);
+
+ return ret;
+}
+
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(inode, file, 1);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(inode, file, 0);
+}
+
+static ssize_t
+ftrace_regex_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ if (file->f_mode & FMODE_READ)
+ return seq_read(file, ubuf, cnt, ppos);
+ else
+ return -EPERM;
+}
+
+static loff_t
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_lseek(file, offset, origin);
+ else
+ file->f_pos = ret = 1;
+
+ return ret;
+}
+
+enum {
+ MATCH_FULL,
+ MATCH_FRONT_ONLY,
+ MATCH_MIDDLE_ONLY,
+ MATCH_END_ONLY,
+};
+
+static void
+ftrace_match(unsigned char *buff, int len, int enable)
+{
+ char str[KSYM_SYMBOL_LEN];
+ char *search = NULL;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int type = MATCH_FULL;
+ unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+ unsigned i, match = 0, search_len = 0;
+
+ for (i = 0; i < len; i++) {
+ if (buff[i] == '*') {
+ if (!i) {
+ search = buff + i + 1;
+ type = MATCH_END_ONLY;
+ search_len = len - (i + 1);
+ } else {
+ if (type == MATCH_END_ONLY) {
+ type = MATCH_MIDDLE_ONLY;
+ } else {
+ match = i;
+ type = MATCH_FRONT_ONLY;
+ }
+ buff[i] = 0;
+ break;
+ }
+ }
+ }
+
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+ if (enable)
+ ftrace_filtered = 1;
+ pg = ftrace_pages_start;
+ while (pg) {
+ for (i = 0; i < pg->index; i++) {
+ int matched = 0;
+ char *ptr;
+
+ rec = &pg->records[i];
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ switch (type) {
+ case MATCH_FULL:
+ if (strcmp(str, buff) == 0)
+ matched = 1;
+ break;
+ case MATCH_FRONT_ONLY:
+ if (memcmp(str, buff, match) == 0)
+ matched = 1;
+ break;
+ case MATCH_MIDDLE_ONLY:
+ if (strstr(str, search))
+ matched = 1;
+ break;
+ case MATCH_END_ONLY:
+ ptr = strstr(str, search);
+ if (ptr && (ptr[search_len] == 0))
+ matched = 1;
+ break;
+ }
+ if (matched)
+ rec->flags |= flag;
+ }
+ pg = pg->next;
+ }
+ spin_unlock(&ftrace_lock);
+}
+
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, int enable)
+{
+ struct ftrace_iterator *iter;
+ char ch;
+ size_t read = 0;
+ ssize_t ret;
+
+ if (!cnt || cnt < 0)
+ return 0;
+
+ mutex_lock(&ftrace_regex_lock);
+
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+ iter = m->private;
+ } else
+ iter = file->private_data;
+
+ if (!*ppos) {
+ iter->flags &= ~FTRACE_ITER_CONT;
+ iter->buffer_idx = 0;
+ }
+
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+
+ if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+ /* skip white space */
+ while (cnt && isspace(ch)) {
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+ }
+
+ if (isspace(ch)) {
+ file->f_pos += read;
+ ret = read;
+ goto out;
+ }
+
+ iter->buffer_idx = 0;
+ }
+
+ while (cnt && !isspace(ch)) {
+ if (iter->buffer_idx < FTRACE_BUFF_MAX)
+ iter->buffer[iter->buffer_idx++] = ch;
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+ }
+
+ if (isspace(ch)) {
+ iter->filtered++;
+ iter->buffer[iter->buffer_idx] = 0;
+ ftrace_match(iter->buffer, iter->buffer_idx, enable);
+ iter->buffer_idx = 0;
+ } else
+ iter->flags |= FTRACE_ITER_CONT;
+
+
+ file->f_pos += read;
+
+ ret = read;
+ out:
+ mutex_unlock(&ftrace_regex_lock);
+
+ return ret;
+}
+
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_regex_lock);
+ if (reset)
+ ftrace_filter_reset(enable);
+ if (buf)
+ ftrace_match(buf, len, enable);
+ mutex_unlock(&ftrace_regex_lock);
+}
+
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(buf, len, reset, 1);
+}
+
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(buf, len, reset, 0);
+}
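A brief, hypothetical sketch of how the two setters above are typically
called (the "sched_*" pattern is an invented example; wildcards are
handled by ftrace_match() above):

	/* hypothetical: trace only scheduler functions */
	static unsigned char sched_pat[] = "sched_*";

	ftrace_set_filter(sched_pat, sizeof(sched_pat) - 1, 1);	/* reset, then filter */
	ftrace_set_notrace(NULL, 0, 1);				/* clear the notrace list */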
+
+static int
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ struct ftrace_iterator *iter;
+
+ mutex_lock(&ftrace_regex_lock);
+ if (file->f_mode & FMODE_READ) {
+ iter = m->private;
+
+ seq_release(inode, file);
+ } else
+ iter = file->private_data;
+
+ if (iter->buffer_idx) {
+ iter->filtered++;
+ iter->buffer[iter->buffer_idx] = 0;
+ ftrace_match(iter->buffer, iter->buffer_idx, enable);
+ }
+
+ mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftrace_start_lock);
+ if (ftrace_start && ftrace_enabled)
+ ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ mutex_unlock(&ftrace_start_lock);
+ mutex_unlock(&ftrace_sysctl_lock);
+
+ kfree(iter);
+ mutex_unlock(&ftrace_regex_lock);
+ return 0;
+}
+
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_release(inode, file, 1);
+}
+
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_release(inode, file, 0);
+}
+
+static struct file_operations ftrace_avail_fops = {
+ .open = ftrace_avail_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_failures_fops = {
+ .open = ftrace_failures_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_filter_fops = {
+ .open = ftrace_filter_open,
+ .read = ftrace_regex_read,
+ .write = ftrace_filter_write,
+ .llseek = ftrace_regex_lseek,
+ .release = ftrace_filter_release,
+};
+
+static struct file_operations ftrace_notrace_fops = {
+ .open = ftrace_notrace_open,
+ .read = ftrace_regex_read,
+ .write = ftrace_notrace_write,
+ .llseek = ftrace_regex_lseek,
+ .release = ftrace_notrace_release,
+};
+
+static __init int ftrace_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("available_filter_functions", 0444,
+ d_tracer, NULL, &ftrace_avail_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'available_filter_functions' entry\n");
+
+ entry = debugfs_create_file("failures", 0444,
+ d_tracer, NULL, &ftrace_failures_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'failures' entry\n");
+
+ entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+ NULL, &ftrace_filter_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'set_ftrace_filter' entry\n");
+
+ entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+ NULL, &ftrace_notrace_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'set_ftrace_notrace' entry\n");
+
+ return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
+static int ftrace_convert_nops(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *p;
+ unsigned long addr;
+ unsigned long flags;
+
+ mutex_lock(&ftrace_start_lock);
+ p = start;
+ while (p < end) {
+ addr = ftrace_call_adjust(*p++);
+ ftrace_record_ip(addr);
+ }
+
+ /* disable interrupts to prevent kstop machine */
+ local_irq_save(flags);
+ ftrace_update_code();
+ local_irq_restore(flags);
+ mutex_unlock(&ftrace_start_lock);
+
+ return 0;
+}
+
+void ftrace_init_module(unsigned long *start, unsigned long *end)
+{
+ if (ftrace_disabled || start == end)
+ return;
+ ftrace_convert_nops(start, end);
+}
+
+extern unsigned long __start_mcount_loc[];
+extern unsigned long __stop_mcount_loc[];
+
+void __init ftrace_init(void)
+{
+ unsigned long count, addr, flags;
+ int ret;
+
+ /* Keep the ftrace pointer to the stub */
+ addr = (unsigned long)ftrace_stub;
+
+ local_irq_save(flags);
+ ftrace_dyn_arch_init(&addr);
+ local_irq_restore(flags);
+
+ /* ftrace_dyn_arch_init places the return code in addr */
+ if (addr)
+ goto failed;
+
+ count = __stop_mcount_loc - __start_mcount_loc;
+
+ ret = ftrace_dyn_table_alloc(count);
+ if (ret)
+ goto failed;
+
+ last_ftrace_enabled = ftrace_enabled = 1;
+
+ ret = ftrace_convert_nops(__start_mcount_loc,
+ __stop_mcount_loc);
+
+ return;
+ failed:
+ ftrace_disabled = 1;
+}
+
+#else
+
+static int __init ftrace_nodyn_init(void)
+{
+ ftrace_enabled = 1;
+ return 0;
+}
+device_initcall(ftrace_nodyn_init);
+
+# define ftrace_startup() do { } while (0)
+# define ftrace_shutdown() do { } while (0)
+# define ftrace_startup_sysctl() do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/**
+ * ftrace_kill - kill ftrace
+ *
+ * This function should be used by panic code. It stops ftrace
+ * but in a not-so-nice way: it simply disables everything and
+ * resets the trace function to the stub, without taking any locks.
+ */
+void ftrace_kill(void)
+{
+ ftrace_disabled = 1;
+ ftrace_enabled = 0;
+ clear_ftrace_function();
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ * with "notrace", otherwise it will go into a
+ * recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -1;
+
+ mutex_lock(&ftrace_sysctl_lock);
+ ret = __register_ftrace_function(ops);
+ ftrace_startup();
+ mutex_unlock(&ftrace_sysctl_lock);
+
+ return ret;
+}
+
+/**
+ * unregister_ftrace_function - unregister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+ int ret;
+
+ mutex_lock(&ftrace_sysctl_lock);
+ ret = __unregister_ftrace_function(ops);
+ ftrace_shutdown();
+ mutex_unlock(&ftrace_sysctl_lock);
+
+ return ret;
+}
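A hypothetical, minimal caller of the register/unregister pair above
(my_trace_func, my_ops and my_hit_count are invented names); note the
notrace annotation required by the comment block above:

	static unsigned long my_hit_count;	/* hypothetical counter */

	static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip)
	{
		my_hit_count++;		/* must not call any traceable function */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_trace_func,
	};

	/* in module init: */
	register_ftrace_function(&my_ops);

	/* in module exit: */
	unregister_ftrace_function(&my_ops);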
+
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ mutex_lock(&ftrace_sysctl_lock);
+
+ ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+ if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+ goto out;
+
+ last_ftrace_enabled = ftrace_enabled;
+
+ if (ftrace_enabled) {
+
+ ftrace_startup_sysctl();
+
+ /* we are starting ftrace again */
+ if (ftrace_list != &ftrace_list_end) {
+ if (ftrace_list->next == &ftrace_list_end)
+ ftrace_trace_function = ftrace_list->func;
+ else
+ ftrace_trace_function = ftrace_list_func;
+ }
+
+ } else {
+ /* stopping ftrace calls (just send to ftrace_stub) */
+ ftrace_trace_function = ftrace_stub;
+
+ ftrace_shutdown_sysctl();
+ }
+
+ out:
+ mutex_unlock(&ftrace_sysctl_lock);
+ return ret;
+}
+
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 0000000..71202ac
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,2201 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h> /* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+/* Global flag to disable all recording to ring buffers */
+static int ring_buffers_off __read_mostly;
+
+/**
+ * tracing_on - enable all tracing buffers
+ *
+ * This function enables all tracing buffers that may have been
+ * disabled with tracing_off.
+ */
+void tracing_on(void)
+{
+ ring_buffers_off = 0;
+}
+
+/**
+ * tracing_off - turn off all tracing buffers
+ *
+ * This function stops all tracing buffers from recording data.
+ * It does not disable any overhead the tracers themselves may
+ * be causing. This function simply causes all recording to
+ * the ring buffers to fail.
+ */
+void tracing_off(void)
+{
+ ring_buffers_off = 1;
+}
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ u64 time;
+
+ preempt_disable_notrace();
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ time = sched_clock() << DEBUG_SHIFT;
+ preempt_enable_notrace();
+
+ return time;
+}
+
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA 28
+
+enum {
+ RB_LEN_TIME_EXTEND = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/* inline for ring buffer fast paths */
+static inline unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return RB_LEN_TIME_EXTEND;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RINGBUF_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+unsigned ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ return rb_event_length(event);
+}
+
+/* inline for ring buffer fast paths */
+static inline void *
+rb_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RINGBUF_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+void *ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ return rb_event_data(event);
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack is stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ u64 time_stamp; /* page time stamp */
+ local_t write; /* index for next write */
+	local_t		write;		/* index for next write */
+	local_t		commit;		/* write committed index */
+ unsigned read; /* index for next read */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
+};
+
+/*
+ * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
+ * this issue out.
+ */
+static inline void free_buffer_page(struct buffer_page *bpage)
+{
+ if (bpage->page)
+ free_page((unsigned long)bpage->page);
+ kfree(bpage);
+}
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ struct buffer_page *head_page; /* read from head */
+ struct buffer_page *tail_page; /* write to tail */
+	struct buffer_page		*commit_page;	/* committed pages */
+ struct buffer_page *reader_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+#define RB_WARN_ON(buffer, cond) \
+ do { \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+#define RB_WARN_ON_RET(buffer, cond) \
+ do { \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ } \
+ } while (0)
+
+#define RB_WARN_ON_ONCE(buffer, cond) \
+ do { \
+ static int once; \
+ if (unlikely(cond) && !once) { \
+ once++; \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+/**
+ * rb_check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safety measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
+ RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ RB_WARN_ON_RET(cpu_buffer,
+ page->list.next->prev != &page->list);
+ RB_WARN_ON_RET(cpu_buffer,
+ page->list.prev->next != &page->list);
+ }
+
+ return 0;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+ unsigned long addr;
+ LIST_HEAD(pages);
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page->page = (void *)addr;
+ }
+
+ list_splice(&pages, head);
+
+ rb_check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
+ unsigned long addr;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ spin_lock_init(&cpu_buffer->lock);
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto fail_free_buffer;
+
+ cpu_buffer->reader_page = page;
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
+ INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_reader;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ return cpu_buffer;
+
+ fail_free_reader:
+ free_buffer_page(cpu_buffer->reader_page);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ list_del_init(&cpu_buffer->reader_page->list);
+ free_buffer_page(cpu_buffer->reader_page);
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ kfree(cpu_buffer);
+}
+
+/*
+ * Causes compile errors if the struct buffer_page gets bigger
+ * than the struct page.
+ */
+extern int ring_buffer_page_too_big(void);
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu;
+
+ /* Paranoid! Optimizes out when all is well */
+ if (sizeof(struct buffer_page) > sizeof(struct page))
+ ring_buffer_page_too_big();
+
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ buffer->cpumask = cpu_possible_map;
+ buffer->cpus = nr_cpu_ids;
+
+ bsize = sizeof(void *) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_buffer;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
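A small, hypothetical sketch of the alloc/free pair above (the one
megabyte size is an arbitrary choice for illustration):

	struct ring_buffer *rb;

	/* size is rounded up to whole pages; overwrite oldest data when full */
	rb = ring_buffer_alloc(1024 * 1024, RB_FL_OVERWRITE);
	if (!rb)
		return -ENOMEM;

	/* ... write and read events here ... */

	ring_buffer_free(rb);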
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ list_add_tail(&page->list, &cpu_buffer->pages);
+ }
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns the (page-aligned) new size on success, -ENOMEM on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct buffer_page *page, *tmp;
+ unsigned long buffer_size;
+ unsigned long addr;
+ LIST_HEAD(pages);
+ int i, cpu;
+
+ /*
+ * Always succeed at resizing a non-existent buffer:
+ */
+ if (!buffer)
+ return size;
+
+ size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+	 * linked list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ for (i = 0; i < new_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page),
+ cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page->page = (void *)addr;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ mutex_unlock(&buffer->mutex);
+ return -ENOMEM;
+}
+
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RINGBUF_TYPE_PADDING;
+}
+
+static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
+{
+ return page->page + index;
+}
+
+static inline struct ring_buffer_event *
+rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return __rb_page_index(cpu_buffer->reader_page,
+ cpu_buffer->reader_page->read);
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return __rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head_page->read);
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return __rb_page_index(iter->head_page, iter->head);
+}
+
+static inline unsigned rb_page_write(struct buffer_page *bpage)
+{
+ return local_read(&bpage->write);
+}
+
+static inline unsigned rb_page_commit(struct buffer_page *bpage)
+{
+ return local_read(&bpage->commit);
+}
+
+/* Size is determined by what has been committed */
+static inline unsigned rb_page_size(struct buffer_page *bpage)
+{
+ return rb_page_commit(bpage);
+}
+
+static inline unsigned
+rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_commit(cpu_buffer->commit_page);
+}
+
+static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_commit(cpu_buffer->head_page);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += rb_event_length(event)) {
+
+ event = __rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
+ /* Only count data entries */
+ if (event->type != RINGBUF_TYPE_DATA)
+ continue;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **page)
+{
+ struct list_head *p = (*page)->list.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct buffer_page, list);
+}
+
+static inline unsigned
+rb_event_index(struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+
+ return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
+}
+
+static inline int
+rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static inline void
+rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ while (cpu_buffer->commit_page->page != (void *)addr) {
+ RB_WARN_ON(cpu_buffer,
+ cpu_buffer->commit_page == cpu_buffer->tail_page);
+ cpu_buffer->commit_page->commit =
+ cpu_buffer->commit_page->write;
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+ }
+
+ /* Now set the commit to the event's index */
+ local_set(&cpu_buffer->commit_page->commit, index);
+}
+
+static inline void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ /*
+ * We only race with interrupts and NMIs on this CPU.
+ * If we own the commit event, then we can commit
+ * all others that interrupted us, since the interruptions
+ * are in stack format (they finish before they come
+ * back to us). This allows us to do a simple loop to
+ * assign the commit to the tail.
+ */
+ again:
+ while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ cpu_buffer->commit_page->commit =
+ cpu_buffer->commit_page->write;
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+ /* add barrier to keep gcc from optimizing too much */
+ barrier();
+ }
+ while (rb_commit_index(cpu_buffer) !=
+ rb_page_write(cpu_buffer->commit_page)) {
+ cpu_buffer->commit_page->commit =
+ cpu_buffer->commit_page->write;
+ barrier();
+ }
+
+ /* again, keep gcc from optimizing */
+ barrier();
+
+ /*
+ * If an interrupt came in just after the first while loop
+ * and pushed the tail page forward, we will be left with
+ * a dangling commit that will never go forward.
+ */
+ if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ goto again;
+}
+
+static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
+ cpu_buffer->reader_page->read = 0;
+}
+
+static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ /*
+ * The iterator could be on the reader page (it starts there).
+ * But the head could have moved, since the reader was
+ * found. Check for this case and assign the iterator
+ * to the head page instead of next.
+ */
+ if (iter->head_page == cpu_buffer->reader_page)
+ iter->head_page = cpu_buffer->head_page;
+ else
+ rb_inc_page(cpu_buffer, &iter->head_page);
+
+ iter->read_stamp = iter->head_page->time_stamp;
+ iter->head = 0;
+}
+
+/**
+ * rb_update_event - update event type and data
+ * @event: the event to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+rb_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RINGBUF_TYPE_PADDING:
+ break;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ event->len =
+ (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+	/* zero length can cause confusion */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
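+
+/*
+ * Worked example of the rounding above, assuming the usual layout
+ * where RB_EVNT_HDR_SIZE and RB_ALIGNMENT are both 4 (see the
+ * definitions earlier in this file):
+ *
+ *	rb_calculate_event_length(3) = ALIGN(3 + 4, 4) = 8
+ *
+ * i.e. a 3 byte payload still occupies 8 bytes on the page.  Payloads
+ * larger than RB_MAX_SMALL_DATA pay for one extra array[0] word so
+ * that the length can be stored out of line.
+ */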
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
+ unsigned long tail, write;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+ unsigned long flags;
+
+ commit_page = cpu_buffer->commit_page;
+ /* we just need to protect against interrupts */
+ barrier();
+ tail_page = cpu_buffer->tail_page;
+ write = local_add_return(length, &tail_page->write);
+ tail = write - length;
+
+	/* See if we shot past the end of this buffer page */
+ if (write > BUF_PAGE_SIZE) {
+ struct buffer_page *next_page = tail_page;
+
+ spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+ rb_inc_page(cpu_buffer, &next_page);
+
+ head_page = cpu_buffer->head_page;
+ reader_page = cpu_buffer->reader_page;
+
+ /* we grabbed the lock before incrementing */
+ RB_WARN_ON(cpu_buffer, next_page == reader_page);
+
+ /*
+ * If for some reason, we had an interrupt storm that made
+ * it all the way around the buffer, bail, and warn
+ * about it.
+ */
+ if (unlikely(next_page == commit_page)) {
+ WARN_ON_ONCE(1);
+ goto out_unlock;
+ }
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE)) {
+ /* reset write */
+ if (tail <= BUF_PAGE_SIZE)
+ local_set(&tail_page->write, tail);
+ goto out_unlock;
+ }
+
+ /* tail_page has not moved yet? */
+ if (tail_page == cpu_buffer->tail_page) {
+ /* count overflows */
+ rb_update_overflow(cpu_buffer);
+
+ rb_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ cpu_buffer->head_page->read = 0;
+ }
+ }
+
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ local_set(&next_page->write, 0);
+ local_set(&next_page->commit, 0);
+ cpu_buffer->tail_page = next_page;
+
+ /* reread the time stamp */
+ *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+ cpu_buffer->tail_page->time_stamp = *ts;
+ }
+
+ /*
+ * The actual tail page has moved forward.
+ */
+ if (tail < BUF_PAGE_SIZE) {
+ /* Mark the rest of the page with padding */
+ event = __rb_page_index(tail_page, tail);
+ event->type = RINGBUF_TYPE_PADDING;
+ }
+
+ if (tail <= BUF_PAGE_SIZE)
+ /* Set the write back to the previous setting */
+ local_set(&tail_page->write, tail);
+
+ /*
+ * If this was a commit entry that failed,
+ * increment that too
+ */
+ if (tail_page == cpu_buffer->commit_page &&
+ tail == rb_commit_index(cpu_buffer)) {
+ rb_set_commit_to_write(cpu_buffer);
+ }
+
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+ /* fail and let the caller try again */
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We reserved something on the buffer */
+
+ BUG_ON(write > BUF_PAGE_SIZE);
+
+ event = __rb_page_index(tail_page, tail);
+ rb_update_event(event, type, length);
+
+ /*
+ * If this is a commit and the tail is zero, then update
+ * this page's time stamp.
+ */
+ if (!tail && rb_is_commit(cpu_buffer, event))
+ cpu_buffer->commit_page->time_stamp = *ts;
+
+ return event;
+
+ out_unlock:
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+ return NULL;
+}
+
+static int
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ u64 *ts, u64 *delta)
+{
+ struct ring_buffer_event *event;
+ static int once;
+ int ret;
+
+ if (unlikely(*delta > (1ULL << 59) && !once++)) {
+ printk(KERN_WARNING "Delta way too big! %llu"
+ " ts=%llu write stamp = %llu\n",
+ (unsigned long long)*delta,
+ (unsigned long long)*ts,
+ (unsigned long long)cpu_buffer->write_stamp);
+ WARN_ON(1);
+ }
+
+ /*
+	 * The delta is too big; we need to add a
+	 * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RINGBUF_TYPE_TIME_EXTEND,
+ RB_LEN_TIME_EXTEND,
+ ts);
+ if (!event)
+ return -EBUSY;
+
+ if (PTR_ERR(event) == -EAGAIN)
+ return -EAGAIN;
+
+	/* Only a committed time event can update the write stamp */
+ if (rb_is_commit(cpu_buffer, event)) {
+ /*
+ * If this is the first on the page, then we need to
+ * update the page itself, and just put in a zero.
+ */
+ if (rb_event_index(event)) {
+ event->time_delta = *delta & TS_MASK;
+ event->array[0] = *delta >> TS_SHIFT;
+ } else {
+ cpu_buffer->commit_page->time_stamp = *ts;
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
+ cpu_buffer->write_stamp = *ts;
+ /* let the caller know this was the commit */
+ ret = 1;
+ } else {
+ /* Darn, this is just wasted space */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ ret = 0;
+ }
+
+ *delta = 0;
+
+ return ret;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int commit = 0;
+ int nr_loops = 0;
+
+ again:
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (unlikely(++nr_loops > 1000)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ /*
+ * Only the first commit can update the timestamp.
+ * Yes there is a race here. If an interrupt comes in
+ * just after the conditional and it traces too, then it
+ * will also check the deltas. More than one timestamp may
+ * also be made. But only the entry that did the actual
+ * commit will be something other than zero.
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
+ rb_page_write(cpu_buffer->tail_page) ==
+ rb_commit_index(cpu_buffer)) {
+
+ delta = ts - cpu_buffer->write_stamp;
+
+ /* make sure this delta is calculated here */
+ barrier();
+
+ /* Did the write stamp get updated already? */
+ if (unlikely(ts < cpu_buffer->write_stamp))
+ delta = 0;
+
+ if (test_time_stamp(delta)) {
+
+ commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
+
+ if (commit == -EBUSY)
+ return NULL;
+
+ if (commit == -EAGAIN)
+ goto again;
+
+ RB_WARN_ON(cpu_buffer, commit < 0);
+ }
+ } else
+ /* Non commits have zero deltas */
+ delta = 0;
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (PTR_ERR(event) == -EAGAIN)
+ goto again;
+
+ if (!event) {
+ if (unlikely(commit))
+ /*
+			 * Ouch! We needed a timestamp and it was committed. But
+ * we didn't get our event reserved.
+ */
+ rb_set_commit_to_write(cpu_buffer);
+ return NULL;
+ }
+
+ /*
+	 * If the timestamp was committed, make the commit our entry
+ * now so that we will update it when needed.
+ */
+ if (commit)
+ rb_set_commit_event(cpu_buffer, event);
+ else if (!rb_is_commit(cpu_buffer, event))
+ delta = 0;
+
+ event->time_delta = delta;
+
+ return event;
+}
+
+static DEFINE_PER_CPU(int, rb_need_resched);
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reserved event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu, resched;
+
+ if (ring_buffers_off)
+ return NULL;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ /* If we are tracing schedule, we don't want to recurse */
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ goto out;
+
+ event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+ if (!event)
+ goto out;
+
+ /*
+ * Need to store resched state on this cpu.
+ * Only the first needs to.
+ */
+
+ if (preempt_count() == 1)
+ per_cpu(rb_need_resched, cpu) = resched;
+
+ return event;
+
+ out:
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+ return NULL;
+}
+
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ cpu_buffer->entries++;
+
+ /* Only process further if we own the commit */
+ if (!rb_is_commit(cpu_buffer, event))
+ return;
+
+ cpu_buffer->write_stamp += event->time_delta;
+
+ rb_set_commit_to_write(cpu_buffer);
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ rb_commit(cpu_buffer, event);
+
+ /*
+ * Only the last preempt count needs to restore preemption.
+ */
+ if (preempt_count() == 1) {
+ if (per_cpu(rb_need_resched, cpu))
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+ } else
+ preempt_enable_no_resched_notrace();
+
+ return 0;
+}
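+
+/*
+ * Illustrative sketch (not part of the original file, never called):
+ * how a writer is expected to pair the two calls above.  The payload
+ * layout (struct rb_example_event) and the function name are invented
+ * for the example; trace.c below contains the real users.
+ *
+ *	struct rb_example_event {
+ *		unsigned long ip;
+ *	};
+ *
+ *	static void rb_example_record(struct ring_buffer *buffer,
+ *				      unsigned long ip)
+ *	{
+ *		struct ring_buffer_event *event;
+ *		struct rb_example_event *entry;
+ *		unsigned long flags;
+ *
+ *		event = ring_buffer_lock_reserve(buffer, sizeof(*entry),
+ *						 &flags);
+ *		if (!event)
+ *			return;
+ *		entry = ring_buffer_event_data(event);
+ *		entry->ip = ip;
+ *		ring_buffer_unlock_commit(buffer, event, flags);
+ *	}
+ */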
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length;
+ void *body;
+ int ret = -EBUSY;
+ int cpu, resched;
+
+ if (ring_buffers_off)
+ return -EBUSY;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = rb_reserve_next_event(cpu_buffer,
+ RINGBUF_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = rb_event_data(event);
+
+ memcpy(body, data, length);
+
+ rb_commit(cpu_buffer, event);
+
+ ret = 0;
+ out:
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+
+ return ret;
+}
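+
+/*
+ * Illustrative sketch (names invented, never called): when the data
+ * already exists somewhere, the reserve/commit pair collapses into a
+ * single ring_buffer_write() call.
+ *
+ *	struct rb_example_blob {
+ *		int value;
+ *	};
+ *
+ *	static int rb_example_write(struct ring_buffer *buffer, int value)
+ *	{
+ *		struct rb_example_blob blob = { .value = value };
+ *
+ *		return ring_buffer_write(buffer, sizeof(blob), &blob);
+ *	}
+ */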
+
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *reader = cpu_buffer->reader_page;
+ struct buffer_page *head = cpu_buffer->head_page;
+ struct buffer_page *commit = cpu_buffer->commit_page;
+
+ return reader->read == rb_page_commit(reader) &&
+ (commit == reader ||
+ (commit == head &&
+ head->read == rb_page_commit(commit)));
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truly enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
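+
+/*
+ * Illustrative sketch (names invented, never called): the disable and
+ * enable calls are meant to bracket a non-consuming read, with
+ * synchronize_sched() letting any writer that is already inside the
+ * reserve path finish before the buffer is inspected.
+ *
+ *	static void rb_example_start_read(struct ring_buffer *buffer)
+ *	{
+ *		ring_buffer_record_disable(buffer);
+ *		synchronize_sched();
+ *	}
+ *
+ *	static void rb_example_finish_read(struct ring_buffer *buffer)
+ *	{
+ *		ring_buffer_record_enable(buffer);
+ *	}
+ */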
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truly enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overruns - get the total number of overruns in the buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ /* Iterator usage is expected to have record disabled */
+ if (list_empty(&cpu_buffer->reader_page->list)) {
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head_page->read;
+ } else {
+ iter->head_page = cpu_buffer->reader_page;
+ iter->head = cpu_buffer->reader_page->read;
+ }
+ if (iter->head)
+ iter->read_stamp = cpu_buffer->read_stamp;
+ else
+ iter->read_stamp = iter->head_page->time_stamp;
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->commit_page &&
+ iter->head == rb_commit_index(cpu_buffer);
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ return;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RINGBUF_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ return;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RINGBUF_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *reader = NULL;
+ unsigned long flags;
+ int nr_loops = 0;
+
+ spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+ again:
+ /*
+ * This should normally only loop twice. But because the
+ * start of the reader inserts an empty page, it causes
+ * a case where we will loop three times. There should be no
+ * reason to loop four times (that I know of).
+ */
+ if (unlikely(++nr_loops > 3)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ reader = NULL;
+ goto out;
+ }
+
+ reader = cpu_buffer->reader_page;
+
+ /* If there's more to read, return this page */
+ if (cpu_buffer->reader_page->read < rb_page_size(reader))
+ goto out;
+
+ /* Never should we have an index greater than the size */
+ RB_WARN_ON(cpu_buffer,
+ cpu_buffer->reader_page->read > rb_page_size(reader));
+
+ /* check if we caught up to the tail */
+ reader = NULL;
+ if (cpu_buffer->commit_page == cpu_buffer->reader_page)
+ goto out;
+
+ /*
+ * Splice the empty reader page into the list around the head.
+ * Reset the reader page to size zero.
+ */
+
+ reader = cpu_buffer->head_page;
+ cpu_buffer->reader_page->list.next = reader->list.next;
+ cpu_buffer->reader_page->list.prev = reader->list.prev;
+
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->commit, 0);
+
+ /* Make the reader page now replace the head */
+ reader->list.prev->next = &cpu_buffer->reader_page->list;
+ reader->list.next->prev = &cpu_buffer->reader_page->list;
+
+ /*
+ * If the tail is on the reader, then we must set the head
+ * to the inserted page, otherwise we set it one before.
+ */
+ cpu_buffer->head_page = cpu_buffer->reader_page;
+
+ if (cpu_buffer->commit_page != reader)
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+
+ /* Finally update the reader page to the new head */
+ cpu_buffer->reader_page = reader;
+ rb_reset_reader_page(cpu_buffer);
+
+ goto again;
+
+ out:
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+ return reader;
+}
+
+static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ struct buffer_page *reader;
+ unsigned length;
+
+ reader = rb_get_reader_page(cpu_buffer);
+
+ /* This function should not be called when buffer is empty */
+ BUG_ON(!reader);
+
+ event = rb_reader_event(cpu_buffer);
+
+ if (event->type == RINGBUF_TYPE_DATA)
+ cpu_buffer->entries--;
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ length = rb_event_length(event);
+ cpu_buffer->reader_page->read += length;
+}
+
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (iter->head >= rb_page_size(iter->head_page)) {
+ BUG_ON(iter->head_page == cpu_buffer->commit_page);
+ rb_inc_iter(iter);
+ return;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ length = rb_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
+ (iter->head + length > rb_commit_index(cpu_buffer)));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ if ((iter->head >= rb_page_size(iter->head_page)) &&
+ (iter->head_page != cpu_buffer->commit_page))
+ rb_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peek at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ struct buffer_page *reader;
+ int nr_loops = 0;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (unlikely(++nr_loops > 10)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
+ reader = rb_get_reader_page(cpu_buffer);
+ if (!reader)
+ return NULL;
+
+ event = rb_reader_event(cpu_buffer);
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ RB_WARN_ON(cpu_buffer, 1);
+ rb_advance_reader(cpu_buffer);
+ return NULL;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_reader(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_reader(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int nr_loops = 0;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (unlikely(++nr_loops > 10)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_iter_head_event(iter);
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ rb_inc_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: The timestamp counter of this event.
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ rb_advance_reader(cpu_buffer);
+
+ return event;
+}
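+
+/*
+ * Illustrative sketch (name invented, never called): a consumer drains
+ * one CPU's events by calling ring_buffer_consume() until it returns
+ * NULL.  The payload is whatever the writer stored via
+ * ring_buffer_lock_reserve() or ring_buffer_write().
+ *
+ *	static void rb_example_drain_cpu(struct ring_buffer *buffer, int cpu)
+ *	{
+ *		struct ring_buffer_event *event;
+ *		u64 ts;
+ *
+ *		while ((event = ring_buffer_consume(buffer, cpu, &ts))) {
+ *			void *data = ring_buffer_event_data(event);
+ *
+ *			printk(KERN_DEBUG "event at %p, ts %llu\n",
+ *			       data, (unsigned long long)ts);
+ *		}
+ *	}
+ */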
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+ unsigned long flags;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ spin_lock_irqsave(&cpu_buffer->lock, flags);
+ ring_buffer_iter_reset(iter);
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_read_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_read_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ rb_advance_iter(iter);
+
+ return event;
+}
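+
+/*
+ * Illustrative sketch (name invented, never called): a non-consuming
+ * dump of one CPU buffer with the iterator interface.  Recording stays
+ * disabled between read_start and read_finish.
+ *
+ *	static void rb_example_dump_cpu(struct ring_buffer *buffer, int cpu)
+ *	{
+ *		struct ring_buffer_iter *iter;
+ *		struct ring_buffer_event *event;
+ *		u64 ts;
+ *
+ *		iter = ring_buffer_read_start(buffer, cpu);
+ *		if (!iter)
+ *			return;
+ *
+ *		while ((event = ring_buffer_read(iter, &ts))) {
+ *			void *data = ring_buffer_event_data(event);
+ *
+ *			printk(KERN_DEBUG "event at %p, ts %llu\n",
+ *			       data, (unsigned long long)ts);
+ *		}
+ *
+ *		ring_buffer_read_finish(iter);
+ *	}
+ */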
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ local_set(&cpu_buffer->head_page->write, 0);
+ local_set(&cpu_buffer->head_page->commit, 0);
+
+ cpu_buffer->head_page->read = 0;
+
+ cpu_buffer->tail_page = cpu_buffer->head_page;
+ cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->commit, 0);
+ cpu_buffer->reader_page->read = 0;
+
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+ rb_reset_cpu(cpu_buffer);
+
+ spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+}
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_reset_cpu(buffer, cpu);
+}
+
+/**
+ * ring_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!rb_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 1;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return rb_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ * @cpu: the CPU buffer to swap
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and have another backup buffer lying around.
+ * It is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ if (!cpu_isset(cpu, buffer_a->cpumask) ||
+ !cpu_isset(cpu, buffer_b->cpumask))
+ return -EINVAL;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ /*
+ * We can't do a synchronize_sched here because this
+ * function can be called in atomic context.
+ * Normally this will be called from the same CPU as cpu.
+ * If not it's up to the caller to protect this.
+ */
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int *p = filp->private_data;
+ char buf[64];
+ int r;
+
+ /* !ring_buffers_off == tracing_on */
+ r = sprintf(buf, "%d\n", !*p);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int *p = filp->private_data;
+ char buf[64];
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ /* !ring_buffers_off == tracing_on */
+ *p = !val;
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static struct file_operations rb_simple_fops = {
+ .open = tracing_open_generic,
+ .read = rb_simple_read,
+ .write = rb_simple_write,
+};
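+
+/*
+ * Once rb_init_debugfs() below has run, writing '0' or '1' to the
+ * "tracing_on" file toggles ring_buffers_off from user space, e.g.
+ * (the path depends on where debugfs is mounted):
+ *
+ *	echo 0 > /sys/kernel/debug/tracing/tracing_on	(stop recording)
+ *	echo 1 > /sys/kernel/debug/tracing/tracing_on	(resume recording)
+ */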
+
+
+static __init int rb_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("tracing_on", 0644, d_tracer,
+ &ring_buffers_off, &rb_simple_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'tracing_on' entry\n");
+
+ return 0;
+}
+
+fs_initcall(rb_init_debugfs);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 0000000..d86e325
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,3235 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ * Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/kprobes.h>
+#include <linux/writeback.h>
+
+#include <linux/stacktrace.h>
+#include <linux/ring_buffer.h>
+#include <linux/irqflags.h>
+
+#include "trace.h"
+
+#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
+
+unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly tracing_thresh;
+
+static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+
+static inline void ftrace_disable_cpu(void)
+{
+ preempt_disable();
+ local_inc(&__get_cpu_var(ftrace_cpu_disabled));
+}
+
+static inline void ftrace_enable_cpu(void)
+{
+ local_dec(&__get_cpu_var(ftrace_cpu_disabled));
+ preempt_enable();
+}
+
+static cpumask_t __read_mostly tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu) \
+ for_each_cpu_mask(cpu, tracing_buffer_mask)
+
+static int tracing_disabled = 1;
+
+long
+ns2usecs(cycle_t nsec)
+{
+ nsec += 500;
+ do_div(nsec, 1000);
+ return nsec;
+}
+
+cycle_t ftrace_now(int cpu)
+{
+ u64 ts = ring_buffer_time_stamp(cpu);
+ ring_buffer_normalize_time_stamp(cpu, &ts);
+ return ts;
+}
+
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a linked list of pages that will store trace entries. The
+ * page descriptor of each page in memory is used to hold
+ * the linked list by linking the lru item in the page descriptor
+ * to each of the pages in the per-CPU buffer.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array global_trace;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the linked list of the max_tr is swapped
+ * with the linked list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
+static struct trace_array max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+/* tracer_enabled is used to toggle activation of a tracer */
+static int tracer_enabled = 1;
+
+/* function tracing enabled */
+int ftrace_function_enabled;
+
+/*
+ * trace_buf_size is the size in bytes that is allocated
+ * for a buffer. Note, the number of bytes is always rounded
+ * to page size.
+ *
+ * This number is purposely set to a low number of 16384.
+ * If the dump on oops happens, it will be much appreciated
+ * to not have to wait for all that output. Anyway, this is
+ * configurable at both boot time and run time.
+ */
+#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */
+
+static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
+
+/* trace_types holds a link list of available tracers. */
+static struct tracer *trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
+static struct tracer *current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int max_tracer_type_len;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+/* trace_flags holds iter_ctrl options */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. This is used with trace_poll for tasks polling the trace.
+ */
+void trace_wake_up(void)
+{
+ /*
+ * The runqueue_is_locked() can fail, but this is the best we
+ * have for now:
+ */
+ if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+ wake_up(&trace_wait);
+}
+
+static int __init set_buf_size(char *str)
+{
+ unsigned long buf_size;
+ int ret;
+
+ if (!str)
+ return 0;
+ ret = strict_strtoul(str, 0, &buf_size);
+ /* nr_entries can not be zero */
+ if (ret < 0 || buf_size == 0)
+ return 0;
+ trace_buf_size = buf_size;
+ return 1;
+}
+__setup("trace_buf_size=", set_buf_size);
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+ return nsecs / 1000;
+}
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/* These must match the bit positions in trace_iterator_flags */
+static const char *trace_options[] = {
+ "print-parent",
+ "sym-offset",
+ "sym-addr",
+ "verbose",
+ "raw",
+ "hex",
+ "bin",
+ "block",
+ "stacktrace",
+ "sched-tree",
+ "ftrace_printk",
+ NULL
+};
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static raw_spinlock_t ftrace_max_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct trace_array_cpu *data = tr->data[cpu];
+
+ max_tr.cpu = cpu;
+ max_tr.time_start = data->preempt_timestamp;
+
+ data = max_tr.data[cpu];
+ data->saved_latency = tracing_max_latency;
+
+ memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+ data->pid = tsk->pid;
+ data->uid = tsk->uid;
+ data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+ data->policy = tsk->policy;
+ data->rt_priority = tsk->rt_priority;
+
+	/* record this task's comm */
+ tracing_record_cmdline(current);
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ va_list ap;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+static int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ int len = strlen(str);
+
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+
+static int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+
+static int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+
+#define MAX_MEMHEX_BYTES 8
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
+
+static int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+ unsigned char hex[HEX_CHARS];
+ unsigned char *data = mem;
+ int i, j;
+
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < len; i++) {
+#else
+ for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
+ }
+ hex[j++] = ' ';
+
+ return trace_seq_putmem(s, hex, j);
+}
+
+static void
+trace_seq_reset(struct trace_seq *s)
+{
+ s->len = 0;
+ s->readpos = 0;
+}
+
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+ int len;
+ int ret;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret)
+ return -EFAULT;
+
+	s->readpos += cnt;
+ return cnt;
+}
+
+static void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+ s->buffer[len] = 0;
+ seq_puts(m, s->buffer);
+
+ trace_seq_reset(s);
+}
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct ring_buffer *buf = tr->buffer;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ __raw_spin_lock(&ftrace_max_lock);
+
+ tr->buffer = max_tr.buffer;
+ max_tr.buffer = buf;
+
+ ftrace_disable_cpu();
+ ring_buffer_reset(tr->buffer);
+ ftrace_enable_cpu();
+
+ __update_max_tr(tr, tsk, cpu);
+ __raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ int ret;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ __raw_spin_lock(&ftrace_max_lock);
+
+ ftrace_disable_cpu();
+
+ ring_buffer_reset(max_tr.buffer);
+ ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+
+ ftrace_enable_cpu();
+
+ WARN_ON_ONCE(ret);
+
+ __update_max_tr(tr, tsk, cpu);
+ __raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type: the plugin for the tracer
+ *
+ * Register a new plugin tracer.
+ */
+int register_tracer(struct tracer *type)
+{
+ struct tracer *t;
+ int len;
+ int ret = 0;
+
+ if (!type->name) {
+ pr_info("Tracer must have a name\n");
+ return -1;
+ }
+
+ mutex_lock(&trace_types_lock);
+ for (t = trace_types; t; t = t->next) {
+ if (strcmp(type->name, t->name) == 0) {
+ /* already found */
+ pr_info("Trace %s already registered\n",
+ type->name);
+ ret = -1;
+ goto out;
+ }
+ }
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ if (type->selftest) {
+ struct tracer *saved_tracer = current_trace;
+ struct trace_array *tr = &global_trace;
+ int saved_ctrl = tr->ctrl;
+ int i;
+ /*
+ * Run a selftest on this tracer.
+ * Here we reset the trace buffer, and set the current
+ * tracer to be this tracer. The tracer can then run some
+ * internal tracing to verify that everything is in order.
+ * If we fail, we do not register this tracer.
+ */
+ for_each_tracing_cpu(i) {
+ tracing_reset(tr, i);
+ }
+ current_trace = type;
+ tr->ctrl = 0;
+ /* the test is responsible for initializing and enabling */
+ pr_info("Testing tracer %s: ", type->name);
+ ret = type->selftest(type, tr);
+ /* the test is responsible for resetting too */
+ current_trace = saved_tracer;
+ tr->ctrl = saved_ctrl;
+ if (ret) {
+ printk(KERN_CONT "FAILED!\n");
+ goto out;
+ }
+ /* Only reset on passing, to avoid touching corrupted buffers */
+ for_each_tracing_cpu(i) {
+ tracing_reset(tr, i);
+ }
+ printk(KERN_CONT "PASSED\n");
+ }
+#endif
+
+ type->next = trace_types;
+ trace_types = type;
+ len = strlen(type->name);
+ if (len > max_tracer_type_len)
+ max_tracer_type_len = len;
+
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
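+
+/*
+ * Illustrative sketch (names invented, never called): a minimal plugin
+ * fills in a struct tracer and registers it from an initcall.  A real
+ * tracer also fills in the callbacks declared in trace.h (init, reset,
+ * and friends); only the name is shown here.
+ *
+ *	static struct tracer example_tracer __read_mostly = {
+ *		.name	= "example",
+ *	};
+ *
+ *	static __init int init_example_tracer(void)
+ *	{
+ *		return register_tracer(&example_tracer);
+ *	}
+ *	device_initcall(init_example_tracer);
+ */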
+
+void unregister_tracer(struct tracer *type)
+{
+ struct tracer **t;
+ int len;
+
+ mutex_lock(&trace_types_lock);
+ for (t = &trace_types; *t; t = &(*t)->next) {
+ if (*t == type)
+ goto found;
+ }
+ pr_info("Trace %s not registered\n", type->name);
+ goto out;
+
+ found:
+ *t = (*t)->next;
+ if (strlen(type->name) != max_tracer_type_len)
+ goto out;
+
+ max_tracer_type_len = 0;
+ for (t = &trace_types; *t; t = &(*t)->next) {
+ len = strlen((*t)->name);
+ if (len > max_tracer_type_len)
+ max_tracer_type_len = len;
+ }
+ out:
+ mutex_unlock(&trace_types_lock);
+}
+
+void tracing_reset(struct trace_array *tr, int cpu)
+{
+ ftrace_disable_cpu();
+ ring_buffer_reset_cpu(tr->buffer, cpu);
+ ftrace_enable_cpu();
+}
+
+#define SAVED_CMDLINES 128
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+static int cmdline_idx;
+static DEFINE_SPINLOCK(trace_cmdline_lock);
+
+/* temporarily disable recording */
+atomic_t trace_record_cmdline_disabled __read_mostly;
+
+static void trace_init_cmdlines(void)
+{
+ memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+ memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+ cmdline_idx = 0;
+}
+
+void trace_stop_cmdline_recording(void);
+
+static void trace_save_cmdline(struct task_struct *tsk)
+{
+ unsigned map;
+ unsigned idx;
+
+ if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+ return;
+
+ /*
+ * It's not the end of the world if we don't get
+ * the lock, but we also don't want to spin
+ * nor do we want to disable interrupts,
+ * so if we miss here, then better luck next time.
+ */
+ if (!spin_trylock(&trace_cmdline_lock))
+ return;
+
+ idx = map_pid_to_cmdline[tsk->pid];
+ if (idx >= SAVED_CMDLINES) {
+ idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+
+ map = map_cmdline_to_pid[idx];
+ if (map <= PID_MAX_DEFAULT)
+ map_pid_to_cmdline[map] = (unsigned)-1;
+
+ map_pid_to_cmdline[tsk->pid] = idx;
+
+ cmdline_idx = idx;
+ }
+
+ memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+
+ spin_unlock(&trace_cmdline_lock);
+}
+
+static char *trace_find_cmdline(int pid)
+{
+ char *cmdline = "<...>";
+ unsigned map;
+
+ if (!pid)
+ return "<idle>";
+
+ if (pid > PID_MAX_DEFAULT)
+ goto out;
+
+ map = map_pid_to_cmdline[pid];
+ if (map >= SAVED_CMDLINES)
+ goto out;
+
+ cmdline = saved_cmdlines[map];
+
+ out:
+ return cmdline;
+}
+
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+ if (atomic_read(&trace_record_cmdline_disabled))
+ return;
+
+ trace_save_cmdline(tsk);
+}
+
+void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
+ int pc)
+{
+ struct task_struct *tsk = current;
+
+ entry->preempt_count = pc & 0xff;
+ entry->pid = (tsk) ? tsk->pid : 0;
+ entry->flags =
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+ (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+#else
+ TRACE_FLAG_IRQS_NOSUPPORT |
+#endif
+ ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+ ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+}
+
+void
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+ unsigned long ip, unsigned long parent_ip, unsigned long flags,
+ int pc)
+{
+ struct ring_buffer_event *event;
+ struct ftrace_entry *entry;
+ unsigned long irq_flags;
+
+ /* If we are reading the ring buffer, don't trace */
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_FN;
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+}
+
+void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+ unsigned long ip, unsigned long parent_ip, unsigned long flags,
+ int pc)
+{
+ if (likely(!atomic_read(&data->disabled)))
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+}
+
+static void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags,
+ int skip, int pc)
+{
+#ifdef CONFIG_STACKTRACE
+ struct ring_buffer_event *event;
+ struct stack_entry *entry;
+ struct stack_trace trace;
+ unsigned long irq_flags;
+
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_STACK;
+
+ memset(&entry->caller, 0, sizeof(entry->caller));
+
+ trace.nr_entries = 0;
+ trace.max_entries = FTRACE_STACK_ENTRIES;
+ trace.skip = skip;
+ trace.entries = entry->caller;
+
+ save_stack_trace(&trace);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
+}
+
+void __trace_stack(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long flags,
+ int skip)
+{
+ ftrace_trace_stack(tr, data, flags, skip, preempt_count());
+}
+
+static void
+ftrace_trace_special(void *__tr, void *__data,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3,
+ int pc)
+{
+ struct ring_buffer_event *event;
+ struct trace_array_cpu *data = __data;
+ struct trace_array *tr = __tr;
+ struct special_entry *entry;
+ unsigned long irq_flags;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, pc);
+ entry->ent.type = TRACE_SPECIAL;
+ entry->arg1 = arg1;
+ entry->arg2 = arg2;
+ entry->arg3 = arg3;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+
+ trace_wake_up();
+}
+
+void
+__trace_special(void *__tr, void *__data,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
+}
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ unsigned long irq_flags;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_CTX;
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ ftrace_trace_stack(tr, data, flags, 5, pc);
+}
+
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ unsigned long irq_flags;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_WAKE;
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ ftrace_trace_stack(tr, data, flags, 6, pc);
+
+ trace_wake_up();
+}
+
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ int cpu;
+ int pc;
+
+ if (tracing_disabled || !tr->ctrl)
+ return;
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+
+ if (likely(!atomic_read(&data->disabled)))
+ ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
+
+ preempt_enable_notrace();
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu, resched;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ pc = preempt_count();
+ resched = need_resched();
+ preempt_disable_notrace();
+ local_save_flags(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1))
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+
+ atomic_dec(&data->disabled);
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = function_trace_call,
+};
+
+void tracing_start_function_trace(void)
+{
+ ftrace_function_enabled = 0;
+ register_ftrace_function(&trace_ops);
+ if (tracer_enabled)
+ ftrace_function_enabled = 1;
+}
+
+void tracing_stop_function_trace(void)
+{
+ ftrace_function_enabled = 0;
+ unregister_ftrace_function(&trace_ops);
+}
+#endif
+
+enum trace_file_type {
+ TRACE_FILE_LAT_FMT = 1,
+};
+
+static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
+{
+ /* Don't allow ftrace to trace into the ring buffers */
+ ftrace_disable_cpu();
+
+ iter->idx++;
+ if (iter->buffer_iter[iter->cpu])
+ ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+
+ ftrace_enable_cpu();
+}
+
+static struct trace_entry *
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
+
+ /* Don't allow ftrace to trace into the ring buffers */
+ ftrace_disable_cpu();
+
+ if (buf_iter)
+ event = ring_buffer_iter_peek(buf_iter, ts);
+ else
+ event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+
+ ftrace_enable_cpu();
+
+ return event ? ring_buffer_event_data(event) : NULL;
+}
+
+static struct trace_entry *
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+{
+ struct ring_buffer *buffer = iter->tr->buffer;
+ struct trace_entry *ent, *next = NULL;
+ u64 next_ts = 0, ts;
+ int next_cpu = -1;
+ int cpu;
+
+ for_each_tracing_cpu(cpu) {
+
+ if (ring_buffer_empty_cpu(buffer, cpu))
+ continue;
+
+ ent = peek_next_entry(iter, cpu, &ts);
+
+ /*
+ * Pick the entry with the smallest timestamp:
+ */
+ if (ent && (!next || ts < next_ts)) {
+ next = ent;
+ next_cpu = cpu;
+ next_ts = ts;
+ }
+ }
+
+ if (ent_cpu)
+ *ent_cpu = next_cpu;
+
+ if (ent_ts)
+ *ent_ts = next_ts;
+
+ return next;
+}
+
+/* Find the next real entry, without updating the iterator itself */
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+{
+ return __find_next_entry(iter, ent_cpu, ent_ts);
+}
+
+/* Find the next real entry, and increment the iterator to the next entry */
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+ iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
+
+ if (iter->ent)
+ trace_iterator_increment(iter, iter->cpu);
+
+ return iter->ent ? iter : NULL;
+}
+
+static void trace_consume(struct trace_iterator *iter)
+{
+ /* Don't allow ftrace to trace into the ring buffers */
+ ftrace_disable_cpu();
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+ ftrace_enable_cpu();
+}
+
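+/* seq_file ->next callback: step the trace iterator forward to position *pos */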
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_iterator *iter = m->private;
+ int i = (int)*pos;
+ void *ent;
+
+ (*pos)++;
+
+ /* can't go backwards */
+ if (iter->idx > i)
+ return NULL;
+
+ if (iter->idx < 0)
+ ent = find_next_entry_inc(iter);
+ else
+ ent = iter;
+
+ while (ent && iter->idx < i)
+ ent = find_next_entry_inc(iter);
+
+ iter->pos = *pos;
+
+ return ent;
+}
+
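+/* seq_file ->start callback: take trace_types_lock and position the iterator at *pos */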
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l = 0;
+ int cpu;
+
+ mutex_lock(&trace_types_lock);
+
+ if (!current_trace || current_trace != iter->trace) {
+ mutex_unlock(&trace_types_lock);
+ return NULL;
+ }
+
+ atomic_inc(&trace_record_cmdline_disabled);
+
+ /* let the tracer grab locks here if needed */
+ if (current_trace->start)
+ current_trace->start(iter);
+
+ if (*pos != iter->pos) {
+ iter->ent = NULL;
+ iter->cpu = 0;
+ iter->idx = -1;
+
+ ftrace_disable_cpu();
+
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_iter_reset(iter->buffer_iter[cpu]);
+ }
+
+ ftrace_enable_cpu();
+
+ for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+ ;
+
+ } else {
+ l = *pos - 1;
+ p = s_next(m, p, &l);
+ }
+
+ return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ struct trace_iterator *iter = m->private;
+
+ atomic_dec(&trace_record_cmdline_disabled);
+
+ /* let the tracer release locks here if needed */
+ if (current_trace && current_trace == iter->trace && iter->trace->stop)
+ iter->trace->stop(iter);
+
+ mutex_unlock(&trace_types_lock);
+}
+
+#ifdef CONFIG_KRETPROBES
+static inline const char *kretprobed(const char *name)
+{
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
+}
+#else
+static inline const char *kretprobed(const char *name)
+{
+ return name;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
+#endif
+ return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+ unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ sprint_symbol(str, address);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
+#endif
+ return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+static int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+ int ret;
+
+ if (!ip)
+ return trace_seq_printf(s, "0");
+
+ if (sym_flags & TRACE_ITER_SYM_OFFSET)
+ ret = seq_print_sym_offset(s, "%s", ip);
+ else
+ ret = seq_print_sym_short(s, "%s", ip);
+
+ if (!ret)
+ return 0;
+
+ if (sym_flags & TRACE_ITER_SYM_ADDR)
+ ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return ret;
+}
+
+static void print_lat_help_header(struct seq_file *m)
+{
+ seq_puts(m, "# _------=> CPU# \n");
+ seq_puts(m, "# / _-----=> irqs-off \n");
+ seq_puts(m, "# | / _----=> need-resched \n");
+ seq_puts(m, "# || / _---=> hardirq/softirq \n");
+ seq_puts(m, "# ||| / _--=> preempt-depth \n");
+ seq_puts(m, "# |||| / \n");
+ seq_puts(m, "# ||||| delay \n");
+ seq_puts(m, "# cmd pid ||||| time | caller \n");
+ seq_puts(m, "# \\ / ||||| \\ | / \n");
+}
+
+static void print_func_help_header(struct seq_file *m)
+{
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |\n");
+}
+
+static void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_array *tr = iter->tr;
+ struct trace_array_cpu *data = tr->data[tr->cpu];
+ struct tracer *type = current_trace;
+ unsigned long total;
+ unsigned long entries;
+ const char *name = "preemption";
+
+ if (type)
+ name = type->name;
+
+ entries = ring_buffer_entries(iter->tr->buffer);
+ total = entries +
+ ring_buffer_overruns(iter->tr->buffer);
+
+ seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+ name, UTS_RELEASE);
+ seq_puts(m, "-----------------------------------"
+ "---------------------------------\n");
+ seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+ " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+ nsecs_to_usecs(data->saved_latency),
+ entries,
+ total,
+ tr->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+ "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+ "desktop",
+#elif defined(CONFIG_PREEMPT)
+ "preempt",
+#else
+ "unknown",
+#endif
+ /* These are reserved for later use */
+ 0, 0, 0, 0);
+#ifdef CONFIG_SMP
+ seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+ seq_puts(m, ")\n");
+#endif
+ seq_puts(m, " -----------------\n");
+ seq_printf(m, " | task: %.16s-%d "
+ "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+ data->comm, data->pid, data->uid, data->nice,
+ data->policy, data->rt_priority);
+ seq_puts(m, " -----------------\n");
+
+ if (data->critical_start) {
+ seq_puts(m, " => started at: ");
+ seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+ trace_print_seq(m, &iter->seq);
+ seq_puts(m, "\n => ended at: ");
+ seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+ trace_print_seq(m, &iter->seq);
+ seq_puts(m, "\n");
+ }
+
+ seq_puts(m, "\n");
+}
+
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+ int hardirq, softirq;
+ char *comm;
+
+ comm = trace_find_cmdline(entry->pid);
+
+ trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+ trace_seq_printf(s, "%3d", cpu);
+ trace_seq_printf(s, "%c%c",
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
+ ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ if (hardirq && softirq) {
+ trace_seq_putc(s, 'H');
+ } else {
+ if (hardirq) {
+ trace_seq_putc(s, 'h');
+ } else {
+ if (softirq)
+ trace_seq_putc(s, 's');
+ else
+ trace_seq_putc(s, '.');
+ }
+ }
+
+ if (entry->preempt_count)
+ trace_seq_printf(s, "%x", entry->preempt_count);
+ else
+ trace_seq_puts(s, ".");
+}
+
+unsigned long preempt_mark_thresh = 100;
+
+static void
+lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
+ unsigned long rel_usecs)
+{
+ trace_seq_printf(s, " %4lldus", abs_usecs);
+ if (rel_usecs > preempt_mark_thresh)
+ trace_seq_puts(s, "!: ");
+ else if (rel_usecs > 1)
+ trace_seq_puts(s, "+: ");
+ else
+ trace_seq_puts(s, " : ");
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+/*
+ * The message is supposed to contain an ending newline.
+ * If the printing stops prematurely, try to add a newline of our own.
+ */
+void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
+{
+ struct trace_entry *ent;
+ struct trace_field_cont *cont;
+ bool ok = true;
+
+ ent = peek_next_entry(iter, iter->cpu, NULL);
+ if (!ent || ent->type != TRACE_CONT) {
+ trace_seq_putc(s, '\n');
+ return;
+ }
+
+ do {
+ cont = (struct trace_field_cont *)ent;
+ if (ok)
+ ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
+
+ ftrace_disable_cpu();
+
+ if (iter->buffer_iter[iter->cpu])
+ ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+ else
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+
+ ftrace_enable_cpu();
+
+ ent = peek_next_entry(iter, iter->cpu, NULL);
+ } while (ent && ent->type == TRACE_CONT);
+
+ if (!ok)
+ trace_seq_putc(s, '\n');
+}
+
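+/* Format one trace entry in the latency-trace style */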
+static enum print_line_t
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+{
+ struct trace_seq *s = &iter->seq;
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_entry *next_entry;
+ unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+ struct trace_entry *entry = iter->ent;
+ unsigned long abs_usecs;
+ unsigned long rel_usecs;
+ u64 next_ts;
+ char *comm;
+ int S, T;
+ int i;
+ unsigned state;
+
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
+ next_entry = find_next_entry(iter, NULL, &next_ts);
+ if (!next_entry)
+ next_ts = iter->ts;
+ rel_usecs = ns2usecs(next_ts - iter->ts);
+ abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
+
+ if (verbose) {
+ comm = trace_find_cmdline(entry->pid);
+ trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
+ " %ld.%03ldms (+%ld.%03ldms): ",
+ comm,
+ entry->pid, cpu, entry->flags,
+ entry->preempt_count, trace_idx,
+ ns2usecs(iter->ts),
+ abs_usecs/1000,
+ abs_usecs % 1000, rel_usecs/1000,
+ rel_usecs % 1000);
+ } else {
+ lat_print_generic(s, entry, cpu);
+ lat_print_timestamp(s, abs_usecs, rel_usecs);
+ }
+ switch (entry->type) {
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, sym_flags);
+ trace_seq_puts(s, " (");
+ seq_print_ip_sym(s, field->parent_ip, sym_flags);
+ trace_seq_puts(s, ")\n");
+ break;
+ }
+ case TRACE_CTX:
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+
+ state = field->prev_state ?
+ __ffs(field->prev_state) + 1 : 0;
+ S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+ comm = trace_find_cmdline(field->next_pid);
+ trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, entry->type == TRACE_CTX ? "==>" : " +",
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm);
+ break;
+ }
+ case TRACE_SPECIAL: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "# %ld %ld %ld\n",
+ field->arg1,
+ field->arg2,
+ field->arg3);
+ break;
+ }
+ case TRACE_STACK: {
+ struct stack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ if (i)
+ trace_seq_puts(s, " <= ");
+ seq_print_ip_sym(s, field->caller[i], sym_flags);
+ }
+ trace_seq_puts(s, "\n");
+ break;
+ }
+ case TRACE_PRINT: {
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, sym_flags);
+ trace_seq_printf(s, ": %s", field->buf);
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+ break;
+ }
+ default:
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
+ }
+ return TRACE_TYPE_HANDLED;
+}
+
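+/* Format one trace entry in the default (function trace) style */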
+static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_entry *entry;
+ unsigned long usec_rem;
+ unsigned long long t;
+ unsigned long secs;
+ char *comm;
+ int ret;
+ int S, T;
+ int i;
+
+ entry = iter->ent;
+
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
+ comm = trace_find_cmdline(iter->ent->pid);
+
+ t = ns2usecs(iter->ts);
+ usec_rem = do_div(t, 1000000ULL);
+ secs = (unsigned long)t;
+
+ ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ switch (entry->type) {
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = seq_print_ip_sym(s, field->ip, sym_flags);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+ field->parent_ip) {
+ ret = trace_seq_printf(s, " <-");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = seq_print_ip_sym(s,
+ field->parent_ip,
+ sym_flags);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ ret = trace_seq_printf(s, "\n");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_CTX:
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ S = field->prev_state < sizeof(state_to_char) ?
+ state_to_char[field->prev_state] : 'X';
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+ ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ entry->type == TRACE_CTX ? "==>" : " +",
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_SPECIAL: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+ field->arg1,
+ field->arg2,
+ field->arg3);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_STACK: {
+ struct stack_entry *field;
+
+ trace_assign_type(field, entry);
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ if (i) {
+ ret = trace_seq_puts(s, " <= ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ ret = seq_print_ip_sym(s, field->caller[i],
+ sym_flags);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ ret = trace_seq_puts(s, "\n");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_PRINT: {
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, sym_flags);
+ trace_seq_printf(s, ": %s", field->buf);
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+ break;
+ }
+ }
+ return TRACE_TYPE_HANDLED;
+}
+
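+/* Format one trace entry in the raw (numeric) style */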
+static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry;
+ int ret;
+ int S, T;
+
+ entry = iter->ent;
+
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
+ ret = trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ switch (entry->type) {
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+		ret = trace_seq_printf(s, "%lx %lx\n",
+ field->ip,
+ field->parent_ip);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_CTX:
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ S = field->prev_state < sizeof(state_to_char) ?
+ state_to_char[field->prev_state] : 'X';
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+ if (entry->type == TRACE_WAKE)
+ S = '+';
+ ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_SPECIAL:
+ case TRACE_STACK: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+ field->arg1,
+ field->arg2,
+ field->arg3);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ break;
+ }
+ case TRACE_PRINT: {
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+ break;
+ }
+ }
+ return TRACE_TYPE_HANDLED;
+}
+
+#define SEQ_PUT_FIELD_RET(s, x) \
+do { \
+ if (!trace_seq_putmem(s, &(x), sizeof(x))) \
+ return 0; \
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x) \
+do { \
+ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
+ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
+ return 0; \
+} while (0)
+
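+/* Format one trace entry as hex fields */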
+static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ unsigned char newline = '\n';
+ struct trace_entry *entry;
+ int S, T;
+
+ entry = iter->ent;
+
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
+ SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+ SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+
+ switch (entry->type) {
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->ip);
+ SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+ break;
+ }
+ case TRACE_CTX:
+ case TRACE_WAKE: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ S = field->prev_state < sizeof(state_to_char) ?
+ state_to_char[field->prev_state] : 'X';
+ T = field->next_state < sizeof(state_to_char) ?
+ state_to_char[field->next_state] : 'X';
+ if (entry->type == TRACE_WAKE)
+ S = '+';
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, S);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, T);
+ break;
+ }
+ case TRACE_SPECIAL:
+ case TRACE_STACK: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
+ break;
+ }
+ }
+ SEQ_PUT_FIELD_RET(s, newline);
+
+ return TRACE_TYPE_HANDLED;
+}
+
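+/* Format one trace entry as raw binary fields */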
+static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry;
+
+ entry = iter->ent;
+
+ if (entry->type == TRACE_CONT)
+ return TRACE_TYPE_HANDLED;
+
+ SEQ_PUT_FIELD_RET(s, entry->pid);
+ SEQ_PUT_FIELD_RET(s, entry->cpu);
+ SEQ_PUT_FIELD_RET(s, iter->ts);
+
+ switch (entry->type) {
+ case TRACE_FN: {
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->ip);
+ SEQ_PUT_FIELD_RET(s, field->parent_ip);
+ break;
+ }
+ case TRACE_CTX: {
+ struct ctx_switch_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_FIELD_RET(s, field->prev_state);
+ SEQ_PUT_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_FIELD_RET(s, field->next_state);
+ break;
+ }
+ case TRACE_SPECIAL:
+ case TRACE_STACK: {
+ struct special_entry *field;
+
+ trace_assign_type(field, entry);
+
+ SEQ_PUT_FIELD_RET(s, field->arg1);
+ SEQ_PUT_FIELD_RET(s, field->arg2);
+ SEQ_PUT_FIELD_RET(s, field->arg3);
+ break;
+ }
+ }
+	return TRACE_TYPE_HANDLED;
+}
+
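+/* Return 1 if all per-CPU buffers (or their iterators) are empty */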
+static int trace_empty(struct trace_iterator *iter)
+{
+ int cpu;
+
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu]) {
+ if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ return 0;
+ } else {
+ if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static enum print_line_t print_trace_line(struct trace_iterator *iter)
+{
+ enum print_line_t ret;
+
+ if (iter->trace && iter->trace->print_line) {
+ ret = iter->trace->print_line(iter);
+ if (ret != TRACE_TYPE_UNHANDLED)
+ return ret;
+ }
+
+ if (trace_flags & TRACE_ITER_BIN)
+ return print_bin_fmt(iter);
+
+ if (trace_flags & TRACE_ITER_HEX)
+ return print_hex_fmt(iter);
+
+ if (trace_flags & TRACE_ITER_RAW)
+ return print_raw_fmt(iter);
+
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ return print_lat_fmt(iter, iter->idx, iter->cpu);
+
+ return print_trace_fmt(iter);
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+ struct trace_iterator *iter = v;
+
+ if (iter->ent == NULL) {
+ if (iter->tr) {
+ seq_printf(m, "# tracer: %s\n", iter->trace->name);
+ seq_puts(m, "#\n");
+ }
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return 0;
+ print_trace_header(m, iter);
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+ } else {
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_func_help_header(m);
+ }
+ } else {
+ print_trace_line(iter);
+ trace_print_seq(m, &iter->seq);
+ }
+
+ return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+ struct trace_iterator *iter;
+ struct seq_file *m;
+ int cpu;
+
+ if (tracing_disabled) {
+ *ret = -ENODEV;
+ return NULL;
+ }
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter) {
+ *ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&trace_types_lock);
+ if (current_trace && current_trace->print_max)
+ iter->tr = &max_tr;
+ else
+ iter->tr = inode->i_private;
+ iter->trace = current_trace;
+ iter->pos = -1;
+
+ for_each_tracing_cpu(cpu) {
+
+ iter->buffer_iter[cpu] =
+ ring_buffer_read_start(iter->tr->buffer, cpu);
+
+ if (!iter->buffer_iter[cpu])
+ goto fail_buffer;
+ }
+
+ /* TODO stop tracer */
+ *ret = seq_open(file, &tracer_seq_ops);
+ if (*ret)
+ goto fail_buffer;
+
+ m = file->private_data;
+ m->private = iter;
+
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl) {
+ tracer_enabled = 0;
+ ftrace_function_enabled = 0;
+ }
+
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
+ mutex_unlock(&trace_types_lock);
+
+ out:
+ return iter;
+
+ fail_buffer:
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+ mutex_unlock(&trace_types_lock);
+ kfree(iter);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+int tracing_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ struct trace_iterator *iter = m->private;
+ int cpu;
+
+ mutex_lock(&trace_types_lock);
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+
+ if (iter->trace && iter->trace->close)
+ iter->trace->close(iter);
+
+ /* reenable tracing if it was previously enabled */
+ if (iter->tr->ctrl) {
+ tracer_enabled = 1;
+ /*
+ * It is safe to enable function tracing even if it
+ * isn't used
+ */
+ ftrace_function_enabled = 1;
+ }
+ mutex_unlock(&trace_types_lock);
+
+ seq_release(inode, file);
+ kfree(iter);
+ return 0;
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ __tracing_open(inode, file, &ret);
+
+ return ret;
+}
+
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+ struct trace_iterator *iter;
+ int ret;
+
+ iter = __tracing_open(inode, file, &ret);
+
+ if (!ret)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+ return ret;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct tracer *t = m->private;
+
+ (*pos)++;
+
+ if (t)
+ t = t->next;
+
+ m->private = t;
+
+ return t;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ struct tracer *t = m->private;
+ loff_t l = 0;
+
+ mutex_lock(&trace_types_lock);
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&trace_types_lock);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct tracer *t = v;
+
+ if (!t)
+ return 0;
+
+ seq_printf(m, "%s", t->name);
+ if (t->next)
+ seq_putc(m, ' ');
+ else
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static struct seq_operations show_traces_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ ret = seq_open(file, &show_traces_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = trace_types;
+ }
+
+ return ret;
+}
+
+static struct file_operations tracing_fops = {
+ .open = tracing_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_release,
+};
+
+static struct file_operations tracing_lt_fops = {
+ .open = tracing_lt_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_release,
+};
+
+static struct file_operations show_traces_fops = {
+ .open = show_traces_open,
+ .read = seq_read,
+ .release = seq_release,
+};
+
+/*
+ * Only trace on a CPU if the bitmask is set:
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+
+/*
+ * When tracing/tracing_cpumask is modified, this holds
+ * the new bitmask we are about to install:
+ */
+static cpumask_t tracing_cpumask_new;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (and one more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int len;
+
+ mutex_lock(&tracing_cpumask_update_lock);
+
+ len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+ if (count - len < 2) {
+ count = -EINVAL;
+ goto out_err;
+ }
+ len += sprintf(mask_str + len, "\n");
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
+ mutex_unlock(&tracing_cpumask_update_lock);
+
+ return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int err, cpu;
+
+ mutex_lock(&tracing_cpumask_update_lock);
+ err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+ if (err)
+ goto err_unlock;
+
+ raw_local_irq_disable();
+ __raw_spin_lock(&ftrace_max_lock);
+ for_each_tracing_cpu(cpu) {
+ /*
+ * Increase/decrease the disabled counter if we are
+ * about to flip a bit in the cpumask:
+ */
+ if (cpu_isset(cpu, tracing_cpumask) &&
+ !cpu_isset(cpu, tracing_cpumask_new)) {
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ }
+ if (!cpu_isset(cpu, tracing_cpumask) &&
+ cpu_isset(cpu, tracing_cpumask_new)) {
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ }
+ }
+ __raw_spin_unlock(&ftrace_max_lock);
+ raw_local_irq_enable();
+
+ tracing_cpumask = tracing_cpumask_new;
+
+ mutex_unlock(&tracing_cpumask_update_lock);
+
+ return count;
+
+err_unlock:
+ mutex_unlock(&tracing_cpumask_update_lock);
+
+ return err;
+}
+
+static struct file_operations tracing_cpumask_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_cpumask_read,
+ .write = tracing_cpumask_write,
+};
+
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *buf;
+ int r = 0;
+ int len = 0;
+ int i;
+
+	/* calculate max size */
+ for (i = 0; trace_options[i]; i++) {
+ len += strlen(trace_options[i]);
+ len += 3; /* "no" and space */
+ }
+
+ /* +2 for \n and \0 */
+ buf = kmalloc(len + 2, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ for (i = 0; trace_options[i]; i++) {
+ if (trace_flags & (1 << i))
+ r += sprintf(buf + r, "%s ", trace_options[i]);
+ else
+ r += sprintf(buf + r, "no%s ", trace_options[i]);
+ }
+
+ r += sprintf(buf + r, "\n");
+ WARN_ON(r >= len + 2);
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+ kfree(buf);
+
+ return r;
+}
+
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp = buf;
+ int neg = 0;
+ int i;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (strncmp(buf, "no", 2) == 0) {
+ neg = 1;
+ cmp += 2;
+ }
+
+ for (i = 0; trace_options[i]; i++) {
+ int len = strlen(trace_options[i]);
+
+ if (strncmp(cmp, trace_options[i], len) == 0) {
+ if (neg)
+ trace_flags &= ~(1 << i);
+ else
+ trace_flags |= (1 << i);
+ break;
+ }
+ }
+ /*
+ * If no option could be set, return an error:
+ */
+ if (!trace_options[i])
+ return -EINVAL;
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_iter_ctrl_read,
+ .write = tracing_iter_ctrl_write,
+};
+
+static const char readme_msg[] =
+ "tracing mini-HOWTO:\n\n"
+ "# mkdir /debug\n"
+ "# mount -t debugfs nodev /debug\n\n"
+ "# cat /debug/tracing/available_tracers\n"
+ "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+ "# cat /debug/tracing/current_tracer\n"
+ "none\n"
+ "# echo sched_switch > /debug/tracing/current_tracer\n"
+ "# cat /debug/tracing/current_tracer\n"
+ "sched_switch\n"
+ "# cat /debug/tracing/iter_ctrl\n"
+ "noprint-parent nosym-offset nosym-addr noverbose\n"
+ "# echo print-parent > /debug/tracing/iter_ctrl\n"
+ "# echo 1 > /debug/tracing/tracing_enabled\n"
+ "# cat /debug/tracing/trace > /tmp/trace.txt\n"
+	"# echo 0 > /debug/tracing/tracing_enabled\n"
+;
+
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static struct file_operations tracing_readme_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_readme_read,
+};
+
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = sprintf(buf, "%ld\n", tr->ctrl);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ val = !!val;
+
+ mutex_lock(&trace_types_lock);
+ if (tr->ctrl ^ val) {
+ if (val)
+ tracer_enabled = 1;
+ else
+ tracer_enabled = 0;
+
+ tr->ctrl = val;
+
+ if (current_trace && current_trace->ctrl_update)
+ current_trace->ctrl_update(tr);
+ }
+ mutex_unlock(&trace_types_lock);
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[max_tracer_type_len+2];
+ int r;
+
+ mutex_lock(&trace_types_lock);
+ if (current_trace)
+ r = sprintf(buf, "%s\n", current_trace->name);
+ else
+ r = sprintf(buf, "\n");
+ mutex_unlock(&trace_types_lock);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = &global_trace;
+ struct tracer *t;
+ char buf[max_tracer_type_len+1];
+ int i;
+ size_t ret;
+
+ ret = cnt;
+
+ if (cnt > max_tracer_type_len)
+ cnt = max_tracer_type_len;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ /* strip ending whitespace. */
+ for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+ buf[i] = 0;
+
+ mutex_lock(&trace_types_lock);
+ for (t = trace_types; t; t = t->next) {
+ if (strcmp(t->name, buf) == 0)
+ break;
+ }
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (t == current_trace)
+ goto out;
+
+ if (current_trace && current_trace->reset)
+ current_trace->reset(tr);
+
+ current_trace = t;
+ if (t->init)
+ t->init(tr);
+
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ if (ret > 0)
+ filp->f_pos += ret;
+
+ return ret;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long *ptr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, sizeof(buf), "%ld\n",
+ *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
+ if (r > sizeof(buf))
+ r = sizeof(buf);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long *ptr = filp->private_data;
+ char buf[64];
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ *ptr = val * 1000;
+
+ return cnt;
+}
+
+static atomic_t tracing_reader;
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+ struct trace_iterator *iter;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+	/* We only allow one reader of the pipe */
+ if (atomic_inc_return(&tracing_reader) != 1) {
+ atomic_dec(&tracing_reader);
+ return -EBUSY;
+ }
+
+ /* create a buffer to store the information to pass to userspace */
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ mutex_lock(&trace_types_lock);
+ iter->tr = &global_trace;
+ iter->trace = current_trace;
+ filp->private_data = iter;
+
+ if (iter->trace->pipe_open)
+ iter->trace->pipe_open(iter);
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+ struct trace_iterator *iter = file->private_data;
+
+ kfree(iter);
+ atomic_dec(&tracing_reader);
+
+ return 0;
+}
+
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+ struct trace_iterator *iter = filp->private_data;
+
+ if (trace_flags & TRACE_ITER_BLOCK) {
+ /*
+ * Always select as readable when in blocking mode
+ */
+ return POLLIN | POLLRDNORM;
+ } else {
+ if (!trace_empty(iter))
+ return POLLIN | POLLRDNORM;
+ poll_wait(filp, &trace_wait, poll_table);
+ if (!trace_empty(iter))
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+ }
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_iterator *iter = filp->private_data;
+ ssize_t sret;
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ return sret;
+
+ trace_seq_reset(&iter->seq);
+
+ mutex_lock(&trace_types_lock);
+ if (iter->trace->read) {
+ sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+ if (sret)
+ goto out;
+ }
+
+waitagain:
+ sret = 0;
+ while (trace_empty(iter)) {
+
+ if ((filp->f_flags & O_NONBLOCK)) {
+ sret = -EAGAIN;
+ goto out;
+ }
+
+ /*
+		 * This is a makeshift waitqueue. The reason we don't use
+		 * an actual wait queue is because:
+		 *  1) we only ever have one waiter
+		 *  2) the tracer traces all functions; we don't want
+		 *     the overhead of calling wake_up and friends
+		 *     (and tracing them too)
+		 * Anyway, this is really a very primitive wakeup.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ iter->tr->waiter = current;
+
+ mutex_unlock(&trace_types_lock);
+
+ /* sleep for 100 msecs, and try again. */
+ schedule_timeout(HZ/10);
+
+ mutex_lock(&trace_types_lock);
+
+ iter->tr->waiter = NULL;
+
+ if (signal_pending(current)) {
+ sret = -EINTR;
+ goto out;
+ }
+
+ if (iter->trace != current_trace)
+ goto out;
+
+ /*
+		 * We stop blocking once tracing is disabled and we have
+		 * already read something. We still block if tracing is
+		 * disabled but we have never read anything; this allows a
+		 * user to cat this file, and then enable tracing. But after
+		 * we have read something, we give an EOF when tracing is
+		 * disabled again.
+ *
+ * iter->pos will be 0 if we haven't read anything.
+ */
+ if (!tracer_enabled && iter->pos)
+ break;
+
+ continue;
+ }
+
+ /* stop when tracing is finished */
+ if (trace_empty(iter))
+ goto out;
+
+ if (cnt >= PAGE_SIZE)
+ cnt = PAGE_SIZE - 1;
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter->seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter->pos = -1;
+
+ while (find_next_entry_inc(iter) != NULL) {
+ enum print_line_t ret;
+ int len = iter->seq.len;
+
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ /* don't print partial lines */
+ iter->seq.len = len;
+ break;
+ }
+
+ trace_consume(iter);
+
+ if (iter->seq.len >= cnt)
+ break;
+ }
+
+ /* Now copy what we have to the user */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (iter->seq.readpos >= iter->seq.len)
+ trace_seq_reset(&iter->seq);
+
+ /*
+	 * If there was nothing to send to the user, in spite of consuming trace
+ * entries, go back to wait for more entries.
+ */
+ if (sret == -EBUSY)
+ goto waitagain;
+
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return sret;
+}
+
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = sprintf(buf, "%lu\n", tr->entries);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
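+/* Resize the trace ring buffers; tracing must be disabled while resizing */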
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ char buf[64];
+ int ret, cpu;
+ struct trace_array *tr = filp->private_data;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ /* must have at least 1 entry */
+ if (!val)
+ return -EINVAL;
+
+ mutex_lock(&trace_types_lock);
+
+ if (tr->ctrl) {
+ cnt = -EBUSY;
+ pr_info("ftrace: please disable tracing"
+ " before modifying buffer size\n");
+ goto out;
+ }
+
+ /* disable all cpu buffers */
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_inc(&max_tr.data[cpu]->disabled);
+ }
+
+ if (val != global_trace.entries) {
+ ret = ring_buffer_resize(global_trace.buffer, val);
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+
+ ret = ring_buffer_resize(max_tr.buffer, val);
+ if (ret < 0) {
+ int r;
+ cnt = ret;
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.entries);
+ if (r < 0) {
+				/* AARGH! We are left with a
+				 * different-size max buffer! */
+ WARN_ON(1);
+ tracing_disabled = 1;
+ }
+ goto out;
+ }
+
+ global_trace.entries = val;
+ }
+
+ filp->f_pos += cnt;
+
+ /* If check pages failed, return ENOMEM */
+ if (tracing_disabled)
+ cnt = -ENOMEM;
+ out:
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_dec(&max_tr.data[cpu]->disabled);
+ }
+
+ max_tr.entries = global_trace.entries;
+ mutex_unlock(&trace_types_lock);
+
+ return cnt;
+}
+
+static int mark_printk(const char *fmt, ...)
+{
+ int ret;
+ va_list args;
+ va_start(args, fmt);
+ ret = trace_vprintk(0, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+static ssize_t
+tracing_mark_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ char *buf;
+ char *end;
+ struct trace_array *tr = &global_trace;
+
+ if (!tr->ctrl || tracing_disabled)
+ return -EINVAL;
+
+ if (cnt > TRACE_BUF_SIZE)
+ cnt = TRACE_BUF_SIZE;
+
+ buf = kmalloc(cnt + 1, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ kfree(buf);
+ return -EFAULT;
+ }
+
+	/* Cut at the first NUL or newline. */
+ buf[cnt] = '\0';
+ end = strchr(buf, '\n');
+ if (end)
+ *end = '\0';
+
+ cnt = mark_printk("%s\n", buf);
+ kfree(buf);
+ *fpos += cnt;
+
+ return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_max_lat_read,
+ .write = tracing_max_lat_write,
+};
+
+static struct file_operations tracing_ctrl_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_ctrl_read,
+ .write = tracing_ctrl_write,
+};
+
+static struct file_operations set_tracer_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_set_trace_read,
+ .write = tracing_set_trace_write,
+};
+
+static struct file_operations tracing_pipe_fops = {
+ .open = tracing_open_pipe,
+ .poll = tracing_poll_pipe,
+ .read = tracing_read_pipe,
+ .release = tracing_release_pipe,
+};
+
+static struct file_operations tracing_entries_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_entries_read,
+ .write = tracing_entries_write,
+};
+
+static struct file_operations tracing_mark_fops = {
+ .open = tracing_open_generic,
+ .write = tracing_mark_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long *p = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = sprintf(buf, "%ld\n", *p);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static struct file_operations tracing_read_long_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_read_long,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+struct dentry *tracing_init_dentry(void)
+{
+ static int once;
+
+ if (d_tracer)
+ return d_tracer;
+
+ d_tracer = debugfs_create_dir("tracing", NULL);
+
+ if (!d_tracer && !once) {
+ once = 1;
+ pr_warning("Could not create debugfs directory 'tracing'\n");
+ return NULL;
+ }
+
+ return d_tracer;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+static __init int tracer_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+ &global_trace, &tracing_ctrl_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+ entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+ NULL, &tracing_iter_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+ entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+ NULL, &tracing_cpumask_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
+ entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+ &global_trace, &tracing_lt_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+ entry = debugfs_create_file("trace", 0444, d_tracer,
+ &global_trace, &tracing_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'trace' entry\n");
+
+ entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+ &global_trace, &show_traces_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'available_tracers' entry\n");
+
+ entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+ &global_trace, &set_tracer_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'current_tracer' entry\n");
+
+ entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+ &tracing_max_latency,
+ &tracing_max_lat_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'tracing_max_latency' entry\n");
+
+ entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+ &tracing_thresh, &tracing_max_lat_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'tracing_thresh' entry\n");
+ entry = debugfs_create_file("README", 0644, d_tracer,
+ NULL, &tracing_readme_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'README' entry\n");
+
+ entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+ NULL, &tracing_pipe_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'trace_pipe' entry\n");
+
+ entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+ &global_trace, &tracing_entries_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'trace_entries' entry\n");
+
+ entry = debugfs_create_file("trace_marker", 0220, d_tracer,
+ NULL, &tracing_mark_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'trace_marker' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+ &ftrace_update_tot_cnt,
+ &tracing_read_long_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'dyn_ftrace_total_info' entry\n");
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+ init_tracer_sysprof_debugfs(d_tracer);
+#endif
+ return 0;
+}
+
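+/* vprintf-style write of a TRACE_PRINT entry into the trace ring buffer */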
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ static DEFINE_SPINLOCK(trace_buf_lock);
+ static char trace_buf[TRACE_BUF_SIZE];
+
+ struct ring_buffer_event *event;
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ struct print_entry *entry;
+ unsigned long flags, irq_flags;
+ int cpu, len = 0, size, pc;
+
+ if (!tr->ctrl || tracing_disabled)
+ return 0;
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+
+ if (unlikely(atomic_read(&data->disabled)))
+ goto out;
+
+ spin_lock_irqsave(&trace_buf_lock, flags);
+ len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+ len = min(len, TRACE_BUF_SIZE-1);
+ trace_buf[len] = 0;
+
+ size = sizeof(*entry) + len + 1;
+ event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
+ if (!event)
+ goto out_unlock;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_PRINT;
+ entry->ip = ip;
+
+ memcpy(&entry->buf, trace_buf, len);
+ entry->buf[len] = 0;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out_unlock:
+ spin_unlock_irqrestore(&trace_buf_lock, flags);
+
+ out:
+ preempt_enable_notrace();
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
+int __ftrace_printk(unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_vprintk(ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ftrace_printk);
+
+static int trace_panic_handler(struct notifier_block *this,
+ unsigned long event, void *unused)
+{
+ ftrace_dump();
+ return NOTIFY_OK;
+}
+
+static struct notifier_block trace_panic_notifier = {
+ .notifier_call = trace_panic_handler,
+ .next = NULL,
+ .priority = 150 /* priority: INT_MAX >= x >= 0 */
+};
+
+static int trace_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ switch (val) {
+ case DIE_OOPS:
+ ftrace_dump();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block trace_die_notifier = {
+ .notifier_call = trace_die_handler,
+ .priority = 200
+};
+
+/*
+ * printk is set to a max of 1024; we really don't need it that big.
+ * Nothing should be printing 1000 characters anyway.
+ */
+#define TRACE_MAX_PRINT 1000
+
+/*
+ * Define here KERN_TRACE so that we have one place to modify
+ * it if we decide to change what log level the ftrace dump
+ * should be at.
+ */
+#define KERN_TRACE KERN_INFO
+
+static void
+trace_printk_seq(struct trace_seq *s)
+{
+ /* Probably should print a warning here. */
+	if (s->len >= TRACE_MAX_PRINT)
+		s->len = TRACE_MAX_PRINT;
+
+	/* should be zero-terminated, but we are paranoid. */
+ s->buffer[s->len] = 0;
+
+ printk(KERN_TRACE "%s", s->buffer);
+
+ trace_seq_reset(s);
+}
+
+void ftrace_dump(void)
+{
+ static DEFINE_SPINLOCK(ftrace_dump_lock);
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ static cpumask_t mask;
+ static int dump_ran;
+ unsigned long flags;
+ int cnt = 0, cpu;
+
+ /* only one dump */
+ spin_lock_irqsave(&ftrace_dump_lock, flags);
+ if (dump_ran)
+ goto out;
+
+ dump_ran = 1;
+
+ /* No turning back! */
+ ftrace_kill();
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ }
+
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
+
+ iter.tr = &global_trace;
+ iter.trace = current_trace;
+
+ /*
+	 * We need to stop all tracing on all CPUS to read
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill all that we can read,
+ * and then release the locks again.
+ */
+
+ cpus_clear(mask);
+
+ while (!trace_empty(&iter)) {
+
+ if (!cnt)
+ printk(KERN_TRACE "---------------------------------\n");
+
+ cnt++;
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ if (find_next_entry_inc(&iter) != NULL) {
+ print_trace_line(&iter);
+ trace_consume(&iter);
+ }
+
+ trace_printk_seq(&iter.seq);
+ }
+
+ if (!cnt)
+ printk(KERN_TRACE " (ftrace buffer empty)\n");
+ else
+ printk(KERN_TRACE "---------------------------------\n");
+
+ out:
+ spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+}
+
+__init static int tracer_alloc_buffers(void)
+{
+ struct trace_array_cpu *data;
+ int i;
+
+ /* TODO: make the number of buffers hot pluggable with CPUS */
+ tracing_buffer_mask = cpu_possible_map;
+
+ global_trace.buffer = ring_buffer_alloc(trace_buf_size,
+ TRACE_BUFFER_FLAGS);
+ if (!global_trace.buffer) {
+ printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
+ WARN_ON(1);
+ return 0;
+ }
+ global_trace.entries = ring_buffer_size(global_trace.buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ max_tr.buffer = ring_buffer_alloc(trace_buf_size,
+ TRACE_BUFFER_FLAGS);
+ if (!max_tr.buffer) {
+ printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
+ WARN_ON(1);
+ ring_buffer_free(global_trace.buffer);
+ return 0;
+ }
+ max_tr.entries = ring_buffer_size(max_tr.buffer);
+ WARN_ON(max_tr.entries != global_trace.entries);
+#endif
+
+ /* Allocate the first page for all buffers */
+ for_each_tracing_cpu(i) {
+ data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+ max_tr.data[i] = &per_cpu(max_data, i);
+ }
+
+ trace_init_cmdlines();
+
+ register_tracer(&nop_trace);
+#ifdef CONFIG_BOOT_TRACER
+ register_tracer(&boot_tracer);
+ current_trace = &boot_tracer;
+ current_trace->init(&global_trace);
+#else
+ current_trace = &nop_trace;
+#endif
+
+ /* All seems OK, enable tracing */
+ global_trace.ctrl = tracer_enabled;
+ tracing_disabled = 0;
+
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &trace_panic_notifier);
+
+ register_die_notifier(&trace_die_notifier);
+
+ return 0;
+}
+early_initcall(tracer_alloc_buffers);
+fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 0000000..8465ad0
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,422 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/ring_buffer.h>
+#include <linux/mmiotrace.h>
+#include <linux/ftrace.h>
+
+enum trace_type {
+ __TRACE_FIRST_TYPE = 0,
+
+ TRACE_FN,
+ TRACE_CTX,
+ TRACE_WAKE,
+ TRACE_CONT,
+ TRACE_STACK,
+ TRACE_PRINT,
+ TRACE_SPECIAL,
+ TRACE_MMIO_RW,
+ TRACE_MMIO_MAP,
+ TRACE_BOOT,
+
+ __TRACE_LAST_TYPE
+};
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+ unsigned char type;
+ unsigned char cpu;
+ unsigned char flags;
+ unsigned char preempt_count;
+ int pid;
+};
+
+/*
+ * Function trace entry - function address and parent function address:
+ */
+struct ftrace_entry {
+ struct trace_entry ent;
+ unsigned long ip;
+ unsigned long parent_ip;
+};
+extern struct tracer boot_tracer;
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ */
+struct ctx_switch_entry {
+ struct trace_entry ent;
+ unsigned int prev_pid;
+ unsigned char prev_prio;
+ unsigned char prev_state;
+ unsigned int next_pid;
+ unsigned char next_prio;
+ unsigned char next_state;
+ unsigned int next_cpu;
+};
+
+/*
+ * Special (free-form) trace entry:
+ */
+struct special_entry {
+ struct trace_entry ent;
+ unsigned long arg1;
+ unsigned long arg2;
+ unsigned long arg3;
+};
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES 8
+
+struct stack_entry {
+ struct trace_entry ent;
+ unsigned long caller[FTRACE_STACK_ENTRIES];
+};
+
+/*
+ * ftrace_printk entry:
+ */
+struct print_entry {
+ struct trace_entry ent;
+ unsigned long ip;
+ char buf[];
+};
+
+#define TRACE_OLD_SIZE 88
+
+struct trace_field_cont {
+ unsigned char type;
+ /* Temporary till we get rid of this completely */
+ char buf[TRACE_OLD_SIZE - 1];
+};
+
+struct trace_mmiotrace_rw {
+ struct trace_entry ent;
+ struct mmiotrace_rw rw;
+};
+
+struct trace_mmiotrace_map {
+ struct trace_entry ent;
+ struct mmiotrace_map map;
+};
+
+struct trace_boot {
+ struct trace_entry ent;
+ struct boot_trace initcall;
+};
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ * IRQS_OFF - interrupts were disabled
+ * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
+ * NEED_RESCHED - reschedule is requested
+ * HARDIRQ - inside an interrupt handler
+ * SOFTIRQ - inside a softirq handler
+ * CONT - multiple entries hold the trace item
+ */
+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
+ TRACE_FLAG_NEED_RESCHED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_CONT = 0x20,
+};
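+
+/*
+ * Illustrative only: an output routine can test these bits directly on
+ * entry->flags, e.g.
+ *
+ *	char irq_char = (entry->flags & TRACE_FLAG_HARDIRQ) ? 'h' : '.';
+ *
+ * The bits themselves are filled in by tracing_generic_entry_update().
+ */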
+
+#define TRACE_BUF_SIZE 1024
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data (for example, which task started
+ * the trace, etc.)
+ */
+struct trace_array_cpu {
+ atomic_t disabled;
+
+ /* these fields get copied into max-trace: */
+ unsigned long trace_idx;
+ unsigned long overrun;
+ unsigned long saved_latency;
+ unsigned long critical_start;
+ unsigned long critical_end;
+ unsigned long critical_sequence;
+ unsigned long nice;
+ unsigned long policy;
+ unsigned long rt_priority;
+ cycle_t preempt_timestamp;
+ pid_t pid;
+ uid_t uid;
+ char comm[TASK_COMM_LEN];
+};
+
+struct trace_iterator;
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ */
+struct trace_array {
+ struct ring_buffer *buffer;
+ unsigned long entries;
+ long ctrl;
+ int cpu;
+ cycle_t time_start;
+ struct task_struct *waiter;
+ struct trace_array_cpu *data[NR_CPUS];
+};
+
+#define FTRACE_CMP_TYPE(var, type) \
+ __builtin_types_compatible_p(typeof(var), type *)
+
+#undef IF_ASSIGN
+#define IF_ASSIGN(var, entry, etype, id) \
+ if (FTRACE_CMP_TYPE(var, etype)) { \
+ var = (typeof(var))(entry); \
+ WARN_ON(id && (entry)->type != id); \
+ break; \
+ }
+
+/* Will cause compile errors if type is not found. */
+extern void __ftrace_bad_type(void);
+
+/*
+ * The trace_assign_type is a verifier that the entry type is
+ * the same as the type being assigned. To add new types simply
+ * add a line with the following format:
+ *
+ * IF_ASSIGN(var, ent, type, id);
+ *
+ * Where "type" is the trace type that includes the trace_entry
+ * as the "ent" item. And "id" is the trace identifier that is
+ * used in the trace_type enum.
+ *
+ * If the type can have more than one id, then use zero.
+ */
+#define trace_assign_type(var, ent) \
+ do { \
+ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
+ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
+ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
+ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
+ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
+ IF_ASSIGN(var, ent, struct special_entry, 0); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
+ TRACE_MMIO_RW); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
+ TRACE_MMIO_MAP); \
+ IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ __ftrace_bad_type(); \
+ } while (0)
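+
+/*
+ * Illustrative use (a sketch, not an additional interface): an output
+ * routine handling function entries would do
+ *
+ *	struct ftrace_entry *field;
+ *
+ *	trace_assign_type(field, iter->ent);
+ *
+ * which assigns iter->ent to field and WARNs if the entry's type id
+ * is not TRACE_FN.
+ */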
+
+/* Return values for print_line callback */
+enum print_line_t {
+ TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
+ TRACE_TYPE_HANDLED = 1,
+ TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
+};
+
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ */
+struct tracer {
+ const char *name;
+ void (*init)(struct trace_array *tr);
+ void (*reset)(struct trace_array *tr);
+ void (*open)(struct trace_iterator *iter);
+ void (*pipe_open)(struct trace_iterator *iter);
+ void (*close)(struct trace_iterator *iter);
+ void (*start)(struct trace_iterator *iter);
+ void (*stop)(struct trace_iterator *iter);
+ ssize_t (*read)(struct trace_iterator *iter,
+ struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+ void (*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ int (*selftest)(struct tracer *trace,
+ struct trace_array *tr);
+#endif
+ enum print_line_t (*print_line)(struct trace_iterator *iter);
+ struct tracer *next;
+ int print_max;
+};
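+
+/*
+ * A minimal tracer only needs to fill in .name, .init, .reset and
+ * .ctrl_update (trace_nop.c in this directory is the smallest example)
+ * and announce itself with register_tracer() from an initcall; the
+ * remaining callbacks are optional.
+ */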
+
+struct trace_seq {
+ unsigned char buffer[PAGE_SIZE];
+ unsigned int len;
+ unsigned int readpos;
+};
+
+/*
+ * Trace iterator - used by the printout routines that present trace
+ * results to users; these routines might sleep, etc.:
+ */
+struct trace_iterator {
+ struct trace_array *tr;
+ struct tracer *trace;
+ void *private;
+ struct ring_buffer_iter *buffer_iter[NR_CPUS];
+
+ /* The below is zeroed out in pipe_read */
+ struct trace_seq seq;
+ struct trace_entry *ent;
+ int cpu;
+ u64 ts;
+
+ unsigned long iter_flags;
+ loff_t pos;
+ long idx;
+};
+
+void trace_wake_up(void);
+void tracing_reset(struct trace_array *tr, int cpu);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+
+struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
+ struct trace_array_cpu *data);
+void tracing_generic_entry_update(struct trace_entry *entry,
+ unsigned long flags,
+ int pc);
+
+void ftrace(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long ip,
+ unsigned long parent_ip,
+ unsigned long flags, int pc);
+void tracing_sched_switch_trace(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc);
+void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct task_struct *wakee,
+ struct task_struct *cur,
+ unsigned long flags, int pc);
+void trace_special(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3, int pc);
+void trace_function(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long ip,
+ unsigned long parent_ip,
+ unsigned long flags, int pc);
+
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+ struct task_struct *tsk, int cpu);
+
+extern cycle_t ftrace_now(int cpu);
+
+#ifdef CONFIG_FUNCTION_TRACER
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+#else
+# define tracing_start_function_trace() do { } while (0)
+# define tracing_stop_function_trace() do { } while (0)
+#endif
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+typedef void
+(*tracer_switch_func_t)(void *private,
+ void *__rq,
+ struct task_struct *prev,
+ struct task_struct *next);
+
+struct tracer_switch_ops {
+ tracer_switch_func_t func;
+ void *private;
+ struct tracer_switch_ops *next;
+};
+
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#endif
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+extern int trace_selftest_startup_function(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_nop(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_sysprof(struct tracer *trace,
+ struct trace_array *tr);
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern void trace_seq_print_cont(struct trace_seq *s,
+ struct trace_iterator *iter);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+ size_t cnt);
+extern long ns2usecs(cycle_t nsec);
+extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+
+extern unsigned long trace_flags;
+
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that control the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ * trace.c.
+ */
+enum trace_iterator_flags {
+ TRACE_ITER_PRINT_PARENT = 0x01,
+ TRACE_ITER_SYM_OFFSET = 0x02,
+ TRACE_ITER_SYM_ADDR = 0x04,
+ TRACE_ITER_VERBOSE = 0x08,
+ TRACE_ITER_RAW = 0x10,
+ TRACE_ITER_HEX = 0x20,
+ TRACE_ITER_BIN = 0x40,
+ TRACE_ITER_BLOCK = 0x80,
+ TRACE_ITER_STACKTRACE = 0x100,
+ TRACE_ITER_SCHED_TREE = 0x200,
+ TRACE_ITER_PRINTK = 0x400,
+};
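+
+/*
+ * The corresponding option names are exported through the tracing
+ * debugfs interface, so these flags can be flipped at run time without
+ * rebuilding the kernel.
+ */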
+
+extern struct tracer nop_trace;
+
+#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 0000000..d0a5e50
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,126 @@
+/*
+ * ring buffer based initcalls tracer
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include "trace.h"
+
+static struct trace_array *boot_trace;
+static int trace_boot_enabled;
+
+
+/* Should be started after do_pre_smp_initcalls() in init/main.c */
+void start_boot_trace(void)
+{
+ trace_boot_enabled = 1;
+}
+
+void stop_boot_trace(void)
+{
+ trace_boot_enabled = 0;
+}
+
+void reset_boot_trace(struct trace_array *tr)
+{
+ stop_boot_trace();
+}
+
+static void boot_trace_init(struct trace_array *tr)
+{
+ int cpu;
+ boot_trace = tr;
+
+ trace_boot_enabled = 0;
+
+ for_each_cpu_mask(cpu, cpu_possible_map)
+ tracing_reset(tr, cpu);
+}
+
+static void boot_trace_ctrl_update(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ start_boot_trace();
+ else
+ stop_boot_trace();
+}
+
+static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+{
+ int ret;
+ struct trace_entry *entry = iter->ent;
+ struct trace_boot *field = (struct trace_boot *)entry;
+ struct boot_trace *it = &field->initcall;
+ struct trace_seq *s = &iter->seq;
+ struct timespec calltime = ktime_to_timespec(it->calltime);
+ struct timespec rettime = ktime_to_timespec(it->rettime);
+
+ if (entry->type == TRACE_BOOT) {
+ ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
+ calltime.tv_sec,
+ calltime.tv_nsec,
+ it->func, it->caller);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
+ "returned %d after %lld msecs\n",
+ rettime.tv_sec,
+ rettime.tv_nsec,
+ it->func, it->result, it->duration);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_HANDLED;
+ }
+ return TRACE_TYPE_UNHANDLED;
+}
+
+struct tracer boot_tracer __read_mostly =
+{
+ .name = "initcall",
+ .init = boot_trace_init,
+ .reset = reset_boot_trace,
+ .ctrl_update = boot_trace_ctrl_update,
+ .print_line = initcall_print_line,
+};
+
+void trace_boot(struct boot_trace *it, initcall_t fn)
+{
+ struct ring_buffer_event *event;
+ struct trace_boot *entry;
+ struct trace_array_cpu *data;
+ unsigned long irq_flags;
+ struct trace_array *tr = boot_trace;
+
+ if (!trace_boot_enabled)
+ return;
+
+ /* Get its name now since this function could
+ * disappear because it is in the .init section.
+ */
+ sprint_symbol(it->func, (unsigned long)fn);
+ preempt_disable();
+ data = tr->data[smp_processor_id()];
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+ entry->ent.type = TRACE_BOOT;
+ entry->initcall = *it;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+
+ out:
+ preempt_enable();
+}
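+
+/*
+ * trace_boot() is expected to be called from the initcall code in
+ * init/main.c, once per initcall, with a filled-in boot_trace record;
+ * initcall_print_line() above turns that record back into the
+ * "calling ..." / "initcall ... returned ..." lines of the trace.
+ */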
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 0000000..0f85a64
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,81 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static void function_reset(struct trace_array *tr)
+{
+ int cpu;
+
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
+static void start_function_trace(struct trace_array *tr)
+{
+ tr->cpu = get_cpu();
+ function_reset(tr);
+ put_cpu();
+
+ tracing_start_cmdline_record();
+ tracing_start_function_trace();
+}
+
+static void stop_function_trace(struct trace_array *tr)
+{
+ tracing_stop_function_trace();
+ tracing_stop_cmdline_record();
+}
+
+static void function_trace_init(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ start_function_trace(tr);
+}
+
+static void function_trace_reset(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ stop_function_trace(tr);
+}
+
+static void function_trace_ctrl_update(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ start_function_trace(tr);
+ else
+ stop_function_trace(tr);
+}
+
+static struct tracer function_trace __read_mostly =
+{
+ .name = "function",
+ .init = function_trace_init,
+ .reset = function_trace_reset,
+ .ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_function,
+#endif
+};
+
+static __init int init_function_trace(void)
+{
+ return register_tracer(&function_trace);
+}
+
+device_initcall(init_function_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 0000000..9c74071
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,493 @@
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static struct trace_array *irqsoff_trace __read_mostly;
+static int tracer_enabled __read_mostly;
+
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+static DEFINE_SPINLOCK(max_trace_lock);
+
+enum {
+ TRACER_IRQS_OFF = (1 << 1),
+ TRACER_PREEMPT_OFF = (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int
+preempt_trace(void)
+{
+ return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int
+irq_trace(void)
+{
+ return ((trace_type & TRACER_IRQS_OFF) &&
+ irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp unsigned long max_sequence;
+
+#ifdef CONFIG_FUNCTION_TRACER
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+
+ /*
+	 * It does not matter if we get preempted here. We test the
+	 * flags afterward, to see if irqs are disabled or not.
+	 * If we are preempted and hit a false positive, the flags
+	 * test will fail.
+ */
+ cpu = raw_smp_processor_id();
+ if (likely(!per_cpu(tracing_cpu, cpu)))
+ return;
+
+ local_save_flags(flags);
+ /* slight chance to get a false positive on tracing_cpu */
+ if (!irqs_disabled_flags(flags))
+ return;
+
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1))
+ trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+
+ atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FUNCTION_TRACER */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+ if (tracing_thresh) {
+ if (delta < tracing_thresh)
+ return 0;
+ } else {
+ if (delta <= tracing_max_latency)
+ return 0;
+ }
+ return 1;
+}
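+
+/*
+ * In other words: when tracing_thresh is set, every critical section
+ * longer than the threshold is reported; otherwise only a section that
+ * beats the current tracing_max_latency is recorded as the new maximum.
+ */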
+
+static void
+check_critical_timing(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long parent_ip,
+ int cpu)
+{
+ unsigned long latency, t0, t1;
+ cycle_t T0, T1, delta;
+ unsigned long flags;
+ int pc;
+
+ /*
+ * usecs conversion is slow so we try to delay the conversion
+ * as long as possible:
+ */
+ T0 = data->preempt_timestamp;
+ T1 = ftrace_now(cpu);
+ delta = T1-T0;
+
+ local_save_flags(flags);
+
+ pc = preempt_count();
+
+ if (!report_latency(delta))
+ goto out;
+
+ spin_lock_irqsave(&max_trace_lock, flags);
+
+ /* check if we are still the max latency */
+ if (!report_latency(delta))
+ goto out_unlock;
+
+ trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+
+ latency = nsecs_to_usecs(delta);
+
+ if (data->critical_sequence != max_sequence)
+ goto out_unlock;
+
+ tracing_max_latency = delta;
+ t0 = nsecs_to_usecs(T0);
+ t1 = nsecs_to_usecs(T1);
+
+ data->critical_end = parent_ip;
+
+ update_max_tr_single(tr, current, cpu);
+
+ max_sequence++;
+
+out_unlock:
+ spin_unlock_irqrestore(&max_trace_lock, flags);
+
+out:
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ tracing_reset(tr, cpu);
+ trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+}
+
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu;
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ if (likely(!tracer_enabled))
+ return;
+
+ cpu = raw_smp_processor_id();
+
+ if (per_cpu(tracing_cpu, cpu))
+ return;
+
+ data = tr->data[cpu];
+
+ if (unlikely(!data) || atomic_read(&data->disabled))
+ return;
+
+ atomic_inc(&data->disabled);
+
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ data->critical_start = parent_ip ? : ip;
+ tracing_reset(tr, cpu);
+
+ local_save_flags(flags);
+
+ trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+
+ per_cpu(tracing_cpu, cpu) = 1;
+
+ atomic_dec(&data->disabled);
+}
+
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu;
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ cpu = raw_smp_processor_id();
+ /* Always clear the tracing cpu on stopping the trace */
+ if (unlikely(per_cpu(tracing_cpu, cpu)))
+ per_cpu(tracing_cpu, cpu) = 0;
+ else
+ return;
+
+ if (!tracer_enabled)
+ return;
+
+ data = tr->data[cpu];
+
+ if (unlikely(!data) ||
+ !data->critical_start || atomic_read(&data->disabled))
+ return;
+
+ atomic_inc(&data->disabled);
+
+ local_save_flags(flags);
+ trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+ check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+ data->critical_start = 0;
+ atomic_dec(&data->disabled);
+}
+
+/* start and stop critical timings, used to stop the measurement during idle */
+void start_critical_timings(void)
+{
+ if (preempt_trace() || irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(start_critical_timings);
+
+void stop_critical_timings(void)
+{
+ if (preempt_trace() || irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(stop_critical_timings);
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+ if (!preempt_trace() && irq_trace())
+ stop_critical_timing(a0, a1);
+}
+
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+ if (!preempt_trace() && irq_trace())
+ start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void trace_hardirqs_on(void)
+{
+ if (!preempt_trace() && irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (!preempt_trace() && irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!preempt_trace() && irq_trace())
+ stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (!preempt_trace() && irq_trace())
+ start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ if (preempt_trace())
+ stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ if (preempt_trace())
+ start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+ register_ftrace_function(&trace_ops);
+ tracer_enabled = 1;
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+ unregister_ftrace_function(&trace_ops);
+}
+
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+ irqsoff_trace = tr;
+ /* make sure that the tracer is visible */
+ smp_wmb();
+
+ if (tr->ctrl)
+ start_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ start_irqsoff_tracer(tr);
+ else
+ stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_open(struct trace_iterator *iter)
+{
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl)
+ stop_irqsoff_tracer(iter->tr);
+}
+
+static void irqsoff_tracer_close(struct trace_iterator *iter)
+{
+ if (iter->tr->ctrl)
+ start_irqsoff_tracer(iter->tr);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+ trace_type = TRACER_IRQS_OFF;
+
+ __irqsoff_tracer_init(tr);
+}
+static struct tracer irqsoff_tracer __read_mostly =
+{
+ .name = "irqsoff",
+ .init = irqsoff_tracer_init,
+ .reset = irqsoff_tracer_reset,
+ .open = irqsoff_tracer_open,
+ .close = irqsoff_tracer_close,
+ .ctrl_update = irqsoff_tracer_ctrl_update,
+ .print_max = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_irqsoff,
+#endif
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+ trace_type = TRACER_PREEMPT_OFF;
+
+ __irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+ .name = "preemptoff",
+ .init = preemptoff_tracer_init,
+ .reset = irqsoff_tracer_reset,
+ .open = irqsoff_tracer_open,
+ .close = irqsoff_tracer_close,
+ .ctrl_update = irqsoff_tracer_ctrl_update,
+ .print_max = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_preemptoff,
+#endif
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+ defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+ trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+ __irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+ .name = "preemptirqsoff",
+ .init = preemptirqsoff_tracer_init,
+ .reset = irqsoff_tracer_reset,
+ .open = irqsoff_tracer_open,
+ .close = irqsoff_tracer_close,
+ .ctrl_update = irqsoff_tracer_ctrl_update,
+ .print_max = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_preemptirqsoff,
+#endif
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+
+__init static int init_irqsoff_tracer(void)
+{
+ register_irqsoff(irqsoff_tracer);
+ register_preemptoff(preemptoff_tracer);
+ register_preemptirqsoff(preemptirqsoff_tracer);
+
+ return 0;
+}
+device_initcall(init_irqsoff_tracer);
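+
+/*
+ * Illustrative usage from userspace (a sketch - paths assume debugfs
+ * is mounted on /sys/kernel/debug and the tracer is configured in):
+ *
+ *	echo irqsoff > /sys/kernel/debug/tracing/current_tracer
+ *	echo 1 > /sys/kernel/debug/tracing/tracing_enabled
+ *	... run the workload of interest ...
+ *	cat /sys/kernel/debug/tracing/latency_trace
+ */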
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 0000000..e62cbf7
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,375 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+
+#include "trace.h"
+
+struct header_iter {
+ struct pci_dev *dev;
+};
+
+static struct trace_array *mmio_trace_array;
+static bool overrun_detected;
+static unsigned long prev_overruns;
+
+static void mmio_reset_data(struct trace_array *tr)
+{
+ int cpu;
+
+ overrun_detected = false;
+ prev_overruns = 0;
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
+static void mmio_trace_init(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ mmio_trace_array = tr;
+ if (tr->ctrl) {
+ mmio_reset_data(tr);
+ enable_mmiotrace();
+ }
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ if (tr->ctrl)
+ disable_mmiotrace();
+ mmio_reset_data(tr);
+ mmio_trace_array = NULL;
+}
+
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ if (tr->ctrl) {
+ mmio_reset_data(tr);
+ enable_mmiotrace();
+ } else {
+ disable_mmiotrace();
+ }
+}
+
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+ int ret = 0;
+ int i;
+ resource_size_t start, end;
+ const struct pci_driver *drv = pci_dev_driver(dev);
+
+ /* XXX: incomplete checks for trace_seq_printf() return value */
+ ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
+ /*
+ * XXX: is pci_resource_to_user() appropriate, since we are
+ * supposed to interpret the __ioremap() phys_addr argument based on
+ * these printed values?
+ */
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ ret += trace_seq_printf(s, " %llx",
+ (unsigned long long)(start |
+ (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+ }
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ ret += trace_seq_printf(s, " %llx",
+ dev->resource[i].start < dev->resource[i].end ?
+ (unsigned long long)(end - start) + 1 : 0);
+ }
+ if (drv)
+ ret += trace_seq_printf(s, " %s\n", drv->name);
+ else
+ ret += trace_seq_printf(s, " \n");
+ return ret;
+}
+
+static void destroy_header_iter(struct header_iter *hiter)
+{
+ if (!hiter)
+ return;
+ pci_dev_put(hiter->dev);
+ kfree(hiter);
+}
+
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+ struct header_iter *hiter;
+ struct trace_seq *s = &iter->seq;
+
+ trace_seq_printf(s, "VERSION 20070824\n");
+
+ hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+ if (!hiter)
+ return;
+
+ hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+ iter->private = hiter;
+}
+
+/* XXX: This is not called when the pipe is closed! */
+static void mmio_close(struct trace_iterator *iter)
+{
+ struct header_iter *hiter = iter->private;
+ destroy_header_iter(hiter);
+ iter->private = NULL;
+}
+
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+ unsigned long cnt = 0;
+ unsigned long over = ring_buffer_overruns(iter->tr->buffer);
+
+ if (over > prev_overruns)
+ cnt = over - prev_overruns;
+ prev_overruns = over;
+ return cnt;
+}
+
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+ char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ ssize_t ret;
+ struct header_iter *hiter = iter->private;
+ struct trace_seq *s = &iter->seq;
+ unsigned long n;
+
+ n = count_overruns(iter);
+ if (n) {
+ /* XXX: This is later than where events were lost. */
+ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+ if (!overrun_detected)
+ pr_warning("mmiotrace has lost events.\n");
+ overrun_detected = true;
+ goto print_out;
+ }
+
+ if (!hiter)
+ return 0;
+
+ mmio_print_pcidev(s, hiter->dev);
+ hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+ if (!hiter->dev) {
+ destroy_header_iter(hiter);
+ iter->private = NULL;
+ }
+
+print_out:
+ ret = trace_seq_to_user(s, ubuf, cnt);
+ return (ret == -EBUSY) ? 0 : ret;
+}
+
+static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_mmiotrace_rw *field;
+ struct mmiotrace_rw *rw;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret = 1;
+
+ trace_assign_type(field, entry);
+ rw = &field->rw;
+
+ switch (rw->opcode) {
+ case MMIO_READ:
+ ret = trace_seq_printf(s,
+ "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ rw->value, rw->pc, 0);
+ break;
+ case MMIO_WRITE:
+ ret = trace_seq_printf(s,
+ "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ rw->value, rw->pc, 0);
+ break;
+ case MMIO_UNKNOWN_OP:
+ ret = trace_seq_printf(s,
+ "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+ secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+ (rw->value >> 0) & 0xff, rw->pc, 0);
+ break;
+ default:
+ ret = trace_seq_printf(s, "rw what?\n");
+ break;
+ }
+ if (ret)
+ return TRACE_TYPE_HANDLED;
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t mmio_print_map(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_mmiotrace_map *field;
+ struct mmiotrace_map *m;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret;
+
+ trace_assign_type(field, entry);
+ m = &field->map;
+
+ switch (m->opcode) {
+ case MMIO_PROBE:
+ ret = trace_seq_printf(s,
+ "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+ secs, usec_rem, m->map_id,
+ (unsigned long long)m->phys, m->virt, m->len,
+ 0UL, 0);
+ break;
+ case MMIO_UNPROBE:
+ ret = trace_seq_printf(s,
+ "UNMAP %lu.%06lu %d 0x%lx %d\n",
+ secs, usec_rem, m->map_id, 0UL, 0);
+ break;
+ default:
+ ret = trace_seq_printf(s, "map what?\n");
+ break;
+ }
+ if (ret)
+ return TRACE_TYPE_HANDLED;
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct print_entry *print = (struct print_entry *)entry;
+ const char *msg = print->buf;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret;
+
+ /* The trailing newline must be in the message. */
+ ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t mmio_print_line(struct trace_iterator *iter)
+{
+ switch (iter->ent->type) {
+ case TRACE_MMIO_RW:
+ return mmio_print_rw(iter);
+ case TRACE_MMIO_MAP:
+ return mmio_print_map(iter);
+ case TRACE_PRINT:
+ return mmio_print_mark(iter);
+ default:
+ return TRACE_TYPE_HANDLED; /* ignore unknown entries */
+ }
+}
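+
+/*
+ * mmio_print_line() dispatches on the entry type; together with the
+ * PCIDEV header lines emitted from mmio_read(), the MAP/UNMAP, R/W and
+ * MARK records above make up the text format that a reader of the
+ * trace pipe sees.
+ */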
+
+static struct tracer mmio_tracer __read_mostly =
+{
+ .name = "mmiotrace",
+ .init = mmio_trace_init,
+ .reset = mmio_trace_reset,
+ .pipe_open = mmio_pipe_open,
+ .close = mmio_close,
+ .read = mmio_read,
+ .ctrl_update = mmio_trace_ctrl_update,
+ .print_line = mmio_print_line,
+};
+
+__init static int init_mmio_trace(void)
+{
+ return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+static void __trace_mmiotrace_rw(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_rw *rw)
+{
+ struct ring_buffer_event *event;
+ struct trace_mmiotrace_rw *entry;
+ unsigned long irq_flags;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, preempt_count());
+ entry->ent.type = TRACE_MMIO_RW;
+ entry->rw = *rw;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+}
+
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data = tr->data[smp_processor_id()];
+ __trace_mmiotrace_rw(tr, data, rw);
+}
+
+static void __trace_mmiotrace_map(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_map *map)
+{
+ struct ring_buffer_event *event;
+ struct trace_mmiotrace_map *entry;
+ unsigned long irq_flags;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, preempt_count());
+ entry->ent.type = TRACE_MMIO_MAP;
+ entry->map = *map;
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ trace_wake_up();
+}
+
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data;
+
+ preempt_disable();
+ data = tr->data[smp_processor_id()];
+ __trace_mmiotrace_map(tr, data, map);
+ preempt_enable();
+}
+
+int mmio_trace_printk(const char *fmt, va_list args)
+{
+ return trace_vprintk(0, fmt, args);
+}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
new file mode 100644
index 0000000..4592b48
--- /dev/null
+++ b/kernel/trace/trace_nop.c
@@ -0,0 +1,64 @@
+/*
+ * nop tracer
+ *
+ * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array *ctx_trace;
+
+static void start_nop_trace(struct trace_array *tr)
+{
+ /* Nothing to do! */
+}
+
+static void stop_nop_trace(struct trace_array *tr)
+{
+ /* Nothing to do! */
+}
+
+static void nop_trace_init(struct trace_array *tr)
+{
+ int cpu;
+ ctx_trace = tr;
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+
+ if (tr->ctrl)
+ start_nop_trace(tr);
+}
+
+static void nop_trace_reset(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ stop_nop_trace(tr);
+}
+
+static void nop_trace_ctrl_update(struct trace_array *tr)
+{
+ /* When starting a new trace, reset the buffers */
+ if (tr->ctrl)
+ start_nop_trace(tr);
+ else
+ stop_nop_trace(tr);
+}
+
+struct tracer nop_trace __read_mostly =
+{
+ .name = "nop",
+ .init = nop_trace_init,
+ .reset = nop_trace_reset,
+ .ctrl_update = nop_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_nop,
+#endif
+};
+
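+/*
+ * Note: besides being selectable in its own right, nop_trace is also
+ * the default current_trace picked in tracer_alloc_buffers() when the
+ * boot tracer is not configured in.
+ */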
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 0000000..b8f56be
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,211 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <trace/sched.h>
+
+#include "trace.h"
+
+static struct trace_array *ctx_trace;
+static int __read_mostly tracer_enabled;
+static atomic_t sched_ref;
+
+static void
+probe_sched_switch(struct rq *__rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int cpu;
+ int pc;
+
+ if (!atomic_read(&sched_ref))
+ return;
+
+ tracing_record_cmdline(prev);
+ tracing_record_cmdline(next);
+
+ if (!tracer_enabled)
+ return;
+
+ pc = preempt_count();
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = ctx_trace->data[cpu];
+
+ if (likely(!atomic_read(&data->disabled)))
+ tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
+
+ local_irq_restore(flags);
+}
+
+static void
+probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
+{
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int cpu, pc;
+
+ if (!likely(tracer_enabled))
+ return;
+
+ pc = preempt_count();
+ tracing_record_cmdline(current);
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = ctx_trace->data[cpu];
+
+ if (likely(!atomic_read(&data->disabled)))
+ tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
+ flags, pc);
+
+ local_irq_restore(flags);
+}
+
+static void sched_switch_reset(struct trace_array *tr)
+{
+ int cpu;
+
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
+static int tracing_sched_register(void)
+{
+ int ret;
+
+ ret = register_trace_sched_wakeup(probe_sched_wakeup);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup\n");
+ return ret;
+ }
+
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup_new\n");
+ goto fail_deprobe;
+ }
+
+ ret = register_trace_sched_switch(probe_sched_switch);
+ if (ret) {
+ pr_info("sched trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_schedule\n");
+ goto fail_deprobe_wake_new;
+ }
+
+ return ret;
+fail_deprobe_wake_new:
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+fail_deprobe:
+ unregister_trace_sched_wakeup(probe_sched_wakeup);
+ return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+ unregister_trace_sched_switch(probe_sched_switch);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+ unregister_trace_sched_wakeup(probe_sched_wakeup);
+}
+
+static void tracing_start_sched_switch(void)
+{
+ long ref;
+
+ ref = atomic_inc_return(&sched_ref);
+ if (ref == 1)
+ tracing_sched_register();
+}
+
+static void tracing_stop_sched_switch(void)
+{
+ long ref;
+
+ ref = atomic_dec_and_test(&sched_ref);
+ if (ref)
+ tracing_sched_unregister();
+}
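+
+/*
+ * sched_ref counts how many users currently need the sched tracepoint
+ * probes (the sched_switch tracer itself plus anyone recording
+ * cmdlines); the probes are registered on the 0 -> 1 transition and
+ * unregistered again when the count drops back to zero.
+ */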
+
+void tracing_start_cmdline_record(void)
+{
+ tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+ tracing_stop_sched_switch();
+}
+
+static void start_sched_trace(struct trace_array *tr)
+{
+ sched_switch_reset(tr);
+ tracing_start_cmdline_record();
+ tracer_enabled = 1;
+}
+
+static void stop_sched_trace(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+ tracing_stop_cmdline_record();
+}
+
+static void sched_switch_trace_init(struct trace_array *tr)
+{
+ ctx_trace = tr;
+
+ if (tr->ctrl)
+ start_sched_trace(tr);
+}
+
+static void sched_switch_trace_reset(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ stop_sched_trace(tr);
+}
+
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+ /* When starting a new trace, reset the buffers */
+ if (tr->ctrl)
+ start_sched_trace(tr);
+ else
+ stop_sched_trace(tr);
+}
+
+static struct tracer sched_switch_trace __read_mostly =
+{
+ .name = "sched_switch",
+ .init = sched_switch_trace_init,
+ .reset = sched_switch_trace_reset,
+ .ctrl_update = sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_sched_switch,
+#endif
+};
+
+__init static int init_sched_switch_trace(void)
+{
+ int ret = 0;
+
+ if (atomic_read(&sched_ref))
+ ret = tracing_sched_register();
+ if (ret) {
+ pr_info("error registering scheduler trace\n");
+ return ret;
+ }
+ return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 0000000..3ae93f1
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,393 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <trace/sched.h>
+
+#include "trace.h"
+
+static struct trace_array *wakeup_trace;
+static int __read_mostly tracer_enabled;
+
+static struct task_struct *wakeup_task;
+static int wakeup_cpu;
+static unsigned wakeup_prio = -1;
+
+static raw_spinlock_t wakeup_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+static void __wakeup_reset(struct trace_array *tr);
+
+#ifdef CONFIG_FUNCTION_TRACER
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int resched;
+ int cpu;
+ int pc;
+
+ if (likely(!wakeup_task))
+ return;
+
+ pc = preempt_count();
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (unlikely(disabled != 1))
+ goto out;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&wakeup_lock);
+
+ if (unlikely(!wakeup_task))
+ goto unlock;
+
+ /*
+ * The task can't disappear because it needs to
+ * wake up first, and we have the wakeup_lock.
+ */
+ if (task_cpu(wakeup_task) != cpu)
+ goto unlock;
+
+ trace_function(tr, data, ip, parent_ip, flags, pc);
+
+ unlock:
+ __raw_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
+
+ out:
+ atomic_dec(&data->disabled);
+
+ /*
+ * To prevent recursion from the scheduler, if the
+ * resched flag was set before we entered, then
+ * don't reschedule.
+ */
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FUNCTION_TRACER */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+ if (tracing_thresh) {
+ if (delta < tracing_thresh)
+ return 0;
+ } else {
+ if (delta <= tracing_max_latency)
+ return 0;
+ }
+ return 1;
+}
+
+static void notrace
+probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long latency = 0, t0 = 0, t1 = 0;
+ struct trace_array_cpu *data;
+ cycle_t T0, T1, delta;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ tracing_record_cmdline(prev);
+
+ if (unlikely(!tracer_enabled))
+ return;
+
+ /*
+ * When we start a new trace, we set wakeup_task to NULL
+ * and then set tracer_enabled = 1. We want to make sure
+ * that another CPU does not see the tracer_enabled = 1
+	 * and the wakeup_task set to an older task, which might
+	 * actually be the same as next.
+ */
+ smp_rmb();
+
+ if (next != wakeup_task)
+ return;
+
+ pc = preempt_count();
+
+ /* The task we are waiting for is waking up */
+ data = wakeup_trace->data[wakeup_cpu];
+
+ /* disable local data, not wakeup_cpu data */
+ cpu = raw_smp_processor_id();
+ disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+ if (likely(disabled != 1))
+ goto out;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&wakeup_lock);
+
+ /* We could race with grabbing wakeup_lock */
+ if (unlikely(!tracer_enabled || next != wakeup_task))
+ goto out_unlock;
+
+ trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+
+ /*
+ * usecs conversion is slow so we try to delay the conversion
+ * as long as possible:
+ */
+ T0 = data->preempt_timestamp;
+ T1 = ftrace_now(cpu);
+ delta = T1-T0;
+
+ if (!report_latency(delta))
+ goto out_unlock;
+
+ latency = nsecs_to_usecs(delta);
+
+ tracing_max_latency = delta;
+ t0 = nsecs_to_usecs(T0);
+ t1 = nsecs_to_usecs(T1);
+
+ update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+
+out_unlock:
+ __wakeup_reset(wakeup_trace);
+ __raw_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
+out:
+ atomic_dec(&wakeup_trace->data[cpu]->disabled);
+}
+
+static void __wakeup_reset(struct trace_array *tr)
+{
+ struct trace_array_cpu *data;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ data = tr->data[cpu];
+ tracing_reset(tr, cpu);
+ }
+
+ wakeup_cpu = -1;
+ wakeup_prio = -1;
+
+ if (wakeup_task)
+ put_task_struct(wakeup_task);
+
+ wakeup_task = NULL;
+}
+
+static void wakeup_reset(struct trace_array *tr)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&wakeup_lock);
+ __wakeup_reset(tr);
+ __raw_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
+}
+
+static void
+probe_wakeup(struct rq *rq, struct task_struct *p)
+{
+ int cpu = smp_processor_id();
+ unsigned long flags;
+ long disabled;
+ int pc;
+
+ if (likely(!tracer_enabled))
+ return;
+
+ tracing_record_cmdline(p);
+ tracing_record_cmdline(current);
+
+ if (likely(!rt_task(p)) ||
+ p->prio >= wakeup_prio ||
+ p->prio >= current->prio)
+ return;
+
+ pc = preempt_count();
+ disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+ if (unlikely(disabled != 1))
+ goto out;
+
+ /* interrupts should be off from try_to_wake_up */
+ __raw_spin_lock(&wakeup_lock);
+
+ /* check for races. */
+ if (!tracer_enabled || p->prio >= wakeup_prio)
+ goto out_locked;
+
+ /* reset the trace */
+ __wakeup_reset(wakeup_trace);
+
+ wakeup_cpu = task_cpu(p);
+ wakeup_prio = p->prio;
+
+ wakeup_task = p;
+ get_task_struct(wakeup_task);
+
+ local_save_flags(flags);
+
+ wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+ trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
+ CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+
+out_locked:
+ __raw_spin_unlock(&wakeup_lock);
+out:
+ atomic_dec(&wakeup_trace->data[cpu]->disabled);
+}
+
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+ int ret;
+
+ ret = register_trace_sched_wakeup(probe_wakeup);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup\n");
+ return;
+ }
+
+ ret = register_trace_sched_wakeup_new(probe_wakeup);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup_new\n");
+ goto fail_deprobe;
+ }
+
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch);
+ if (ret) {
+ pr_info("sched trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_schedule\n");
+ goto fail_deprobe_wake_new;
+ }
+
+ wakeup_reset(tr);
+
+ /*
+ * Don't let the tracer_enabled = 1 show up before
+ * the wakeup_task is reset. This may be overkill since
+ * wakeup_reset does a spin_unlock after setting the
+ * wakeup_task to NULL, but I want to be safe.
+ * This is a slow path anyway.
+ */
+ smp_wmb();
+
+ register_ftrace_function(&trace_ops);
+
+ tracer_enabled = 1;
+
+ return;
+fail_deprobe_wake_new:
+ unregister_trace_sched_wakeup_new(probe_wakeup);
+fail_deprobe:
+ unregister_trace_sched_wakeup(probe_wakeup);
+}
+
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+ unregister_ftrace_function(&trace_ops);
+ unregister_trace_sched_switch(probe_wakeup_sched_switch);
+ unregister_trace_sched_wakeup_new(probe_wakeup);
+ unregister_trace_sched_wakeup(probe_wakeup);
+}
+
+static void wakeup_tracer_init(struct trace_array *tr)
+{
+ wakeup_trace = tr;
+
+ if (tr->ctrl)
+ start_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+ if (tr->ctrl) {
+ stop_wakeup_tracer(tr);
+ /* make sure we put back any tasks we are tracing */
+ wakeup_reset(tr);
+ }
+}
+
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ start_wakeup_tracer(tr);
+ else
+ stop_wakeup_tracer(tr);
+}
+
+static void wakeup_tracer_open(struct trace_iterator *iter)
+{
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl)
+ stop_wakeup_tracer(iter->tr);
+}
+
+static void wakeup_tracer_close(struct trace_iterator *iter)
+{
+	/* restart the tracer, which also forgets any task we were recording */
+ if (iter->tr->ctrl)
+ start_wakeup_tracer(iter->tr);
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+ .name = "wakeup",
+ .init = wakeup_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .open = wakeup_tracer_open,
+ .close = wakeup_tracer_close,
+ .ctrl_update = wakeup_tracer_ctrl_update,
+ .print_max = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+};
+
+__init static int init_wakeup_tracer(void)
+{
+ int ret;
+
+ ret = register_tracer(&wakeup_tracer);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 0000000..90bc752
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,524 @@
+/* Include in trace.c */
+
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+ switch (entry->type) {
+ case TRACE_FN:
+ case TRACE_CTX:
+ case TRACE_WAKE:
+ case TRACE_CONT:
+ case TRACE_STACK:
+ case TRACE_PRINT:
+ case TRACE_SPECIAL:
+ return 1;
+ }
+ return 0;
+}
+
+static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
+{
+ struct ring_buffer_event *event;
+ struct trace_entry *entry;
+
+ while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+ entry = ring_buffer_event_data(event);
+
+ if (!trace_valid_entry(entry)) {
+ printk(KERN_CONT ".. invalid entry %d ",
+ entry->type);
+ goto failed;
+ }
+ }
+ return 0;
+
+ failed:
+ /* disable tracing */
+ tracing_disabled = 1;
+ printk(KERN_CONT ".. corrupted trace buffer .. ");
+ return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+ unsigned long flags, cnt = 0;
+ int cpu, ret = 0;
+
+ /* Don't allow flipping of max traces now */
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&ftrace_max_lock);
+
+ cnt = ring_buffer_entries(tr->buffer);
+
+ for_each_possible_cpu(cpu) {
+ ret = trace_test_buffer_cpu(tr, cpu);
+ if (ret)
+ break;
+ }
+ __raw_spin_unlock(&ftrace_max_lock);
+ raw_local_irq_restore(flags);
+
+ if (count)
+ *count = cnt;
+
+ return ret;
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+/* Test dynamic code modification and ftrace filters */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+ struct trace_array *tr,
+ int (*func)(void))
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ int save_tracer_enabled = tracer_enabled;
+ unsigned long count;
+ char *func_name;
+ int ret;
+
+ /* The ftrace test PASSED */
+ printk(KERN_CONT "PASSED\n");
+ pr_info("Testing dynamic ftrace: ");
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+ tracer_enabled = 1;
+
+	/* passed in as a parameter to keep gcc from optimizing it away */
+ func();
+
+ /*
+	 * Some archs *cough*PowerPC*cough* add characters to the
+	 * start of the function names. We simply put a '*' to
+	 * accommodate them.
+ */
+ func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+
+ /* filter only on our function */
+ ftrace_set_filter(func_name, strlen(func_name), 1);
+
+ /* enable tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+
+ /* we should have nothing in the buffer */
+ ret = trace_test_buffer(tr, &count);
+ if (ret)
+ goto out;
+
+ if (count) {
+ ret = -1;
+ printk(KERN_CONT ".. filter did not filter .. ");
+ goto out;
+ }
+
+ /* call our function again */
+ func();
+
+ /* sleep again */
+ msleep(100);
+
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ ftrace_enabled = 0;
+
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+
+ /* we should only have one item */
+ if (!ret && count != 1) {
+ printk(KERN_CONT ".. filter failed count=%ld ..", count);
+ ret = -1;
+ goto out;
+ }
+ out:
+ ftrace_enabled = save_ftrace_enabled;
+ tracer_enabled = save_tracer_enabled;
+
+ /* Enable tracing on all functions again */
+ ftrace_set_filter(NULL, 0, 1);
+
+ return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ int save_tracer_enabled = tracer_enabled;
+ unsigned long count;
+ int ret;
+
+ /* make sure msleep has been recorded */
+ msleep(1);
+
+ /* start the tracing */
+ ftrace_enabled = 1;
+ tracer_enabled = 1;
+
+ tr->ctrl = 1;
+ trace->init(tr);
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ ftrace_enabled = 0;
+
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+ DYN_FTRACE_TEST_NAME);
+
+ out:
+ ftrace_enabled = save_ftrace_enabled;
+ tracer_enabled = save_tracer_enabled;
+
+ /* kill ftrace totally if we failed */
+ if (ret)
+ ftrace_kill();
+
+ return ret;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_IRQSOFF_TRACER
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tracing_max_latency;
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+ /* reset the max latency */
+ tracing_max_latency = 0;
+ /* disable interrupts for a bit */
+ local_irq_disable();
+ udelay(100);
+ local_irq_enable();
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check both trace buffers */
+ ret = trace_test_buffer(tr, NULL);
+ if (!ret)
+ ret = trace_test_buffer(&max_tr, &count);
+ trace->reset(tr);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ tracing_max_latency = save_max;
+
+ return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tracing_max_latency;
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+ /* reset the max latency */
+ tracing_max_latency = 0;
+ /* disable preemption for a bit */
+ preempt_disable();
+ udelay(100);
+ preempt_enable();
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check both trace buffers */
+ ret = trace_test_buffer(tr, NULL);
+ if (!ret)
+ ret = trace_test_buffer(&max_tr, &count);
+ trace->reset(tr);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ tracing_max_latency = save_max;
+
+ return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tracing_max_latency;
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+
+ /* reset the max latency */
+ tracing_max_latency = 0;
+
+ /* disable preemption and interrupts for a bit */
+ preempt_disable();
+ local_irq_disable();
+ udelay(100);
+ preempt_enable();
+ /* reverse the order of preempt vs irqs */
+ local_irq_enable();
+
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check both trace buffers */
+ ret = trace_test_buffer(tr, NULL);
+ if (ret)
+ goto out;
+
+ ret = trace_test_buffer(&max_tr, &count);
+ if (ret)
+ goto out;
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ /* do the test by disabling interrupts first this time */
+ tracing_max_latency = 0;
+ tr->ctrl = 1;
+ trace->ctrl_update(tr);
+ preempt_disable();
+ local_irq_disable();
+ udelay(100);
+ preempt_enable();
+ /* reverse the order of preempt vs irqs */
+ local_irq_enable();
+
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check both trace buffers */
+ ret = trace_test_buffer(tr, NULL);
+ if (ret)
+ goto out;
+
+ ret = trace_test_buffer(&max_tr, &count);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ out:
+ trace->reset(tr);
+ tracing_max_latency = save_max;
+
+ return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_NOP_TRACER
+int
+trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
+{
+ /* What could possibly go wrong? */
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_SCHED_TRACER
+static int trace_wakeup_test_thread(void *data)
+{
+ /* Make this an RT thread; the priority doesn't need to be too high */
+ struct sched_param param = { .sched_priority = 5 };
+ struct completion *x = data;
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+
+ /* Let the test know we are now running at the new prio */
+ complete(x);
+
+ /* now go to sleep and let the test wake us up */
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+
+ /* we are awake, now wait to disappear */
+ while (!kthread_should_stop()) {
+ /*
+ * This is an RT task, do short sleeps to let
+ * others run.
+ */
+ msleep(100);
+ }
+
+ return 0;
+}
+
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tracing_max_latency;
+ struct task_struct *p;
+ struct completion isrt;
+ unsigned long count;
+ int ret;
+
+ init_completion(&isrt);
+
+ /* create a high prio thread */
+ p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+ if (IS_ERR(p)) {
+ printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+ return -1;
+ }
+
+ /* make sure the thread is running at an RT prio */
+ wait_for_completion(&isrt);
+
+ /* start the tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+ /* reset the max latency */
+ tracing_max_latency = 0;
+
+ /* sleep to let the RT thread sleep too */
+ msleep(100);
+
+ /*
+ * Yes, this is slightly racy. It is possible that for some
+ * strange reason the RT thread we created did not call
+ * schedule for 100ms after doing the completion, and we do
+ * a wakeup on a task that is already awake. But that is
+ * extremely unlikely, and the worst thing that happens in
+ * such a case is that we disable tracing. Honestly, if this
+ * race does happen, something is horribly wrong with the system.
+ */
+
+ wake_up_process(p);
+
+ /* give a little time to let the thread wake up */
+ msleep(100);
+
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check both trace buffers */
+ ret = trace_test_buffer(tr, NULL);
+ if (!ret)
+ ret = trace_test_buffer(&max_tr, &count);
+
+
+ trace->reset(tr);
+
+ tracing_max_latency = save_max;
+
+ /* kill the thread */
+ kthread_stop(p);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+ /* Sleep for 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_SYSPROF_TRACER
+int
+trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ tr->ctrl = 1;
+ trace->init(tr);
+ /* Sleep for 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tr->ctrl = 0;
+ trace->ctrl_update(tr);
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+
+ return ret;
+}
+#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 0000000..54dd77c
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
+#include "trace.h"
+
+int DYN_FTRACE_TEST_NAME(void)
+{
+ /* used to call mcount */
+ return 0;
+}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
new file mode 100644
index 0000000..3bdb44b
--- /dev/null
+++ b/kernel/trace/trace_stack.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/stacktrace.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include "trace.h"
+
+#define STACK_TRACE_ENTRIES 500
+
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
+ { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
+static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+
+static struct stack_trace max_stack_trace = {
+ .max_entries = STACK_TRACE_ENTRIES,
+ .entries = stack_dump_trace,
+};
+
+static unsigned long max_stack_size;
+static raw_spinlock_t max_stack_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+static int stack_trace_disabled __read_mostly;
+static DEFINE_PER_CPU(int, trace_active);
+
+static inline void check_stack(void)
+{
+ unsigned long this_size, flags;
+ unsigned long *p, *top, *start;
+ int i;
+
+ this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+ this_size = THREAD_SIZE - this_size;
+
+ if (this_size <= max_stack_size)
+ return;
+
+ /* we do not handle interrupt stacks yet */
+ if (!object_is_on_stack(&this_size))
+ return;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&max_stack_lock);
+
+ /* a race could have already updated it */
+ if (this_size <= max_stack_size)
+ goto out;
+
+ max_stack_size = this_size;
+
+ max_stack_trace.nr_entries = 0;
+ max_stack_trace.skip = 3;
+
+ save_stack_trace(&max_stack_trace);
+
+ /*
+ * Now find where in the stack these are.
+ */
+ i = 0;
+ start = &this_size;
+ top = (unsigned long *)
+ (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
+
+ /*
+ * Loop through all the entries. Some of the entries may,
+ * for whatever reason, be missing from the stack, so we
+ * have to account for them. If they are all present, this
+ * loop only runs once. This code only runs on a new max,
+ * so it is far from a fast path.
+ */
+ while (i < max_stack_trace.nr_entries) {
+
+ stack_dump_index[i] = this_size;
+ p = start;
+
+ for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ if (*p == stack_dump_trace[i]) {
+ this_size = stack_dump_index[i++] =
+ (top - p) * sizeof(unsigned long);
+ /* Start the search from here */
+ start = p + 1;
+ }
+ }
+
+ i++;
+ }
+
+ out:
+ __raw_spin_unlock(&max_stack_lock);
+ raw_local_irq_restore(flags);
+}
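Editor's note: the matching loop in check_stack() above is the subtle part of this file. It walks the raw words of the kernel stack and pairs each address reported by save_stack_trace() with how deep that frame sits, resuming each search where the previous hit left off. Below is a minimal, hypothetical user-space sketch of the same bookkeeping; THREAD_SIZE, the stack contents and the trace addresses are all fabricated for illustration and are not part of the kernel code.

/*
 * Hypothetical stand-alone sketch of the depth bookkeeping in check_stack().
 * The "stack" and the unwinder output are fabricated; only the scanning
 * logic mirrors the kernel code above.
 */
#include <stdio.h>

#define THREAD_SIZE 64			/* pretend the stack is 64 words */

int main(void)
{
	unsigned long stack[THREAD_SIZE] = { 0 };
	unsigned long trace[] = { 0xa1, 0xb2, 0xc3 };	/* innermost first */
	unsigned long depth[3];
	int nr = 3, i = 0;

	stack[10] = 0xa1;		/* innermost frame, deepest on the stack */
	stack[25] = 0xb2;
	stack[40] = 0xc3;		/* outermost of the three */

	unsigned long *start = stack;
	unsigned long *top = stack + THREAD_SIZE;

	/* default in case an entry is never found on the stack */
	for (int k = 0; k < nr; k++)
		depth[k] = THREAD_SIZE * sizeof(unsigned long);

	/* same two-level scan as check_stack(): resume from the last hit */
	while (i < nr) {
		for (unsigned long *p = start; p < top && i < nr; p++) {
			if (*p == trace[i]) {
				depth[i++] = (top - p) * sizeof(unsigned long);
				start = p + 1;	/* older frames live at higher addresses */
			}
		}
		i++;			/* entry missing from the stack: skip it */
	}

	for (int k = 0; k < nr; k++)
		printf("entry %d at depth %lu bytes\n", k, depth[k]);
	return 0;
}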
+
+static void
+stack_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu, resched;
+
+ if (unlikely(!ftrace_enabled || stack_trace_disabled))
+ return;
+
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+ /* no atomic needed, we only modify this variable from this CPU */
+ if (per_cpu(trace_active, cpu)++ != 0)
+ goto out;
+
+ check_stack();
+
+ out:
+ per_cpu(trace_active, cpu)--;
+ /* prevent recursion in schedule */
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = stack_trace_call,
+};
+
+static ssize_t
+stack_max_size_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long *ptr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
+ if (r > sizeof(buf))
+ r = sizeof(buf);
+ return simple_read_from_buffer(ubuf, count, ppos, buf, r);
+}
+
+static ssize_t
+stack_max_size_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ long *ptr = filp->private_data;
+ unsigned long val, flags;
+ char buf[64];
+ int ret;
+
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&max_stack_lock);
+ *ptr = val;
+ __raw_spin_unlock(&max_stack_lock);
+ raw_local_irq_restore(flags);
+
+ return count;
+}
+
+static struct file_operations stack_max_size_fops = {
+ .open = tracing_open_generic,
+ .read = stack_max_size_read,
+ .write = stack_max_size_write,
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ long i;
+
+ (*pos)++;
+
+ if (v == SEQ_START_TOKEN)
+ i = 0;
+ else {
+ i = *(long *)v;
+ i++;
+ }
+
+ if (i >= max_stack_trace.nr_entries ||
+ stack_dump_trace[i] == ULONG_MAX)
+ return NULL;
+
+ m->private = (void *)i;
+
+ return &m->private;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ void *t = SEQ_START_TOKEN;
+ loff_t l = 0;
+
+ local_irq_disable();
+ __raw_spin_lock(&max_stack_lock);
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ __raw_spin_unlock(&max_stack_lock);
+ local_irq_enable();
+}
+
+static int trace_lookup_stack(struct seq_file *m, long i)
+{
+ unsigned long addr = stack_dump_trace[i];
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(str, addr);
+
+ return seq_printf(m, "%s\n", str);
+#else
+ return seq_printf(m, "%p\n", (void*)addr);
+#endif
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ long i;
+ int size;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, " Depth Size Location"
+ " (%d entries)\n"
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries);
+ return 0;
+ }
+
+ i = *(long *)v;
+
+ if (i >= max_stack_trace.nr_entries ||
+ stack_dump_trace[i] == ULONG_MAX)
+ return 0;
+
+ if (i+1 == max_stack_trace.nr_entries ||
+ stack_dump_trace[i+1] == ULONG_MAX)
+ size = stack_dump_index[i];
+ else
+ size = stack_dump_index[i] - stack_dump_index[i+1];
+
+ seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
+
+ trace_lookup_stack(m, i);
+
+ return 0;
+}
+
+static struct seq_operations stack_trace_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int stack_trace_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &stack_trace_seq_ops);
+
+ return ret;
+}
+
+static struct file_operations stack_trace_fops = {
+ .open = stack_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+static __init int stack_trace_init(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
+ &max_stack_size, &stack_max_size_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+
+ entry = debugfs_create_file("stack_trace", 0444, d_tracer,
+ NULL, &stack_trace_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'stack_trace' entry\n");
+
+ register_ftrace_function(&trace_ops);
+
+ return 0;
+}
+
+device_initcall(stack_trace_init);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644
index 0000000..9587d3b
--- /dev/null
+++ b/kernel/trace/trace_sysprof.c
@@ -0,0 +1,363 @@
+/*
+ * trace stack traces
+ *
+ * Copyright (C) 2004-2008, Soeren Sandmann
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/fs.h>
+
+#include <asm/stacktrace.h>
+
+#include "trace.h"
+
+static struct trace_array *sysprof_trace;
+static int __read_mostly tracer_enabled;
+
+/*
+ * 1 msec sample interval by default:
+ */
+static unsigned long sample_period = 1000000;
+static const unsigned int sample_max_depth = 512;
+
+static DEFINE_MUTEX(sample_timer_lock);
+/*
+ * Per CPU hrtimers that do the profiling:
+ */
+static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+struct backtrace_info {
+ struct trace_array_cpu *data;
+ struct trace_array *tr;
+ int pos;
+};
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ /* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+ /* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+ /* Don't bother with IRQ stacks for now */
+ return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+ struct backtrace_info *info = data;
+
+ if (info->pos < sample_max_depth && reliable) {
+ __trace_special(info->tr, info->data, 1, addr, 0);
+
+ info->pos++;
+ }
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+ .warning = backtrace_warning,
+ .warning_symbol = backtrace_warning_symbol,
+ .stack = backtrace_stack,
+ .address = backtrace_address,
+};
+
+static int
+trace_kernel(struct pt_regs *regs, struct trace_array *tr,
+ struct trace_array_cpu *data)
+{
+ struct backtrace_info info;
+ unsigned long bp;
+ char *stack;
+
+ info.tr = tr;
+ info.data = data;
+ info.pos = 1;
+
+ __trace_special(info.tr, info.data, 1, regs->ip, 0);
+
+ stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+ bp = regs->bp;
+#else
+ bp = 0;
+#endif
+
+ dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
+
+ return info.pos;
+}
+
+static void timer_notify(struct pt_regs *regs, int cpu)
+{
+ struct trace_array_cpu *data;
+ struct stack_frame frame;
+ struct trace_array *tr;
+ const void __user *fp;
+ int is_user;
+ int i;
+
+ if (!regs)
+ return;
+
+ tr = sysprof_trace;
+ data = tr->data[cpu];
+ is_user = user_mode(regs);
+
+ if (!current || current->pid == 0)
+ return;
+
+ if (is_user && current->state != TASK_RUNNING)
+ return;
+
+ __trace_special(tr, data, 0, 0, current->pid);
+
+ if (!is_user)
+ i = trace_kernel(regs, tr, data);
+ else
+ i = 0;
+
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm && i < sample_max_depth) {
+ regs = (struct pt_regs *)current->thread.sp0 - 1;
+
+ fp = (void __user *)regs->bp;
+
+ __trace_special(tr, data, 2, regs->ip, 0);
+
+ while (i < sample_max_depth) {
+ frame.next_fp = NULL;
+ frame.return_address = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp)
+ break;
+
+ __trace_special(tr, data, 2, frame.return_address,
+ (unsigned long)fp);
+ fp = frame.next_fp;
+
+ i++;
+ }
+
+ }
+
+ /*
+ * Special trace entry if we overflow the max depth:
+ */
+ if (i == sample_max_depth)
+ __trace_special(tr, data, -1, -1, -1);
+
+ __trace_special(tr, data, 3, current->pid, i);
+}
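Editor's note: the user-space half of timer_notify() above is a classic frame-pointer walk: each saved frame pointer points at a two-word record holding the caller's frame pointer and the return address. The sketch below is a hypothetical illustration of the same walk performed by a user-space program on its own stack; it assumes x86-style frame pointers (build with -O0 or -fno-omit-frame-pointer) and is not part of the kernel code.

/*
 * Hypothetical user-space sketch of the frame-pointer walk in timer_notify().
 * Assumes x86-style frame pointers: *(fp) is the caller's frame pointer and
 * *(fp + one word) is the return address, i.e. the same struct stack_frame
 * layout the kernel uses above.
 */
#include <stdio.h>

struct stack_frame {
	const void *next_fp;		/* caller's saved frame pointer */
	unsigned long return_address;	/* address pushed by the call   */
};

static void __attribute__((noinline)) walk(void)
{
	const struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	/* same loop shape as timer_notify(): follow next_fp until it looks bogus */
	while (fp && depth < 16) {
		printf("#%d return address %#lx\n", depth, fp->return_address);
		/* a sane chain only moves toward the stack base */
		if ((const char *)fp->next_fp <= (const char *)fp)
			break;
		fp = fp->next_fp;
		depth++;
	}
}

static void __attribute__((noinline)) b(void) { walk(); }
static void __attribute__((noinline)) a(void) { b(); }

int main(void)
{
	a();
	return 0;
}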
+
+static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
+{
+ /* trace here */
+ timer_notify(get_irq_regs(), smp_processor_id());
+
+ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+ return HRTIMER_RESTART;
+}
+
+static void start_stack_timer(int cpu)
+{
+ struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = stack_trace_timer_fn;
+ hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
+
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+}
+
+static void start_stack_timers(void)
+{
+ cpumask_t saved_mask = current->cpus_allowed;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+ start_stack_timer(cpu);
+ }
+ set_cpus_allowed_ptr(current, &saved_mask);
+}
+
+static void stop_stack_timer(int cpu)
+{
+ struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+
+ hrtimer_cancel(hrtimer);
+}
+
+static void stop_stack_timers(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ stop_stack_timer(cpu);
+}
+
+static void stack_reset(struct trace_array *tr)
+{
+ int cpu;
+
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
+static void start_stack_trace(struct trace_array *tr)
+{
+ mutex_lock(&sample_timer_lock);
+ stack_reset(tr);
+ start_stack_timers();
+ tracer_enabled = 1;
+ mutex_unlock(&sample_timer_lock);
+}
+
+static void stop_stack_trace(struct trace_array *tr)
+{
+ mutex_lock(&sample_timer_lock);
+ stop_stack_timers();
+ tracer_enabled = 0;
+ mutex_unlock(&sample_timer_lock);
+}
+
+static void stack_trace_init(struct trace_array *tr)
+{
+ sysprof_trace = tr;
+
+ if (tr->ctrl)
+ start_stack_trace(tr);
+}
+
+static void stack_trace_reset(struct trace_array *tr)
+{
+ if (tr->ctrl)
+ stop_stack_trace(tr);
+}
+
+static void stack_trace_ctrl_update(struct trace_array *tr)
+{
+ /* When starting a new trace, reset the buffers */
+ if (tr->ctrl)
+ start_stack_trace(tr);
+ else
+ stop_stack_trace(tr);
+}
+
+static struct tracer stack_trace __read_mostly =
+{
+ .name = "sysprof",
+ .init = stack_trace_init,
+ .reset = stack_trace_reset,
+ .ctrl_update = stack_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_sysprof,
+#endif
+};
+
+__init static int init_stack_trace(void)
+{
+ return register_tracer(&stack_trace);
+}
+device_initcall(init_stack_trace);
+
+#define MAX_LONG_DIGITS 22
+
+static ssize_t
+sysprof_sample_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_LONG_DIGITS];
+ int r;
+
+ r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+sysprof_sample_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_LONG_DIGITS];
+ unsigned long val;
+
+ if (cnt > MAX_LONG_DIGITS-1)
+ cnt = MAX_LONG_DIGITS-1;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ val = simple_strtoul(buf, NULL, 10);
+ /*
+ * Enforce a minimum sample period of 100 usecs:
+ */
+ if (val < 100)
+ val = 100;
+
+ mutex_lock(&sample_timer_lock);
+ stop_stack_timers();
+ sample_period = val * 1000;
+ start_stack_timers();
+ mutex_unlock(&sample_timer_lock);
+
+ return cnt;
+}
+
+static struct file_operations sysprof_sample_fops = {
+ .read = sysprof_sample_read,
+ .write = sysprof_sample_write,
+};
+
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
+{
+ struct dentry *entry;
+
+ entry = debugfs_create_file("sysprof_sample_period", 0644,
+ d_tracer, NULL, &sysprof_sample_fops);
+ if (entry)
+ return;
+ pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
+}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 0000000..af8c856
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,485 @@
+/*
+ * Copyright (C) 2008 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/tracepoint.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+extern struct tracepoint __start___tracepoints[];
+extern struct tracepoint __stop___tracepoints[];
+
+/* Set to 1 to enable tracepoint debug output */
+static const int tracepoint_debug;
+
+/*
+ * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
+ * builtin and module tracepoints and the hash table.
+ */
+static DEFINE_MUTEX(tracepoints_mutex);
+
+/*
+ * Tracepoint hash table, containing the active tracepoints.
+ * Protected by tracepoints_mutex.
+ */
+#define TRACEPOINT_HASH_BITS 6
+#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+
+/*
+ * Note about RCU:
+ * It is used to delay the freeing of old multiple-probe arrays until a
+ * quiescent state is reached.
+ * Tracepoint entry modifications are protected by the tracepoints_mutex.
+ */
+struct tracepoint_entry {
+ struct hlist_node hlist;
+ void **funcs;
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ struct rcu_head rcu;
+ void *oldptr;
+ unsigned char rcu_pending:1;
+ char name[0];
+};
+
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct tracepoint_entry *entry = container_of(head,
+ struct tracepoint_entry, rcu);
+ kfree(entry->oldptr);
+ /* Make sure we free the data before setting the pending flag to 0 */
+ smp_wmb();
+ entry->rcu_pending = 0;
+}
+
+static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
+{
+ if (!old)
+ return;
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu_sched(&entry->rcu, free_old_closure);
+}
+
+static void debug_print_probes(struct tracepoint_entry *entry)
+{
+ int i;
+
+ if (!tracepoint_debug)
+ return;
+
+ for (i = 0; entry->funcs[i]; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+}
+
+static void *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+{
+ int nr_probes = 0;
+ void **old, **new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->funcs;
+ if (old) {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old[nr_probes]; nr_probes++)
+ if (old[nr_probes] == probe)
+ return ERR_PTR(-EEXIST);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (old)
+ memcpy(new, old, nr_probes * sizeof(void *));
+ new[nr_probes] = probe;
+ entry->refcount = nr_probes + 1;
+ entry->funcs = new;
+ debug_print_probes(entry);
+ return old;
+}
+
+static void *
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ void **old, **new;
+
+ old = entry->funcs;
+
+ if (!old)
+ return NULL;
+
+ debug_print_probes(entry);
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old[nr_probes]; nr_probes++) {
+ if ((!probe || old[nr_probes] == probe))
+ nr_del++;
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->funcs = NULL;
+ entry->refcount = 0;
+ debug_print_probes(entry);
+ return old;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 0) */
+ /* + 1 for NULL */
+ new = kzalloc((nr_probes - nr_del + 1)
+ * sizeof(void *), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old[i]; i++)
+ if ((probe && old[i] != probe))
+ new[j++] = old[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->funcs = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
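Editor's note: both helpers above implement a copy-on-write update of the NULL-terminated probe array: a fresh array is built, the tracepoint is switched over to it, and the old array is only released after an RCU-sched grace period (via tracepoint_entry_free_old()). The fragment below is a hypothetical user-space rendering of the add path only; it frees the old array immediately, which is only legitimate here because the sketch has no concurrent readers.

/*
 * Hypothetical user-space sketch of the copy-on-write probe array used
 * above: readers always see either the old or the new NULL-terminated
 * array, never a half-edited one, because add builds a fresh copy and
 * only the pointer swap is published.  (The kernel defers the kfree of
 * the old array with call_rcu_sched(); this sketch just frees it.)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef void (*probe_fn)(void);

static probe_fn *add_probe(probe_fn *old, probe_fn probe)
{
	int n = 0;

	if (old)
		for (n = 0; old[n]; n++)
			if (old[n] == probe)
				return old;		/* already registered */

	probe_fn *new = calloc(n + 2, sizeof(*new));	/* +1 new, +1 NULL */
	if (!new)
		return old;
	if (old)
		memcpy(new, old, n * sizeof(*new));
	new[n] = probe;
	free(old);					/* kernel: deferred via RCU */
	return new;
}

static void p1(void) { puts("p1"); }
static void p2(void) { puts("p2"); }

int main(void)
{
	probe_fn *funcs = NULL;

	funcs = add_probe(funcs, p1);
	funcs = add_probe(funcs, p2);
	for (int i = 0; funcs[i]; i++)
		funcs[i]();		/* what __DO_TRACE would iterate over */
	free(funcs);
	return 0;
}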
+
+/*
+ * Get tracepoint if the tracepoint is present in the tracepoint hash table.
+ * Must be called with tracepoints_mutex held.
+ * Returns NULL if not present.
+ */
+static struct tracepoint_entry *get_tracepoint(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct tracepoint_entry *e;
+ u32 hash = jhash(name, strlen(name), 0);
+
+ head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name))
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * Add the tracepoint to the tracepoint hash table. Must be called with
+ * tracepoints_mutex held.
+ */
+static struct tracepoint_entry *add_tracepoint(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct tracepoint_entry *e;
+ size_t name_len = strlen(name) + 1;
+ u32 hash = jhash(name, name_len-1, 0);
+
+ head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ printk(KERN_NOTICE
+ "tracepoint %s busy\n", name);
+ return ERR_PTR(-EEXIST); /* Already there */
+ }
+ }
+ /*
+ * Using kmalloc here to allocate a variable length element. Could
+ * cause some memory fragmentation if overused.
+ */
+ e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+ memcpy(&e->name[0], name, name_len);
+ e->funcs = NULL;
+ e->refcount = 0;
+ e->rcu_pending = 0;
+ hlist_add_head(&e->hlist, head);
+ return e;
+}
+
+/*
+ * Remove the tracepoint from the tracepoint hash table. Must be called with
+ * tracepoints_mutex held.
+ */
+static int remove_tracepoint(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct tracepoint_entry *e;
+ int found = 0;
+ size_t len = strlen(name) + 1;
+ u32 hash = jhash(name, len-1, 0);
+
+ head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+ if (e->refcount)
+ return -EBUSY;
+ hlist_del(&e->hlist);
+ /* Make sure the call_rcu_sched has been executed */
+ if (e->rcu_pending)
+ rcu_barrier_sched();
+ kfree(e);
+ return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one tracepoint.
+ */
+static void set_tracepoint(struct tracepoint_entry **entry,
+ struct tracepoint *elem, int active)
+{
+ WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+
+ /*
+ * rcu_assign_pointer has a smp_wmb() which makes sure that the new
+ * probe callbacks array is consistent before setting a pointer to it.
+ * This array is referenced by __DO_TRACE from
+ * include/linux/tracepoint.h. A matching smp_read_barrier_depends()
+ * is used.
+ */
+ rcu_assign_pointer(elem->funcs, (*entry)->funcs);
+ elem->state = active;
+}
+
+/*
+ * Disable a tracepoint and its probe callback.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is
+ * ensured by the preempt_disable around the call site.
+ */
+static void disable_tracepoint(struct tracepoint *elem)
+{
+ elem->state = 0;
+}
+
+/**
+ * tracepoint_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of tracepoints.
+ */
+void tracepoint_update_probe_range(struct tracepoint *begin,
+ struct tracepoint *end)
+{
+ struct tracepoint *iter;
+ struct tracepoint_entry *mark_entry;
+
+ mutex_lock(&tracepoints_mutex);
+ for (iter = begin; iter < end; iter++) {
+ mark_entry = get_tracepoint(iter->name);
+ if (mark_entry) {
+ set_tracepoint(&mark_entry, iter,
+ !!mark_entry->refcount);
+ } else {
+ disable_tracepoint(iter);
+ }
+ }
+ mutex_unlock(&tracepoints_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ */
+static void tracepoint_update_probes(void)
+{
+ /* Core kernel tracepoints */
+ tracepoint_update_probe_range(__start___tracepoints,
+ __stop___tracepoints);
+ /* tracepoints in modules. */
+ module_update_tracepoints();
+}
+
+/**
+ * tracepoint_probe_register - Connect a probe to a tracepoint
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int tracepoint_probe_register(const char *name, void *probe)
+{
+ struct tracepoint_entry *entry;
+ int ret = 0;
+ void *old;
+
+ mutex_lock(&tracepoints_mutex);
+ entry = get_tracepoint(name);
+ if (!entry) {
+ entry = add_tracepoint(name);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto end;
+ }
+ }
+ /*
+ * If we detect that a call_rcu_sched is pending for this tracepoint,
+ * make sure it's executed now.
+ */
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ old = tracepoint_entry_add_probe(entry, probe);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
+ goto end;
+ }
+ mutex_unlock(&tracepoints_mutex);
+ tracepoint_update_probes(); /* may update entry */
+ mutex_lock(&tracepoints_mutex);
+ entry = get_tracepoint(name);
+ WARN_ON(!entry);
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ tracepoint_entry_free_old(entry, old);
+end:
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+
+/**
+ * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled
+ * section has finished.
+ */
+int tracepoint_probe_unregister(const char *name, void *probe)
+{
+ struct tracepoint_entry *entry;
+ void *old;
+ int ret = -ENOENT;
+
+ mutex_lock(&tracepoints_mutex);
+ entry = get_tracepoint(name);
+ if (!entry)
+ goto end;
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ old = tracepoint_entry_remove_probe(entry, probe);
+ if (!old) {
+ printk(KERN_WARNING "Warning: Trying to unregister a probe"
+ "that doesn't exist\n");
+ goto end;
+ }
+ mutex_unlock(&tracepoints_mutex);
+ tracepoint_update_probes(); /* may update entry */
+ mutex_lock(&tracepoints_mutex);
+ entry = get_tracepoint(name);
+ if (!entry)
+ goto end;
+ if (entry->rcu_pending)
+ rcu_barrier_sched();
+ tracepoint_entry_free_old(entry, old);
+ remove_tracepoint(name); /* Ignore busy error message */
+ ret = 0;
+end:
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
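Editor's note: taken together, the two exported functions above give modules a by-name attach/detach API. The sketch below shows how a module might use them; the tracepoint name "my_subsys_event" and the probe signature are invented for illustration and must in reality match a tracepoint declared elsewhere with DECLARE_TRACE().

/*
 * Hypothetical module sketch using the API exported above.  The tracepoint
 * name and probe signature are made up; a real probe must match the
 * arguments of the DECLARE_TRACE() it attaches to.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/tracepoint.h>

static void my_probe(int value)
{
	printk(KERN_INFO "my_subsys_event fired: %d\n", value);
}

static int __init my_probe_init(void)
{
	/* -ENOMEM on allocation failure, -EEXIST if this probe is already attached */
	return tracepoint_probe_register("my_subsys_event", (void *)my_probe);
}

static void __exit my_probe_exit(void)
{
	tracepoint_probe_unregister("my_subsys_event", (void *)my_probe);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");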
+
+/**
+ * tracepoint_get_iter_range - Get the next tracepoint given a range.
+ * @tracepoint: current tracepoint (in), next tracepoint (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next tracepoint has been found (1) or not (0).
+ * Will return the first tracepoint in the range if the input tracepoint is
+ * NULL.
+ */
+int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+ struct tracepoint *begin, struct tracepoint *end)
+{
+ if (!*tracepoint && begin != end) {
+ *tracepoint = begin;
+ return 1;
+ }
+ if (*tracepoint >= begin && *tracepoint < end)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
+
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+ int found = 0;
+
+ /* Core kernel tracepoints */
+ if (!iter->module) {
+ found = tracepoint_get_iter_range(&iter->tracepoint,
+ __start___tracepoints, __stop___tracepoints);
+ if (found)
+ goto end;
+ }
+ /* tracepoints in modules. */
+ found = module_get_iter_tracepoints(iter);
+end:
+ if (!found)
+ tracepoint_iter_reset(iter);
+}
+
+void tracepoint_iter_start(struct tracepoint_iter *iter)
+{
+ tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_start);
+
+void tracepoint_iter_next(struct tracepoint_iter *iter)
+{
+ iter->tracepoint++;
+ /*
+ * iter->tracepoint may be invalid because we blindly incremented it.
+ * Make sure it is valid by walking the tracepoint tables, moving on to
+ * the tracepoints of following modules if necessary.
+ */
+ tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_next);
+
+void tracepoint_iter_stop(struct tracepoint_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
+
+void tracepoint_iter_reset(struct tracepoint_iter *iter)
+{
+ iter->module = NULL;
+ iter->tracepoint = NULL;
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 0000000..5275fba
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,153 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan, <jlan@sgi.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+#include <linux/jiffies.h>
+
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ struct timespec uptime, ts;
+ u64 ac_etime;
+
+ BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+ /* calculate task elapsed time in timespec */
+ do_posix_clock_monotonic_gettime(&uptime);
+ ts = timespec_sub(uptime, tsk->start_time);
+ /* rebase elapsed time to usec (should never be negative) */
+ ac_etime = timespec_to_ns(&ts);
+ do_div(ac_etime, NSEC_PER_USEC);
+ stats->ac_etime = ac_etime;
+ stats->ac_btime = get_seconds() - ts.tv_sec;
+ if (thread_group_leader(tsk)) {
+ stats->ac_exitcode = tsk->exit_code;
+ if (tsk->flags & PF_FORKNOEXEC)
+ stats->ac_flag |= AFORK;
+ }
+ if (tsk->flags & PF_SUPERPRIV)
+ stats->ac_flag |= ASU;
+ if (tsk->flags & PF_DUMPCORE)
+ stats->ac_flag |= ACORE;
+ if (tsk->flags & PF_SIGNALED)
+ stats->ac_flag |= AXSIG;
+ stats->ac_nice = task_nice(tsk);
+ stats->ac_sched = tsk->policy;
+ stats->ac_uid = tsk->uid;
+ stats->ac_gid = tsk->gid;
+ stats->ac_pid = tsk->pid;
+ rcu_read_lock();
+ stats->ac_ppid = pid_alive(tsk) ?
+ rcu_dereference(tsk->real_parent)->tgid : 0;
+ rcu_read_unlock();
+ stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
+ stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_utimescaled =
+ cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
+ stats->ac_stimescaled =
+ cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
+ stats->ac_minflt = tsk->min_flt;
+ stats->ac_majflt = tsk->maj_flt;
+
+ strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
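Editor's note: the begin-time computation above is easy to misread: ac_btime is not stored anywhere in the task, it is reconstructed as "current wall-clock seconds minus the seconds the task has been alive". A tiny hypothetical worked example with made-up numbers:

/*
 * Hypothetical worked example of the ac_etime/ac_btime derivation in
 * bacct_add_tsk().  All numbers are fabricated.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long now = 1230000000ULL;		/* get_seconds(): wall clock */
	unsigned long long elapsed_ns = 42500000000ULL;	/* monotonic task age: 42.5 s */
	unsigned long long elapsed_s = elapsed_ns / 1000000000ULL;
	unsigned long long ac_etime_us = elapsed_ns / 1000ULL;

	printf("ac_etime = %llu usec\n", ac_etime_us);		/* 42500000 */
	printf("ac_btime = %llu (started %llu s ago)\n",
	       now - elapsed_s, elapsed_s);			/* 1229999958, 42 */
	return 0;
}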
+
+
+#ifdef CONFIG_TASK_XACCT
+
+#define KB 1024
+#define MB (1024*KB)
+/*
+ * fill in extended accounting fields
+ */
+void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+ struct mm_struct *mm;
+
+ /* convert pages-usec to Mbyte-usec */
+ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+ stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+ mm = get_task_mm(p);
+ if (mm) {
+ /* adjust to KB unit */
+ stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB;
+ stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
+ mmput(mm);
+ }
+ stats->read_char = p->ioac.rchar;
+ stats->write_char = p->ioac.wchar;
+ stats->read_syscalls = p->ioac.syscr;
+ stats->write_syscalls = p->ioac.syscw;
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ stats->read_bytes = p->ioac.read_bytes;
+ stats->write_bytes = p->ioac.write_bytes;
+ stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+#else
+ stats->read_bytes = 0;
+ stats->write_bytes = 0;
+ stats->cancelled_write_bytes = 0;
+#endif
+}
+#undef KB
+#undef MB
+
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ if (likely(tsk->mm)) {
+ cputime_t time, dtime;
+ struct timeval value;
+ unsigned long flags;
+ u64 delta;
+
+ local_irq_save(flags);
+ time = tsk->stime + tsk->utime;
+ dtime = cputime_sub(time, tsk->acct_timexpd);
+ jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+ delta = value.tv_sec;
+ delta = delta * USEC_PER_SEC + value.tv_usec;
+
+ if (delta == 0)
+ goto out;
+ tsk->acct_timexpd = time;
+ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+ out:
+ local_irq_restore(flags);
+ }
+}
+
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+ tsk->acct_timexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
+}
+#endif
diff --git a/kernel/uid16.c b/kernel/uid16.c
new file mode 100644
index 0000000..67bb92c
--- /dev/null
+++ b/kernel/uid16.c
@@ -0,0 +1,230 @@
+/*
+ * Wrapper functions for 16-bit uid backward compatibility. All nicely tied
+ * together in the faint hope we can take them out in five years time.
+ */
+
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/capability.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
+{
+ long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(3, ret, filename, user, group);
+ return ret;
+}
+
+SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
+{
+ long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(3, ret, filename, user, group);
+ return ret;
+}
+
+SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
+{
+ long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(3, ret, fd, user, group);
+ return ret;
+}
+
+SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
+{
+ long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(2, ret, rgid, egid);
+ return ret;
+}
+
+SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
+{
+ long ret = sys_setgid(low2highgid(gid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(1, ret, gid);
+ return ret;
+}
+
+SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
+{
+ long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(2, ret, ruid, euid);
+ return ret;
+}
+
+SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
+{
+ long ret = sys_setuid(low2highuid(uid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(1, ret, uid);
+ return ret;
+}
+
+SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
+{
+ long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
+ low2highuid(suid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(3, ret, ruid, euid, suid);
+ return ret;
+}
+
+SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
+{
+ int retval;
+
+ if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
+ !(retval = put_user(high2lowuid(current->euid), euid)))
+ retval = put_user(high2lowuid(current->suid), suid);
+
+ return retval;
+}
+
+SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
+{
+ long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
+ low2highgid(sgid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(3, ret, rgid, egid, sgid);
+ return ret;
+}
+
+
+SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
+{
+ int retval;
+
+ if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
+ !(retval = put_user(high2lowgid(current->egid), egid)))
+ retval = put_user(high2lowgid(current->sgid), sgid);
+
+ return retval;
+}
+
+SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
+{
+ long ret = sys_setfsuid(low2highuid(uid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(1, ret, uid);
+ return ret;
+}
+
+SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
+{
+ long ret = sys_setfsgid(low2highgid(gid));
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(1, ret, gid);
+ return ret;
+}
+
+static int groups16_to_user(old_gid_t __user *grouplist,
+ struct group_info *group_info)
+{
+ int i;
+ old_gid_t group;
+
+ for (i = 0; i < group_info->ngroups; i++) {
+ group = high2lowgid(GROUP_AT(group_info, i));
+ if (put_user(group, grouplist+i))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int groups16_from_user(struct group_info *group_info,
+ old_gid_t __user *grouplist)
+{
+ int i;
+ old_gid_t group;
+
+ for (i = 0; i < group_info->ngroups; i++) {
+ if (get_user(group, grouplist+i))
+ return -EFAULT;
+ GROUP_AT(group_info, i) = low2highgid(group);
+ }
+
+ return 0;
+}
+
+SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
+{
+ int i = 0;
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+
+ get_group_info(current->group_info);
+ i = current->group_info->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize) {
+ i = -EINVAL;
+ goto out;
+ }
+ if (groups16_to_user(grouplist, current->group_info)) {
+ i = -EFAULT;
+ goto out;
+ }
+ }
+out:
+ put_group_info(current->group_info);
+ return i;
+}
+
+SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
+{
+ struct group_info *group_info;
+ int retval;
+
+ if (!capable(CAP_SETGID))
+ return -EPERM;
+ if ((unsigned)gidsetsize > NGROUPS_MAX)
+ return -EINVAL;
+
+ group_info = groups_alloc(gidsetsize);
+ if (!group_info)
+ return -ENOMEM;
+ retval = groups16_from_user(group_info, grouplist);
+ if (retval) {
+ put_group_info(group_info);
+ return retval;
+ }
+
+ retval = set_current_groups(group_info);
+ put_group_info(group_info);
+
+ return retval;
+}
+
+SYSCALL_DEFINE0(getuid16)
+{
+ return high2lowuid(current->uid);
+}
+
+SYSCALL_DEFINE0(geteuid16)
+{
+ return high2lowuid(current->euid);
+}
+
+SYSCALL_DEFINE0(getgid16)
+{
+ return high2lowgid(current->gid);
+}
+
+SYSCALL_DEFINE0(getegid16)
+{
+ return high2lowgid(current->egid);
+}
diff --git a/kernel/user.c b/kernel/user.c
new file mode 100644
index 0000000..39d6159
--- /dev/null
+++ b/kernel/user.c
@@ -0,0 +1,526 @@
+/*
+ * The "user cache".
+ *
+ * (C) Copyright 1991-2000 Linus Torvalds
+ *
+ * We have a per-user structure to keep track of how many
+ * processes, files etc the user has claimed, in order to be
+ * able to have per-user limits for system resources.
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/key.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/user_namespace.h>
+
+struct user_namespace init_user_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .root_user = &root_user,
+};
+EXPORT_SYMBOL_GPL(init_user_ns);
+
+/*
+ * UID task count cache, to get fast user lookup in "alloc_uid"
+ * when changing user IDs (i.e. setuid() and friends).
+ */
+
+#define UIDHASH_MASK (UIDHASH_SZ - 1)
+#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
+#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
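Editor's note: the hash above folds the high bits of the UID into the low bits before masking, so UIDs that differ only above UIDHASH_BITS still land in different buckets. A hypothetical stand-alone demo follows; UIDHASH_BITS is assumed to be 7 purely for the demo, the kernel derives the real value from its configuration.

/*
 * Hypothetical demo of the __uidhashfn() fold above.  UIDHASH_BITS = 7 is
 * an assumption made for illustration only.
 */
#include <stdio.h>

#define UIDHASH_BITS 7
#define UIDHASH_SZ   (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define uidhashfn(uid) ((((uid) >> UIDHASH_BITS) + (uid)) & UIDHASH_MASK)

int main(void)
{
	/* 1128 = 1000 + UIDHASH_SZ: collides with 1000 under a plain mask */
	unsigned int uids[] = { 0, 1000, 1128, 2000 };

	for (int i = 0; i < 4; i++)
		printf("uid %-6u -> bucket %u\n", uids[i], uidhashfn(uids[i]));
	return 0;
}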
+
+static struct kmem_cache *uid_cachep;
+
+/*
+ * The uidhash_lock is mostly taken from process context, but it is
+ * occasionally also taken from softirq/tasklet context, when
+ * task-structs get RCU-freed. Hence all locking must be softirq-safe.
+ * But free_uid() is also called with local interrupts disabled, and running
+ * local_bh_enable() with local interrupts disabled is an error - we'll run
+ * softirq callbacks, and they can unconditionally enable interrupts, and
+ * the caller of free_uid() didn't expect that.
+ */
+static DEFINE_SPINLOCK(uidhash_lock);
+
+struct user_struct root_user = {
+ .__count = ATOMIC_INIT(1),
+ .processes = ATOMIC_INIT(1),
+ .files = ATOMIC_INIT(0),
+ .sigpending = ATOMIC_INIT(0),
+ .locked_shm = 0,
+#ifdef CONFIG_USER_SCHED
+ .tg = &init_task_group,
+#endif
+};
+
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
+{
+ hlist_add_head(&up->uidhash_node, hashent);
+}
+
+static void uid_hash_remove(struct user_struct *up)
+{
+ hlist_del_init(&up->uidhash_node);
+}
+
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+ struct user_struct *user;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
+ atomic_inc(&user->__count);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_USER_SCHED
+
+static void sched_destroy_user(struct user_struct *up)
+{
+ sched_destroy_group(up->tg);
+}
+
+static int sched_create_user(struct user_struct *up)
+{
+ int rc = 0;
+
+ up->tg = sched_create_group(&root_task_group);
+ if (IS_ERR(up->tg))
+ rc = -ENOMEM;
+
+ return rc;
+}
+
+static void sched_switch_user(struct task_struct *p)
+{
+ sched_move_task(p);
+}
+
+#else /* CONFIG_USER_SCHED */
+
+static void sched_destroy_user(struct user_struct *up) { }
+static int sched_create_user(struct user_struct *up) { return 0; }
+static void sched_switch_user(struct task_struct *p) { }
+
+#endif /* CONFIG_USER_SCHED */
+
+#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
+
+static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
+static DEFINE_MUTEX(uids_mutex);
+
+static inline void uids_mutex_lock(void)
+{
+ mutex_lock(&uids_mutex);
+}
+
+static inline void uids_mutex_unlock(void)
+{
+ mutex_unlock(&uids_mutex);
+}
+
+/* uid directory attributes */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static ssize_t cpu_shares_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+ return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
+}
+
+static ssize_t cpu_shares_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+ unsigned long shares;
+ int rc;
+
+ sscanf(buf, "%lu", &shares);
+
+ rc = sched_group_set_shares(up->tg, shares);
+
+ return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_share_attr =
+ __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+ return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
+}
+
+static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+ unsigned long rt_runtime;
+ int rc;
+
+ sscanf(buf, "%ld", &rt_runtime);
+
+ rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
+
+ return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_rt_runtime_attr =
+ __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+
+static ssize_t cpu_rt_period_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+ return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
+}
+
+static ssize_t cpu_rt_period_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+ unsigned long rt_period;
+ int rc;
+
+ sscanf(buf, "%lu", &rt_period);
+
+ rc = sched_group_set_rt_period(up->tg, rt_period);
+
+ return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_rt_period_attr =
+ __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
+#endif
+
+/* default attributes per uid directory */
+static struct attribute *uids_attributes[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ &cpu_share_attr.attr,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ &cpu_rt_runtime_attr.attr,
+ &cpu_rt_period_attr.attr,
+#endif
+ NULL
+};
+
+/* the lifetime of user_struct is not managed by the core (for now) */
+static void uids_release(struct kobject *kobj)
+{
+ return;
+}
+
+static struct kobj_type uids_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = uids_attributes,
+ .release = uids_release,
+};
+
+/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
+static int uids_user_create(struct user_struct *up)
+{
+ struct kobject *kobj = &up->kobj;
+ int error;
+
+ memset(kobj, 0, sizeof(struct kobject));
+ kobj->kset = uids_kset;
+ error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
+ if (error) {
+ kobject_put(kobj);
+ goto done;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+done:
+ return error;
+}
+
+/* create these entries in sysfs:
+ * "/sys/kernel/uids" directory
+ * "/sys/kernel/uids/0" directory (for root user)
+ * "/sys/kernel/uids/0/cpu_share" file (for root user)
+ */
+int __init uids_sysfs_init(void)
+{
+ uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
+ if (!uids_kset)
+ return -ENOMEM;
+
+ return uids_user_create(&root_user);
+}
+
+/* work function to remove sysfs directory for a user and free up
+ * corresponding structures.
+ */
+static void remove_user_sysfs_dir(struct work_struct *w)
+{
+ struct user_struct *up = container_of(w, struct user_struct, work);
+ unsigned long flags;
+ int remove_user = 0;
+
+ /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
+ * atomic.
+ */
+ uids_mutex_lock();
+
+ local_irq_save(flags);
+
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ uid_hash_remove(up);
+ remove_user = 1;
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+
+ if (!remove_user)
+ goto done;
+
+ kobject_uevent(&up->kobj, KOBJ_REMOVE);
+ kobject_del(&up->kobj);
+ kobject_put(&up->kobj);
+
+ sched_destroy_user(up);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
+
+done:
+ uids_mutex_unlock();
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+ /* restore back the count */
+ atomic_inc(&up->__count);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+
+ INIT_WORK(&up->work, remove_user_sysfs_dir);
+ schedule_work(&up->work);
+}
+
+#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
+
+int uids_sysfs_init(void) { return 0; }
+static inline int uids_user_create(struct user_struct *up) { return 0; }
+static inline void uids_mutex_lock(void) { }
+static inline void uids_mutex_unlock(void) { }
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+ uid_hash_remove(up);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ sched_destroy_user(up);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
+}
+
+#endif
+
+/*
+ * Locate the user_struct for the passed UID. If found, take a ref on it. The
+ * caller must undo that ref with free_uid().
+ *
+ * If the user_struct could not be found, return NULL.
+ */
+struct user_struct *find_user(uid_t uid)
+{
+ struct user_struct *ret;
+ unsigned long flags;
+ struct user_namespace *ns = current->nsproxy->user_ns;
+
+ spin_lock_irqsave(&uidhash_lock, flags);
+ ret = uid_hash_find(uid, uidhashentry(ns, uid));
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ return ret;
+}
+
+void free_uid(struct user_struct *up)
+{
+ unsigned long flags;
+
+ if (!up)
+ return;
+
+ local_irq_save(flags);
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+ free_user(up, flags);
+ else
+ local_irq_restore(flags);
+}
+
+struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
+{
+ struct hlist_head *hashent = uidhashentry(ns, uid);
+ struct user_struct *up, *new;
+
+ /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
+ * atomic.
+ */
+ uids_mutex_lock();
+
+ spin_lock_irq(&uidhash_lock);
+ up = uid_hash_find(uid, hashent);
+ spin_unlock_irq(&uidhash_lock);
+
+ if (!up) {
+ new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
+ if (!new)
+ goto out_unlock;
+
+ new->uid = uid;
+ atomic_set(&new->__count, 1);
+
+ if (sched_create_user(new) < 0)
+ goto out_free_user;
+
+ if (uids_user_create(new))
+ goto out_destroy_sched;
+
+ /*
+ * Before adding this, check whether we raced
+ * on adding the same user already..
+ */
+ spin_lock_irq(&uidhash_lock);
+ up = uid_hash_find(uid, hashent);
+ if (up) {
+ /* This case is not possible when CONFIG_USER_SCHED
+ * is defined, since we serialize alloc_uid() using
+ * uids_mutex. Hence no need to call
+ * sched_destroy_user() or remove_user_sysfs_dir().
+ */
+ key_put(new->uid_keyring);
+ key_put(new->session_keyring);
+ kmem_cache_free(uid_cachep, new);
+ } else {
+ uid_hash_insert(new, hashent);
+ up = new;
+ }
+ spin_unlock_irq(&uidhash_lock);
+
+ }
+
+ uids_mutex_unlock();
+
+ return up;
+
+out_destroy_sched:
+ sched_destroy_user(new);
+out_free_user:
+ kmem_cache_free(uid_cachep, new);
+out_unlock:
+ uids_mutex_unlock();
+ return NULL;
+}
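Editor's note: alloc_uid() above uses a common pattern: look up under the lock, drop the lock for the blocking allocation, then re-take the lock and re-check before inserting, discarding the freshly built entry if another task won the race. Below is a hypothetical user-space sketch of just that pattern; a single list bucket and a pthread mutex stand in for the uidhash table and uidhash_lock.

/*
 * Hypothetical user-space sketch of the race handling in alloc_uid():
 * the expensive allocation happens without the lock held, so two threads
 * may both allocate; whoever re-checks the table second throws its copy
 * away and uses the entry that won.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct user {
	unsigned int uid;
	int refs;
	struct user *next;
};

static struct user *table;		/* single bucket for brevity */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct user *find(unsigned int uid)
{
	for (struct user *u = table; u; u = u->next)
		if (u->uid == uid) {
			u->refs++;
			return u;
		}
	return NULL;
}

static struct user *get_user_entry(unsigned int uid)
{
	pthread_mutex_lock(&table_lock);
	struct user *u = find(uid);
	pthread_mutex_unlock(&table_lock);
	if (u)
		return u;

	struct user *new = calloc(1, sizeof(*new));	/* slow path, lock dropped */
	if (!new)
		return NULL;
	new->uid = uid;
	new->refs = 1;

	pthread_mutex_lock(&table_lock);
	u = find(uid);			/* did someone beat us to it? */
	if (u) {
		free(new);		/* yes: discard our copy */
	} else {
		new->next = table;	/* no: publish ours */
		table = new;
		u = new;
	}
	pthread_mutex_unlock(&table_lock);
	return u;
}

int main(void)
{
	struct user *a = get_user_entry(1000);
	struct user *b = get_user_entry(1000);
	printf("same entry: %s, refs=%d\n", a == b ? "yes" : "no", a->refs);
	return 0;
}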
+
+void switch_uid(struct user_struct *new_user)
+{
+ struct user_struct *old_user;
+
+ /* What if a process setreuid()'s and this brings the
+ * new uid over his NPROC rlimit? We can check this now
+ * cheaply with the new uid cache, so if it matters
+ * we should be checking for it. -DaveM
+ */
+ old_user = current->user;
+ atomic_inc(&new_user->processes);
+ atomic_dec(&old_user->processes);
+ switch_uid_keyring(new_user);
+ current->user = new_user;
+ sched_switch_user(current);
+
+ /*
+ * We need to synchronize with __sigqueue_alloc()
+ * doing a get_uid(p->user).. If that saw the old
+ * user value, we need to wait until it has exited
+ * its critical region before we can free the old
+ * structure.
+ */
+ smp_mb();
+ spin_unlock_wait(&current->sighand->siglock);
+
+ free_uid(old_user);
+ suid_keys(current);
+}
+
+#ifdef CONFIG_USER_NS
+void release_uids(struct user_namespace *ns)
+{
+ int i;
+ unsigned long flags;
+ struct hlist_head *head;
+ struct hlist_node *nd;
+
+ spin_lock_irqsave(&uidhash_lock, flags);
+ /*
+ * collapse the chains so that the user_structs will
+ * still be alive, but not in the hashes. Subsequent free_uid()
+ * will free them.
+ */
+ for (i = 0; i < UIDHASH_SZ; i++) {
+ head = ns->uidhash_table + i;
+ while (!hlist_empty(head)) {
+ nd = head->first;
+ hlist_del_init(nd);
+ }
+ }
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+
+ free_uid(ns->root_user);
+}
+#endif
+
+static int __init uid_cache_init(void)
+{
+ int n;
+
+ uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+ for(n = 0; n < UIDHASH_SZ; ++n)
+ INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
+
+ /* Insert the root user immediately (init already runs as root) */
+ spin_lock_irq(&uidhash_lock);
+ uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
+ spin_unlock_irq(&uidhash_lock);
+
+ return 0;
+}
+
+module_init(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 0000000..532858f
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,76 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+
+/*
+ * Clone a new ns copying an original user ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), the new ns otherwise
+ */
+static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+{
+ struct user_namespace *ns;
+ struct user_struct *new_user;
+ int n;
+
+ ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+ if (!ns)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&ns->kref);
+
+ for (n = 0; n < UIDHASH_SZ; ++n)
+ INIT_HLIST_HEAD(ns->uidhash_table + n);
+
+ /* Insert new root user. */
+ ns->root_user = alloc_uid(ns, 0);
+ if (!ns->root_user) {
+ kfree(ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Reset current->user with a new one */
+ new_user = alloc_uid(ns, current->uid);
+ if (!new_user) {
+ free_uid(ns->root_user);
+ kfree(ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ switch_uid(new_user);
+ return ns;
+}
+
+struct user_namespace *copy_user_ns(int flags, struct user_namespace *old_ns)
+{
+ struct user_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ get_user_ns(old_ns);
+
+ if (!(flags & CLONE_NEWUSER))
+ return old_ns;
+
+ new_ns = clone_user_ns(old_ns);
+
+ put_user_ns(old_ns);
+ return new_ns;
+}
+
+void free_user_ns(struct kref *kref)
+{
+ struct user_namespace *ns;
+
+ ns = container_of(kref, struct user_namespace, kref);
+ release_uids(ns);
+ kfree(ns);
+}
+EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
new file mode 100644
index 0000000..815237a
--- /dev/null
+++ b/kernel/utsname.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+/*
+ * Clone a new ns copying an original utsname, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), the new ns otherwise
+ */
+static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+{
+ struct uts_namespace *ns;
+
+ ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (!ns)
+ return ERR_PTR(-ENOMEM);
+
+ down_read(&uts_sem);
+ memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ up_read(&uts_sem);
+ kref_init(&ns->kref);
+ return ns;
+}
+
+/*
+ * Copy task tsk's utsname namespace, or clone it if flags
+ * specifies CLONE_NEWUTS. In the latter case, changes to the
+ * utsname of this process won't be seen by parent, and vice
+ * versa.
+ */
+struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
+{
+ struct uts_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ get_uts_ns(old_ns);
+
+ if (!(flags & CLONE_NEWUTS))
+ return old_ns;
+
+ new_ns = clone_uts_ns(old_ns);
+
+ put_uts_ns(old_ns);
+ return new_ns;
+}
+
+void free_uts_ns(struct kref *kref)
+{
+ struct uts_namespace *ns;
+
+ ns = container_of(kref, struct uts_namespace, kref);
+ kfree(ns);
+}
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
new file mode 100644
index 0000000..3b34b35
--- /dev/null
+++ b/kernel/utsname_sysctl.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2007
+ *
+ * Author: Eric Biederman <ebiederm@xmision.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+
+static void *get_uts(ctl_table *table, int write)
+{
+ char *which = table->data;
+ struct uts_namespace *uts_ns;
+
+ uts_ns = current->nsproxy->uts_ns;
+ which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
+
+ if (!write)
+ down_read(&uts_sem);
+ else
+ down_write(&uts_sem);
+ return which;
+}
+
+static void put_uts(ctl_table *table, int write, void *which)
+{
+ if (!write)
+ up_read(&uts_sem);
+ else
+ up_write(&uts_sem);
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Special case of proc_dostring() for the UTS structure; it has
+ * locking requirements to observe. Should this live in kernel/sys.c?
+ */
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table uts_table;
+ int r;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = proc_dostring(&uts_table, write, filp, buffer, lenp, ppos);
+ put_uts(table, write, uts_table.data);
+ return r;
+}
+#else
+#define proc_do_uts_string NULL
+#endif
+
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+/* The generic string strategy routine: */
+static int sysctl_uts_string(ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ struct ctl_table uts_table;
+ int r, write;
+ write = newval && newlen;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
+ put_uts(table, write, uts_table.data);
+ return r;
+}
+#else
+#define sysctl_uts_string NULL
+#endif
+
+static struct ctl_table uts_kern_table[] = {
+ {
+ .ctl_name = KERN_OSTYPE,
+ .procname = "ostype",
+ .data = init_uts_ns.name.sysname,
+ .maxlen = sizeof(init_uts_ns.name.sysname),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_OSRELEASE,
+ .procname = "osrelease",
+ .data = init_uts_ns.name.release,
+ .maxlen = sizeof(init_uts_ns.name.release),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_VERSION,
+ .procname = "version",
+ .data = init_uts_ns.name.version,
+ .maxlen = sizeof(init_uts_ns.name.version),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_NODENAME,
+ .procname = "hostname",
+ .data = init_uts_ns.name.nodename,
+ .maxlen = sizeof(init_uts_ns.name.nodename),
+ .mode = 0644,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_DOMAINNAME,
+ .procname = "domainname",
+ .data = init_uts_ns.name.domainname,
+ .maxlen = sizeof(init_uts_ns.name.domainname),
+ .mode = 0644,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {}
+};
+
+static struct ctl_table uts_root_table[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = uts_kern_table,
+ },
+ {}
+};
+
+static int __init utsname_sysctl_init(void)
+{
+ register_sysctl_table(uts_root_table);
+ return 0;
+}
+
+__initcall(utsname_sysctl_init);
diff --git a/kernel/wait.c b/kernel/wait.c
new file mode 100644
index 0000000..42a2dbc
--- /dev/null
+++ b/kernel/wait.c
@@ -0,0 +1,288 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 William Irwin, Oracle
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+
+void init_waitqueue_head(wait_queue_head_t *q)
+{
+ spin_lock_init(&q->lock);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(init_waitqueue_head);
+
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue_tail(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __remove_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the critical region).
+ */
+void
+prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
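
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * the canonical open-coded wait loop built on prepare_to_wait() and
 * finish_wait() below.  The wait queue "my_waitq" and the predicate
 * my_condition() are hypothetical placeholders.
 */
static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static int my_condition(void);		/* hypothetical "are we done?" predicate */

static int wait_for_my_condition(void)
{
	DEFINE_WAIT(wait);
	int err = 0;

	for (;;) {
		prepare_to_wait(&my_waitq, &wait, TASK_INTERRUPTIBLE);
		if (my_condition())
			break;
		if (signal_pending(current)) {
			err = -ERESTARTSYS;
			break;
		}
		schedule();
	}
	finish_wait(&my_waitq, &wait);
	return err;
}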
+
+void
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue_tail(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+/*
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ /*
+ * We can check for list emptiness outside the lock
+ * IFF:
+ * - we use the "careful" check that verifies both
+ * the next and prev pointers, so that there cannot
+ * be any half-pending updates in progress on other
+ * CPUs that we haven't seen yet (and that might
+ * still change the stack area),
+ * and
+ * - all other users take the lock (ie we can only
+ * have _one_ other CPU that looks at or modifies
+ * the list).
+ */
+ if (!list_empty_careful(&wait->task_list)) {
+ spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_wait);
+
+/*
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @mode: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and no one wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_common(q, mode, 1, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+
+ if (ret)
+ list_del_init(&wait->task_list);
+ return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+ else
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the @action callbacks passed to __wait_on_bit() and
+ * __wait_on_bit_lock() are permitted to return nonzero. A nonzero
+ * return code halts the wait and is returned to the caller.
+ */
+int __sched
+__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(void *), unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ if (test_bit(q->key.bit_nr, q->key.flags))
+ ret = (*action)(q->key.flags);
+ } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ int (*action)(void *), unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
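
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * a minimal @action callback for the bit-wait API in <linux/wait.h>.
 * "my_flags" and MY_BUSY_BIT are hypothetical; the action just sleeps
 * and returns 0, so the loop above keeps waiting until the bit clears.
 */
static unsigned long my_flags;
#define MY_BUSY_BIT	0

static int my_bit_wait(void *word)
{
	schedule();
	return 0;
}

static int wait_until_not_busy(void)
{
	/* sleeps until MY_BUSY_BIT is cleared and wake_up_bit() is called */
	return wait_on_bit(&my_flags, MY_BUSY_BIT, my_bit_wait,
			   TASK_UNINTERRUPTIBLE);
}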
+
+int __sched
+__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(void *), unsigned mode)
+{
+ do {
+ int ret;
+
+ prepare_to_wait_exclusive(wq, &q->wait, mode);
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(q->key.flags);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
+ } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ finish_wait(wq, &q->wait);
+ return 0;
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ int (*action)(void *), unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit_lock(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_clear_bit(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such as fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ const struct zone *zone = page_zone(virt_to_page(word));
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
+}
+EXPORT_SYMBOL(bit_waitqueue);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
new file mode 100644
index 0000000..d4dc69d
--- /dev/null
+++ b/kernel/workqueue.c
@@ -0,0 +1,1026 @@
+/*
+ * linux/kernel/workqueue.c
+ *
+ * Generic mechanism for defining kernel helper threads for running
+ * arbitrary tasks in process context.
+ *
+ * Started by Ingo Molnar, Copyright (C) 2002
+ *
+ * Derived from the taskqueue/keventd code by:
+ *
+ * David Woodhouse <dwmw2@infradead.org>
+ * Andrew Morton
+ * Kai Petzke <wpp@marie.physik.tu-berlin.de>
+ * Theodore Ts'o <tytso@mit.edu>
+ *
+ * Made to use alloc_percpu by Christoph Lameter.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/completion.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
+#include <linux/mempolicy.h>
+#include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+#include <linux/lockdep.h>
+
+/*
+ * The per-CPU workqueue (if single thread, we always use the first
+ * possible cpu).
+ */
+struct cpu_workqueue_struct {
+
+ spinlock_t lock;
+
+ struct list_head worklist;
+ wait_queue_head_t more_work;
+ struct work_struct *current_work;
+
+ struct workqueue_struct *wq;
+ struct task_struct *thread;
+
+ int run_depth; /* Detect run_workqueue() recursion depth */
+} ____cacheline_aligned;
+
+/*
+ * The externally visible workqueue abstraction is an array of
+ * per-CPU workqueues:
+ */
+struct workqueue_struct {
+ struct cpu_workqueue_struct *cpu_wq;
+ struct list_head list;
+ const char *name;
+ int singlethread;
+ int freezeable; /* Freeze threads during suspend */
+ int rt;
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map lockdep_map;
+#endif
+};
+
+/* Serializes the accesses to the list of workqueues. */
+static DEFINE_SPINLOCK(workqueue_lock);
+static LIST_HEAD(workqueues);
+
+static int singlethread_cpu __read_mostly;
+static cpumask_t cpu_singlethread_map __read_mostly;
+/*
+ * _cpu_down() first removes the CPU from cpu_online_map, then CPU_DEAD
+ * flushes cwq->worklist. This means that a flush_workqueue/wait_on_work
+ * which comes in between can't use for_each_online_cpu(). We could
+ * simply use cpu_possible_map; the cpumask below is more documentation
+ * than optimization.
+ */
+static cpumask_t cpu_populated_map __read_mostly;
+
+/* If it's single threaded, it isn't in the list of workqueues. */
+static inline int is_single_threaded(struct workqueue_struct *wq)
+{
+ return wq->singlethread;
+}
+
+static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
+{
+ return is_single_threaded(wq)
+ ? &cpu_singlethread_map : &cpu_populated_map;
+}
+
+static
+struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+{
+ if (unlikely(is_single_threaded(wq)))
+ cpu = singlethread_cpu;
+ return per_cpu_ptr(wq->cpu_wq, cpu);
+}
+
+/*
+ * Set the workqueue on which a work item is to be run
+ * - Must *only* be called if the pending flag is set
+ */
+static inline void set_wq_data(struct work_struct *work,
+ struct cpu_workqueue_struct *cwq)
+{
+ unsigned long new;
+
+ BUG_ON(!work_pending(work));
+
+ new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
+ new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
+ atomic_long_set(&work->data, new);
+}
+
+static inline
+struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
+{
+ return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
+}
+
+static void insert_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work, struct list_head *head)
+{
+ set_wq_data(work, cwq);
+ /*
+ * Ensure that we get the right work->data if we see the
+ * result of list_add() below, see try_to_grab_pending().
+ */
+ smp_wmb();
+ list_add_tail(&work->entry, head);
+ wake_up(&cwq->more_work);
+}
+
+static void __queue_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cwq->lock, flags);
+ insert_work(cwq, work, &cwq->worklist);
+ spin_unlock_irqrestore(&cwq->lock, flags);
+}
+
+/**
+ * queue_work - queue work on a workqueue
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
+ * it can be processed by another CPU.
+ */
+int queue_work(struct workqueue_struct *wq, struct work_struct *work)
+{
+ int ret;
+
+ ret = queue_work_on(get_cpu(), wq, work);
+ put_cpu();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work);
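
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * queueing a statically initialized work item on a private workqueue.
 * "my_wq", my_work_fn() and "my_work" are hypothetical names.
 */
static void my_work_fn(struct work_struct *work)
{
	/* runs later, in process context, on one of my_wq's worker threads */
}

static DECLARE_WORK(my_work, my_work_fn);
static struct workqueue_struct *my_wq;

static int my_init(void)
{
	my_wq = create_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;
	queue_work(my_wq, &my_work);	/* returns 0 if already queued */
	return 0;
}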
+
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ */
+int
+queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+{
+ int ret = 0;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+ BUG_ON(!list_empty(&work->entry));
+ __queue_work(wq_per_cpu(wq, cpu), work);
+ ret = 1;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_on);
+
+static void delayed_work_timer_fn(unsigned long __data)
+{
+ struct delayed_work *dwork = (struct delayed_work *)__data;
+ struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
+ struct workqueue_struct *wq = cwq->wq;
+
+ __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
+}
+
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @dwork: delayable work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns 0 if @dwork was already on a queue, non-zero otherwise.
+ */
+int queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ if (delay == 0)
+ return queue_work(wq, &dwork->work);
+
+ return queue_delayed_work_on(-1, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work);
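
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * a self-rearming poller on the hypothetical "my_wq" from the previous
 * sketch; msecs_to_jiffies() converts the delay to jiffies.
 */
static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_dwork, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* do the periodic work, then re-arm for one second from now */
	queue_delayed_work(my_wq, &my_dwork, msecs_to_jiffies(1000));
}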
+
+/**
+ * queue_delayed_work_on - queue work on specific CPU after delay
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns 0 if @dwork was already on a queue, non-zero otherwise.
+ */
+int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ int ret = 0;
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+ BUG_ON(timer_pending(timer));
+ BUG_ON(!list_empty(&work->entry));
+
+ timer_stats_timer_set_start_info(&dwork->timer);
+
+ /* This stores cwq for the moment, for the timer_fn */
+ set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
+ timer->expires = jiffies + delay;
+ timer->data = (unsigned long)dwork;
+ timer->function = delayed_work_timer_fn;
+
+ if (unlikely(cpu >= 0))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
+ ret = 1;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+
+static void run_workqueue(struct cpu_workqueue_struct *cwq)
+{
+ spin_lock_irq(&cwq->lock);
+ cwq->run_depth++;
+ if (cwq->run_depth > 3) {
+ /* morton gets to eat his hat */
+ printk("%s: recursion depth exceeded: %d\n",
+ __func__, cwq->run_depth);
+ dump_stack();
+ }
+ while (!list_empty(&cwq->worklist)) {
+ struct work_struct *work = list_entry(cwq->worklist.next,
+ struct work_struct, entry);
+ work_func_t f = work->func;
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It is permissible to free the struct work_struct
+ * from inside the function that is called from it;
+ * we need to take that into account for lockdep too.
+ * To avoid bogus "held lock freed" warnings as well
+ * as problems when looking into work->lockdep_map,
+ * make a copy and use that here.
+ */
+ struct lockdep_map lockdep_map = work->lockdep_map;
+#endif
+
+ cwq->current_work = work;
+ list_del_init(cwq->worklist.next);
+ spin_unlock_irq(&cwq->lock);
+
+ BUG_ON(get_wq_data(work) != cwq);
+ work_clear_pending(work);
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire(&lockdep_map);
+ f(work);
+ lock_map_release(&lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
+
+ if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+ printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(),
+ task_pid_nr(current));
+ printk(KERN_ERR " last function: ");
+ print_symbol("%s\n", (unsigned long)f);
+ debug_show_held_locks(current);
+ dump_stack();
+ }
+
+ spin_lock_irq(&cwq->lock);
+ cwq->current_work = NULL;
+ }
+ cwq->run_depth--;
+ spin_unlock_irq(&cwq->lock);
+}
+
+static int worker_thread(void *__cwq)
+{
+ struct cpu_workqueue_struct *cwq = __cwq;
+ DEFINE_WAIT(wait);
+
+ if (cwq->wq->freezeable)
+ set_freezable();
+
+ set_user_nice(current, -5);
+
+ for (;;) {
+ prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
+ if (!freezing(current) &&
+ !kthread_should_stop() &&
+ list_empty(&cwq->worklist))
+ schedule();
+ finish_wait(&cwq->more_work, &wait);
+
+ try_to_freeze();
+
+ if (kthread_should_stop())
+ break;
+
+ run_workqueue(cwq);
+ }
+
+ return 0;
+}
+
+struct wq_barrier {
+ struct work_struct work;
+ struct completion done;
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+ struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+ complete(&barr->done);
+}
+
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_barrier *barr, struct list_head *head)
+{
+ INIT_WORK(&barr->work, wq_barrier_func);
+ __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
+
+ init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, head);
+}
+
+static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+{
+ int active;
+
+ if (cwq->thread == current) {
+ /*
+ * Probably keventd trying to flush its own queue. So simply run
+ * it by hand rather than deadlocking.
+ */
+ run_workqueue(cwq);
+ active = 1;
+ } else {
+ struct wq_barrier barr;
+
+ active = 0;
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, &cwq->worklist);
+ active = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ if (active)
+ wait_for_completion(&barr.done);
+ }
+
+ return active;
+}
+
+/**
+ * flush_workqueue - ensure that any scheduled work has run to completion.
+ * @wq: workqueue to flush
+ *
+ * Forces execution of the workqueue and blocks until its completion.
+ * This is typically used in driver shutdown handlers.
+ *
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
+ *
+ * This function used to run the workqueues itself. Now we just wait for the
+ * helper threads to do it.
+ */
+void flush_workqueue(struct workqueue_struct *wq)
+{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ int cpu;
+
+ might_sleep();
+ lock_map_acquire(&wq->lockdep_map);
+ lock_map_release(&wq->lockdep_map);
+ for_each_cpu_mask_nr(cpu, *cpu_map)
+ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+}
+EXPORT_SYMBOL_GPL(flush_workqueue);
+
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * Returns false if @work has already terminated.
+ *
+ * It is expected that, prior to calling flush_work(), the caller has
+ * arranged for the work to not be requeued, otherwise it doesn't make
+ * sense to use this function.
+ */
+int flush_work(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ struct list_head *prev;
+ struct wq_barrier barr;
+
+ might_sleep();
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return 0;
+
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
+
+ prev = NULL;
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * See the comment near try_to_grab_pending()->smp_rmb().
+ * If it was re-queued under us we are not going to wait.
+ */
+ smp_rmb();
+ if (unlikely(cwq != get_wq_data(work)))
+ goto out;
+ prev = &work->entry;
+ } else {
+ if (cwq->current_work != work)
+ goto out;
+ prev = &cwq->worklist;
+ }
+ insert_wq_barrier(cwq, &barr, prev->next);
+out:
+ spin_unlock_irq(&cwq->lock);
+ if (!prev)
+ return 0;
+
+ wait_for_completion(&barr.done);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(flush_work);
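
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * waiting for one specific item, the hypothetical "my_work" from the
 * queue_work() sketch, rather than flushing the whole queue.  The caller
 * must already have ensured my_work cannot be requeued, as noted above.
 */
static void my_quiesce(void)
{
	flush_work(&my_work);	/* returns 0 if my_work had already finished */
}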
+
+/*
+ * Upon a successful return (>= 0), the caller "owns" the WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ */
+static int try_to_grab_pending(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ int ret = -1;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ return 0;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return ret;
+
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * This work is queued, but perhaps we locked the wrong cwq.
+ * In that case we must see the new value after rmb(), see
+ * insert_work()->wmb().
+ */
+ smp_rmb();
+ if (cwq == get_wq_data(work)) {
+ list_del_init(&work->entry);
+ ret = 1;
+ }
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ return ret;
+}
+
+static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work)
+{
+ struct wq_barrier barr;
+ int running = 0;
+
+ spin_lock_irq(&cwq->lock);
+ if (unlikely(cwq->current_work == work)) {
+ insert_wq_barrier(cwq, &barr, cwq->worklist.next);
+ running = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ if (unlikely(running))
+ wait_for_completion(&barr.done);
+}
+
+static void wait_on_work(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ struct workqueue_struct *wq;
+ const cpumask_t *cpu_map;
+ int cpu;
+
+ might_sleep();
+
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return;
+
+ wq = cwq->wq;
+ cpu_map = wq_cpu_map(wq);
+
+ for_each_cpu_mask_nr(cpu, *cpu_map)
+ wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+}
+
+static int __cancel_work_timer(struct work_struct *work,
+ struct timer_list* timer)
+{
+ int ret;
+
+ do {
+ ret = (timer && likely(del_timer(timer)));
+ if (!ret)
+ ret = try_to_grab_pending(work);
+ wait_on_work(work);
+ } while (unlikely(ret < 0));
+
+ work_clear_pending(work);
+ return ret;
+}
+
+/**
+ * cancel_work_sync - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * Returns true if @work was pending.
+ *
+ * cancel_work_sync() will cancel the work if it is queued. If the work's
+ * callback appears to be running, cancel_work_sync() will block until it
+ * has completed.
+ *
+ * It is possible to use this function if the work re-queues itself. It can
+ * cancel the work even if it migrates to another workqueue; however, in that
+ * case it only guarantees that work->func() has completed on the last queued
+ * workqueue.
+ *
+ * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
+ * pending, otherwise it goes into a busy-wait loop until the timer expires.
+ *
+ * The caller must ensure that workqueue_struct on which this work was last
+ * queued can't be destroyed before this function returns.
+ */
+int cancel_work_sync(struct work_struct *work)
+{
+ return __cancel_work_timer(work, NULL);
+}
+EXPORT_SYMBOL_GPL(cancel_work_sync);
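
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * teardown of the hypothetical "my_work".  Unlike flush_work(), this also
 * removes the item from the queue if it has not started running yet.
 */
static void my_stop(void)
{
	if (cancel_work_sync(&my_work))
		printk(KERN_DEBUG "my_work was still pending\n");
}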
+
+/**
+ * cancel_delayed_work_sync - reliably kill off a delayed work.
+ * @dwork: the delayed work struct
+ *
+ * Returns true if @dwork was pending.
+ *
+ * It is possible to use this function if @dwork rearms itself via queue_work()
+ * or queue_delayed_work(). See also the comment for cancel_work_sync().
+ */
+int cancel_delayed_work_sync(struct delayed_work *dwork)
+{
+ return __cancel_work_timer(&dwork->work, &dwork->timer);
+}
+EXPORT_SYMBOL(cancel_delayed_work_sync);
+
+static struct workqueue_struct *keventd_wq __read_mostly;
+
+/**
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
+ *
+ * This puts a job in the kernel-global workqueue.
+ */
+int schedule_work(struct work_struct *work)
+{
+ return queue_work(keventd_wq, work);
+}
+EXPORT_SYMBOL(schedule_work);
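
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * the classic pattern of deferring interrupt-time work into the global
 * "events" workqueue.  struct my_device, my_bh_fn() and my_irq_handler()
 * are hypothetical, and <linux/interrupt.h> is assumed for irqreturn_t.
 */
struct my_device {
	struct work_struct bh_work;	/* set up with INIT_WORK(&dev->bh_work, my_bh_fn) */
};

static void my_bh_fn(struct work_struct *work)
{
	struct my_device *dev = container_of(work, struct my_device, bh_work);

	printk(KERN_DEBUG "process-context bottom half for %p\n", dev);
}

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	struct my_device *dev = dev_id;

	schedule_work(&dev->bh_work);	/* returns 0 if it was already queued */
	return IRQ_HANDLED;
}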
+
+/**
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu.
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+ return queue_work_on(cpu, keventd_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
+
+/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
+int schedule_delayed_work(struct delayed_work *dwork,
+ unsigned long delay)
+{
+ return queue_delayed_work(keventd_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work);
+
+/**
+ * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
+ * @cpu: cpu to use
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue on the specified CPU.
+ */
+int schedule_delayed_work_on(int cpu,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work_on);
+
+/**
+ * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * @func: the function to call
+ *
+ * Returns zero on success.
+ * Returns a negative errno on failure.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+ int cpu;
+ struct work_struct *works;
+
+ works = alloc_percpu(struct work_struct);
+ if (!works)
+ return -ENOMEM;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, func);
+ schedule_work_on(cpu, work);
+ }
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
+ put_online_cpus();
+ free_percpu(works);
+ return 0;
+}
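
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * running a short callback once on every online CPU via keventd.
 * my_percpu_sync() is a hypothetical work function.
 */
static void my_percpu_sync(struct work_struct *unused)
{
	/* executes on each online CPU in turn, in process context */
}

static int my_sync_all_cpus(void)
{
	return schedule_on_each_cpu(my_percpu_sync);	/* 0 or -ENOMEM */
}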
+
+void flush_scheduled_work(void)
+{
+ flush_workqueue(keventd_wq);
+}
+EXPORT_SYMBOL(flush_scheduled_work);
+
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn: the function to execute
+ * @ew: guaranteed storage for the execute work structure (must
+ * be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Returns: 0 - function was executed
+ * 1 - function was scheduled for execution
+ */
+int execute_in_process_context(work_func_t fn, struct execute_work *ew)
+{
+ if (!in_interrupt()) {
+ fn(&ew->work);
+ return 0;
+ }
+
+ INIT_WORK(&ew->work, fn);
+ schedule_work(&ew->work);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
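
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * releasing an object that needs process context from a path that may
 * run in interrupt context.  struct my_object, my_release() and my_put()
 * are hypothetical; the execute_work storage lives in the object itself.
 */
struct my_object {
	struct execute_work release_ew;
	/* ... */
};

static void my_release(struct work_struct *work)
{
	struct my_object *obj = container_of(work, struct my_object,
					     release_ew.work);
	kfree(obj);
}

static void my_put(struct my_object *obj)
{
	/* runs my_release() immediately if possible, otherwise via keventd */
	execute_in_process_context(my_release, &obj->release_ew);
}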
+
+int keventd_up(void)
+{
+ return keventd_wq != NULL;
+}
+
+int current_is_keventd(void)
+{
+ struct cpu_workqueue_struct *cwq;
+ int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
+ int ret = 0;
+
+ BUG_ON(!keventd_wq);
+
+ cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
+ if (current == cwq->thread)
+ ret = 1;
+
+ return ret;
+}
+
+static struct cpu_workqueue_struct *
+init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
+{
+ struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+
+ cwq->wq = wq;
+ spin_lock_init(&cwq->lock);
+ INIT_LIST_HEAD(&cwq->worklist);
+ init_waitqueue_head(&cwq->more_work);
+
+ return cwq;
+}
+
+static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ struct workqueue_struct *wq = cwq->wq;
+ const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
+ struct task_struct *p;
+
+ p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
+ /*
+ * Nobody can add the work_struct to this cwq,
+ * if (caller is __create_workqueue)
+ * nobody should see this wq
+ * else // caller is CPU_UP_PREPARE
+ * cpu is not on cpu_online_map
+ * so we can abort safely.
+ */
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (cwq->wq->rt)
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+ cwq->thread = p;
+
+ return 0;
+}
+
+static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct task_struct *p = cwq->thread;
+
+ if (p != NULL) {
+ if (cpu >= 0)
+ kthread_bind(p, cpu);
+ wake_up_process(p);
+ }
+}
+
+struct workqueue_struct *__create_workqueue_key(const char *name,
+ int singlethread,
+ int freezeable,
+ int rt,
+ struct lock_class_key *key,
+ const char *lock_name)
+{
+ struct workqueue_struct *wq;
+ struct cpu_workqueue_struct *cwq;
+ int err = 0, cpu;
+
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return NULL;
+
+ wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
+ if (!wq->cpu_wq) {
+ kfree(wq);
+ return NULL;
+ }
+
+ wq->name = name;
+ lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
+ wq->singlethread = singlethread;
+ wq->freezeable = freezeable;
+ wq->rt = rt;
+ INIT_LIST_HEAD(&wq->list);
+
+ if (singlethread) {
+ cwq = init_cpu_workqueue(wq, singlethread_cpu);
+ err = create_workqueue_thread(cwq, singlethread_cpu);
+ start_workqueue_thread(cwq, -1);
+ } else {
+ cpu_maps_update_begin();
+ /*
+ * We must place this wq on the list even if the code below fails.
+ * cpu_down(cpu) can remove cpu from cpu_populated_map before
+ * destroy_workqueue() takes the lock; in that case we leak
+ * cwq[cpu]->thread.
+ */
+ spin_lock(&workqueue_lock);
+ list_add(&wq->list, &workqueues);
+ spin_unlock(&workqueue_lock);
+ /*
+ * We must initialize cwqs for each possible cpu even if we
+ * are going to call destroy_workqueue() finally. Otherwise
+ * cpu_up() can hit the uninitialized cwq once we drop the
+ * lock.
+ */
+ for_each_possible_cpu(cpu) {
+ cwq = init_cpu_workqueue(wq, cpu);
+ if (err || !cpu_online(cpu))
+ continue;
+ err = create_workqueue_thread(cwq, cpu);
+ start_workqueue_thread(cwq, cpu);
+ }
+ cpu_maps_update_done();
+ }
+
+ if (err) {
+ destroy_workqueue(wq);
+ wq = NULL;
+ }
+ return wq;
+}
+EXPORT_SYMBOL_GPL(__create_workqueue_key);
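
/*
 * Illustrative note (editor's addition, not part of the upstream diff):
 * callers normally reach __create_workqueue_key() through the wrapper
 * macros in <linux/workqueue.h>, e.g. a single-threaded queue for a
 * hypothetical driver:
 */
static struct workqueue_struct *my_st_wq;

static int my_setup(void)
{
	/* one worker thread total, bound to singlethread_cpu */
	my_st_wq = create_singlethread_workqueue("my_st_wq");
	return my_st_wq ? 0 : -ENOMEM;
}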
+
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
+{
+ /*
+ * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
+ * cpu_add_remove_lock protects cwq->thread.
+ */
+ if (cwq->thread == NULL)
+ return;
+
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
+
+ flush_cpu_workqueue(cwq);
+ /*
+ * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
+ * a concurrent flush_workqueue() can insert a barrier after us.
+ * However, in that case run_workqueue() won't return and check
+ * kthread_should_stop() until it flushes all work_struct's.
+ * When ->worklist becomes empty it is safe to exit because no
+ * more work_structs can be queued on this cwq: flush_workqueue
+ * checks list_empty(), and a "normal" queue_work() can't use
+ * a dead CPU.
+ */
+ kthread_stop(cwq->thread);
+ cwq->thread = NULL;
+}
+
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ int cpu;
+
+ cpu_maps_update_begin();
+ spin_lock(&workqueue_lock);
+ list_del(&wq->list);
+ spin_unlock(&workqueue_lock);
+
+ for_each_cpu_mask_nr(cpu, *cpu_map)
+ cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
+ cpu_maps_update_done();
+
+ free_percpu(wq->cpu_wq);
+ kfree(wq);
+}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
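
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * a typical module-exit path for the hypothetical "my_wq"/"my_dwork"
 * used in the earlier sketches: stop re-arming, drain, then destroy.
 */
static void my_exit(void)
{
	cancel_delayed_work_sync(&my_dwork);	/* kills the poller and its timer */
	flush_workqueue(my_wq);			/* wait for anything still queued */
	destroy_workqueue(my_wq);
}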
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct cpu_workqueue_struct *cwq;
+ struct workqueue_struct *wq;
+ int ret = NOTIFY_OK;
+
+ action &= ~CPU_TASKS_FROZEN;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ cpu_set(cpu, cpu_populated_map);
+ }
+undo:
+ list_for_each_entry(wq, &workqueues, list) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!create_workqueue_thread(cwq, cpu))
+ break;
+ printk(KERN_ERR "workqueue [%s] for %i failed\n",
+ wq->name, cpu);
+ action = CPU_UP_CANCELED;
+ ret = NOTIFY_BAD;
+ goto undo;
+
+ case CPU_ONLINE:
+ start_workqueue_thread(cwq, cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ start_workqueue_thread(cwq, -1);
+ case CPU_POST_DEAD:
+ cleanup_workqueue_thread(cwq);
+ break;
+ }
+ }
+
+ switch (action) {
+ case CPU_UP_CANCELED:
+ case CPU_POST_DEAD:
+ cpu_clear(cpu, cpu_populated_map);
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+ struct work_struct work;
+ long (*fn)(void *);
+ void *arg;
+ long ret;
+};
+
+static void do_work_for_cpu(struct work_struct *w)
+{
+ struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+
+ wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL if the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+ struct work_for_cpu wfc;
+
+ INIT_WORK(&wfc.work, do_work_for_cpu);
+ wfc.fn = fn;
+ wfc.arg = arg;
+ get_online_cpus();
+ if (unlikely(!cpu_online(cpu)))
+ wfc.ret = -EINVAL;
+ else {
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ }
+ put_online_cpus();
+
+ return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
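
/*
 * Illustrative sketch (editor's addition, not part of the upstream diff):
 * reading a per-CPU counter on a specific CPU.  The per-CPU variable
 * "my_counter" and my_counter_on() are hypothetical, and
 * <linux/percpu.h> is assumed for DEFINE_PER_CPU()/__get_cpu_var().
 */
static DEFINE_PER_CPU(long, my_counter);

static long my_read_counter(void *arg)
{
	/* runs bound to the requested CPU's keventd thread */
	return __get_cpu_var(my_counter);
}

static long my_counter_on(unsigned int cpu)
{
	return work_on_cpu(cpu, my_read_counter, NULL);	/* -EINVAL if offline */
}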
+#endif /* CONFIG_SMP */
+
+void __init init_workqueues(void)
+{
+ cpu_populated_map = cpu_online_map;
+ singlethread_cpu = first_cpu(cpu_possible_map);
+ cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
+ hotcpu_notifier(workqueue_cpu_callback, 0);
+ keventd_wq = create_workqueue("events");
+ BUG_ON(!keventd_wq);
+}