diff options
51 files changed, 595 insertions, 457 deletions
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index d882f80..dcff4d0 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -21,7 +21,7 @@ Description: these states. What: /sys/power/disk -Date: August 2006 +Date: September 2006 Contact: Rafael J. Wysocki <rjw@sisk.pl> Description: The /sys/power/disk file controls the operating mode of the @@ -39,6 +39,19 @@ Description: 'reboot' - the memory image will be saved by the kernel and the system will be rebooted. + Additionally, /sys/power/disk can be used to turn on one of the + two testing modes of the suspend-to-disk mechanism: 'testproc' + or 'test'. If the suspend-to-disk mechanism is in the + 'testproc' mode, writing 'disk' to /sys/power/state will cause + the kernel to disable nonboot CPUs and freeze tasks, wait for 5 + seconds, unfreeze tasks and enable nonboot CPUs. If it is in + the 'test' mode, writing 'disk' to /sys/power/state will cause + the kernel to disable nonboot CPUs and freeze tasks, shrink + memory, suspend devices, wait for 5 seconds, resume devices, + unfreeze tasks and enable nonboot CPUs. Then, we are able to + look in the log messages and work out, for example, which code + is being slow and which device drivers are misbehaving. + The suspend-to-disk method may be chosen by writing to this file one of the accepted strings: @@ -46,6 +59,8 @@ Description: 'platform' 'shutdown' 'reboot' + 'testproc' + 'test' It will only change to 'firmware' or 'platform' if the system supports that. diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 3bf5086..db9499a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -9,7 +9,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \ kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ procfs-guide.xml writing_usb_driver.xml \ - kernel-api.xml filesystems.xml journal-api.xml lsm.xml usb.xml \ + kernel-api.xml filesystems.xml lsm.xml usb.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl index 4785032..39fa2ab 100644 --- a/Documentation/DocBook/filesystems.tmpl +++ b/Documentation/DocBook/filesystems.tmpl @@ -98,4 +98,304 @@ </sect1> </chapter> + <chapter id="LinuxJDBAPI"> + <chapterinfo> + <title>The Linux Journalling API</title> + + <authorgroup> + <author> + <firstname>Roger</firstname> + <surname>Gammans</surname> + <affiliation> + <address> + <email>rgammans@computer-surgery.co.uk</email> + </address> + </affiliation> + </author> + </authorgroup> + + <authorgroup> + <author> + <firstname>Stephen</firstname> + <surname>Tweedie</surname> + <affiliation> + <address> + <email>sct@redhat.com</email> + </address> + </affiliation> + </author> + </authorgroup> + + <copyright> + <year>2002</year> + <holder>Roger Gammans</holder> + </copyright> + </chapterinfo> + + <title>The Linux Journalling API</title> + + <sect1> + <title>Overview</title> + <sect2> + <title>Details</title> +<para> +The journalling layer is easy to use. You need to +first of all create a journal_t data structure. There are +two calls to do this dependent on how you decide to allocate the physical +media on which the journal resides. The journal_init_inode() call +is for journals stored in filesystem inodes, or the journal_init_dev() +call can be use for journal stored on a raw device (in a continuous range +of blocks). A journal_t is a typedef for a struct pointer, so when +you are finally finished make sure you call journal_destroy() on it +to free up any used kernel memory. +</para> + +<para> +Once you have got your journal_t object you need to 'mount' or load the journal +file, unless of course you haven't initialised it yet - in which case you +need to call journal_create(). +</para> + +<para> +Most of the time however your journal file will already have been created, but +before you load it you must call journal_wipe() to empty the journal file. +Hang on, you say , what if the filesystem wasn't cleanly umount()'d . Well, it is the +job of the client file system to detect this and skip the call to journal_wipe(). +</para> + +<para> +In either case the next call should be to journal_load() which prepares the +journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery() +for you if it detects any outstanding transactions in the journal and similarly +journal_load() will call journal_recover() if necessary. +I would advise reading fs/ext3/super.c for examples on this stage. +[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly +complicate the API. Or isn't a good idea for the journal layer to hide +dirty mounts from the client fs] +</para> + +<para> +Now you can go ahead and start modifying the underlying +filesystem. Almost. +</para> + +<para> + +You still need to actually journal your filesystem changes, this +is done by wrapping them into transactions. Additionally you +also need to wrap the modification of each of the buffers +with calls to the journal layer, so it knows what the modifications +you are actually making are. To do this use journal_start() which +returns a transaction handle. +</para> + +<para> +journal_start() +and its counterpart journal_stop(), which indicates the end of a transaction +are nestable calls, so you can reenter a transaction if necessary, +but remember you must call journal_stop() the same number of times as +journal_start() before the transaction is completed (or more accurately +leaves the update phase). Ext3/VFS makes use of this feature to simplify +quota support. +</para> + +<para> +Inside each transaction you need to wrap the modifications to the +individual buffers (blocks). Before you start to modify a buffer you +need to call journal_get_{create,write,undo}_access() as appropriate, +this allows the journalling layer to copy the unmodified data if it +needs to. After all the buffer may be part of a previously uncommitted +transaction. +At this point you are at last ready to modify a buffer, and once +you are have done so you need to call journal_dirty_{meta,}data(). +Or if you've asked for access to a buffer you now know is now longer +required to be pushed back on the device you can call journal_forget() +in much the same way as you might have used bforget() in the past. +</para> + +<para> +A journal_flush() may be called at any time to commit and checkpoint +all your transactions. +</para> + +<para> +Then at umount time , in your put_super() (2.4) or write_super() (2.5) +you can then call journal_destroy() to clean up your in-core journal object. +</para> + +<para> +Unfortunately there a couple of ways the journal layer can cause a deadlock. +The first thing to note is that each task can only have +a single outstanding transaction at any one time, remember nothing +commits until the outermost journal_stop(). This means +you must complete the transaction at the end of each file/inode/address +etc. operation you perform, so that the journalling system isn't re-entered +on another journal. Since transactions can't be nested/batched +across differing journals, and another filesystem other than +yours (say ext3) may be modified in a later syscall. +</para> + +<para> +The second case to bear in mind is that journal_start() can +block if there isn't enough space in the journal for your transaction +(based on the passed nblocks param) - when it blocks it merely(!) needs to +wait for transactions to complete and be committed from other tasks, +so essentially we are waiting for journal_stop(). So to avoid +deadlocks you must treat journal_start/stop() as if they +were semaphores and include them in your semaphore ordering rules to prevent +deadlocks. Note that journal_extend() has similar blocking behaviour to +journal_start() so you can deadlock here just as easily as on journal_start(). +</para> + +<para> +Try to reserve the right number of blocks the first time. ;-). This will +be the maximum number of blocks you are going to touch in this transaction. +I advise having a look at at least ext3_jbd.h to see the basis on which +ext3 uses to make these decisions. +</para> + +<para> +Another wriggle to watch out for is your on-disk block allocation strategy. +why? Because, if you undo a delete, you need to ensure you haven't reused any +of the freed blocks in a later transaction. One simple way of doing this +is make sure any blocks you allocate only have checkpointed transactions +listed against them. Ext3 does this in ext3_test_allocatable(). +</para> + +<para> +Lock is also providing through journal_{un,}lock_updates(), +ext3 uses this when it wants a window with a clean and stable fs for a moment. +eg. +</para> + +<programlisting> + + journal_lock_updates() //stop new stuff happening.. + journal_flush() // checkpoint everything. + ..do stuff on stable fs + journal_unlock_updates() // carry on with filesystem use. +</programlisting> + +<para> +The opportunities for abuse and DOS attacks with this should be obvious, +if you allow unprivileged userspace to trigger codepaths containing these +calls. +</para> + +<para> +A new feature of jbd since 2.5.25 is commit callbacks with the new +journal_callback_set() function you can now ask the journalling layer +to call you back when the transaction is finally committed to disk, so that +you can do some of your own management. The key to this is the journal_callback +struct, this maintains the internal callback information but you can +extend it like this:- +</para> +<programlisting> + struct myfs_callback_s { + //Data structure element required by jbd.. + struct journal_callback for_jbd; + // Stuff for myfs allocated together. + myfs_inode* i_commited; + + } +</programlisting> + +<para> +this would be useful if you needed to know when data was committed to a +particular inode. +</para> + + </sect2> + + <sect2> + <title>Summary</title> +<para> +Using the journal is a matter of wrapping the different context changes, +being each mount, each modification (transaction) and each changed buffer +to tell the journalling layer about them. +</para> + +<para> +Here is a some pseudo code to give you an idea of how it works, as +an example. +</para> + +<programlisting> + journal_t* my_jnrl = journal_create(); + journal_init_{dev,inode}(jnrl,...) + if (clean) journal_wipe(); + journal_load(); + + foreach(transaction) { /*transactions must be + completed before + a syscall returns to + userspace*/ + + handle_t * xct=journal_start(my_jnrl); + foreach(bh) { + journal_get_{create,write,undo}_access(xact,bh); + if ( myfs_modify(bh) ) { /* returns true + if makes changes */ + journal_dirty_{meta,}data(xact,bh); + } else { + journal_forget(bh); + } + } + journal_stop(xct); + } + journal_destroy(my_jrnl); +</programlisting> + </sect2> + + </sect1> + + <sect1> + <title>Data Types</title> + <para> + The journalling layer uses typedefs to 'hide' the concrete definitions + of the structures used. As a client of the JBD layer you can + just rely on the using the pointer as a magic cookie of some sort. + + Obviously the hiding is not enforced as this is 'C'. + </para> + <sect2><title>Structures</title> +!Iinclude/linux/jbd.h + </sect2> + </sect1> + + <sect1> + <title>Functions</title> + <para> + The functions here are split into two groups those that + affect a journal as a whole, and those which are used to + manage transactions + </para> + <sect2><title>Journal Level</title> +!Efs/jbd/journal.c +!Ifs/jbd/recovery.c + </sect2> + <sect2><title>Transasction Level</title> +!Efs/jbd/transaction.c + </sect2> + </sect1> + <sect1> + <title>See also</title> + <para> + <citation> + <ulink url="ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz"> + Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie + </ulink> + </citation> + </para> + <para> + <citation> + <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html"> + Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie + </ulink> + </citation> + </para> + </sect1> + + </chapter> + </book> diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl deleted file mode 100644 index 2077f9a..0000000 --- a/Documentation/DocBook/journal-api.tmpl +++ /dev/null @@ -1,333 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" - "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> - -<book id="LinuxJBDAPI"> - <bookinfo> - <title>The Linux Journalling API</title> - <authorgroup> - <author> - <firstname>Roger</firstname> - <surname>Gammans</surname> - <affiliation> - <address> - <email>rgammans@computer-surgery.co.uk</email> - </address> - </affiliation> - </author> - </authorgroup> - - <authorgroup> - <author> - <firstname>Stephen</firstname> - <surname>Tweedie</surname> - <affiliation> - <address> - <email>sct@redhat.com</email> - </address> - </affiliation> - </author> - </authorgroup> - - <copyright> - <year>2002</year> - <holder>Roger Gammans</holder> - </copyright> - -<legalnotice> - <para> - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - </para> - - <para> - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - </para> - - <para> - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - </para> - - <para> - For more details see the file COPYING in the source - distribution of Linux. - </para> - </legalnotice> - </bookinfo> - -<toc></toc> - - <chapter id="Overview"> - <title>Overview</title> - <sect1> - <title>Details</title> -<para> -The journalling layer is easy to use. You need to -first of all create a journal_t data structure. There are -two calls to do this dependent on how you decide to allocate the physical -media on which the journal resides. The journal_init_inode() call -is for journals stored in filesystem inodes, or the journal_init_dev() -call can be use for journal stored on a raw device (in a continuous range -of blocks). A journal_t is a typedef for a struct pointer, so when -you are finally finished make sure you call journal_destroy() on it -to free up any used kernel memory. -</para> - -<para> -Once you have got your journal_t object you need to 'mount' or load the journal -file, unless of course you haven't initialised it yet - in which case you -need to call journal_create(). -</para> - -<para> -Most of the time however your journal file will already have been created, but -before you load it you must call journal_wipe() to empty the journal file. -Hang on, you say , what if the filesystem wasn't cleanly umount()'d . Well, it is the -job of the client file system to detect this and skip the call to journal_wipe(). -</para> - -<para> -In either case the next call should be to journal_load() which prepares the -journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery() -for you if it detects any outstanding transactions in the journal and similarly -journal_load() will call journal_recover() if necessary. -I would advise reading fs/ext3/super.c for examples on this stage. -[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly -complicate the API. Or isn't a good idea for the journal layer to hide -dirty mounts from the client fs] -</para> - -<para> -Now you can go ahead and start modifying the underlying -filesystem. Almost. -</para> - - -<para> - -You still need to actually journal your filesystem changes, this -is done by wrapping them into transactions. Additionally you -also need to wrap the modification of each of the buffers -with calls to the journal layer, so it knows what the modifications -you are actually making are. To do this use journal_start() which -returns a transaction handle. -</para> - -<para> -journal_start() -and its counterpart journal_stop(), which indicates the end of a transaction -are nestable calls, so you can reenter a transaction if necessary, -but remember you must call journal_stop() the same number of times as -journal_start() before the transaction is completed (or more accurately -leaves the update phase). Ext3/VFS makes use of this feature to simplify -quota support. -</para> - -<para> -Inside each transaction you need to wrap the modifications to the -individual buffers (blocks). Before you start to modify a buffer you -need to call journal_get_{create,write,undo}_access() as appropriate, -this allows the journalling layer to copy the unmodified data if it -needs to. After all the buffer may be part of a previously uncommitted -transaction. -At this point you are at last ready to modify a buffer, and once -you are have done so you need to call journal_dirty_{meta,}data(). -Or if you've asked for access to a buffer you now know is now longer -required to be pushed back on the device you can call journal_forget() -in much the same way as you might have used bforget() in the past. -</para> - -<para> -A journal_flush() may be called at any time to commit and checkpoint -all your transactions. -</para> - -<para> -Then at umount time , in your put_super() (2.4) or write_super() (2.5) -you can then call journal_destroy() to clean up your in-core journal object. -</para> - - -<para> -Unfortunately there a couple of ways the journal layer can cause a deadlock. -The first thing to note is that each task can only have -a single outstanding transaction at any one time, remember nothing -commits until the outermost journal_stop(). This means -you must complete the transaction at the end of each file/inode/address -etc. operation you perform, so that the journalling system isn't re-entered -on another journal. Since transactions can't be nested/batched -across differing journals, and another filesystem other than -yours (say ext3) may be modified in a later syscall. -</para> - -<para> -The second case to bear in mind is that journal_start() can -block if there isn't enough space in the journal for your transaction -(based on the passed nblocks param) - when it blocks it merely(!) needs to -wait for transactions to complete and be committed from other tasks, -so essentially we are waiting for journal_stop(). So to avoid -deadlocks you must treat journal_start/stop() as if they -were semaphores and include them in your semaphore ordering rules to prevent -deadlocks. Note that journal_extend() has similar blocking behaviour to -journal_start() so you can deadlock here just as easily as on journal_start(). -</para> - -<para> -Try to reserve the right number of blocks the first time. ;-). This will -be the maximum number of blocks you are going to touch in this transaction. -I advise having a look at at least ext3_jbd.h to see the basis on which -ext3 uses to make these decisions. -</para> - -<para> -Another wriggle to watch out for is your on-disk block allocation strategy. -why? Because, if you undo a delete, you need to ensure you haven't reused any -of the freed blocks in a later transaction. One simple way of doing this -is make sure any blocks you allocate only have checkpointed transactions -listed against them. Ext3 does this in ext3_test_allocatable(). -</para> - -<para> -Lock is also providing through journal_{un,}lock_updates(), -ext3 uses this when it wants a window with a clean and stable fs for a moment. -eg. -</para> - -<programlisting> - - journal_lock_updates() //stop new stuff happening.. - journal_flush() // checkpoint everything. - ..do stuff on stable fs - journal_unlock_updates() // carry on with filesystem use. -</programlisting> - -<para> -The opportunities for abuse and DOS attacks with this should be obvious, -if you allow unprivileged userspace to trigger codepaths containing these -calls. -</para> - -<para> -A new feature of jbd since 2.5.25 is commit callbacks with the new -journal_callback_set() function you can now ask the journalling layer -to call you back when the transaction is finally committed to disk, so that -you can do some of your own management. The key to this is the journal_callback -struct, this maintains the internal callback information but you can -extend it like this:- -</para> -<programlisting> - struct myfs_callback_s { - //Data structure element required by jbd.. - struct journal_callback for_jbd; - // Stuff for myfs allocated together. - myfs_inode* i_commited; - - } -</programlisting> - -<para> -this would be useful if you needed to know when data was committed to a -particular inode. -</para> - -</sect1> - -<sect1> -<title>Summary</title> -<para> -Using the journal is a matter of wrapping the different context changes, -being each mount, each modification (transaction) and each changed buffer -to tell the journalling layer about them. -</para> - -<para> -Here is a some pseudo code to give you an idea of how it works, as -an example. -</para> - -<programlisting> - journal_t* my_jnrl = journal_create(); - journal_init_{dev,inode}(jnrl,...) - if (clean) journal_wipe(); - journal_load(); - - foreach(transaction) { /*transactions must be - completed before - a syscall returns to - userspace*/ - - handle_t * xct=journal_start(my_jnrl); - foreach(bh) { - journal_get_{create,write,undo}_access(xact,bh); - if ( myfs_modify(bh) ) { /* returns true - if makes changes */ - journal_dirty_{meta,}data(xact,bh); - } else { - journal_forget(bh); - } - } - journal_stop(xct); - } - journal_destroy(my_jrnl); -</programlisting> -</sect1> - -</chapter> - - <chapter id="adt"> - <title>Data Types</title> - <para> - The journalling layer uses typedefs to 'hide' the concrete definitions - of the structures used. As a client of the JBD layer you can - just rely on the using the pointer as a magic cookie of some sort. - - Obviously the hiding is not enforced as this is 'C'. - </para> - <sect1><title>Structures</title> -!Iinclude/linux/jbd.h - </sect1> -</chapter> - - <chapter id="calls"> - <title>Functions</title> - <para> - The functions here are split into two groups those that - affect a journal as a whole, and those which are used to - manage transactions -</para> - <sect1><title>Journal Level</title> -!Efs/jbd/journal.c -!Ifs/jbd/recovery.c - </sect1> - <sect1><title>Transasction Level</title> -!Efs/jbd/transaction.c - </sect1> -</chapter> -<chapter> - <title>See also</title> - <para> - <citation> - <ulink url="ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz"> - Journaling the Linux ext2fs Filesystem,LinuxExpo 98, Stephen Tweedie - </ulink> - </citation> - </para> - <para> - <citation> - <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html"> - Ext3 Journalling FileSystem , OLS 2000, Dr. Stephen Tweedie - </ulink> - </citation> - </para> -</chapter> - -</book> diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index b11792a..bf2b0e2 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -49,7 +49,7 @@ __u64 stime, utime; } /* Maximum size of response requested or message sent */ -#define MAX_MSG_SIZE 256 +#define MAX_MSG_SIZE 1024 /* Maximum number of cpus expected to be specified in a cpumask */ #define MAX_CPUS 32 /* Maximum length of pathname to log file */ diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index c65233d..284e7e1 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -17,7 +17,7 @@ are: special place-holders for where the extracted documentation should go. -- scripts/docproc.c +- scripts/basic/docproc.c This is a program for converting SGML template files into SGML files. When a file is referenced it is searched for symbols diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index a66bec2..74311d7 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -30,6 +30,17 @@ testing). The system will support either 'firmware' or 'platform', and that is known a priori. But, the user may choose 'shutdown' or 'reboot' as alternatives. +Additionally, /sys/power/disk can be used to turn on one of the two testing +modes of the suspend-to-disk mechanism: 'testproc' or 'test'. If the +suspend-to-disk mechanism is in the 'testproc' mode, writing 'disk' to +/sys/power/state will cause the kernel to disable nonboot CPUs and freeze +tasks, wait for 5 seconds, unfreeze tasks and enable nonboot CPUs. If it is +in the 'test' mode, writing 'disk' to /sys/power/state will cause the kernel +to disable nonboot CPUs and freeze tasks, shrink memory, suspend devices, wait +for 5 seconds, resume devices, unfreeze tasks and enable nonboot CPUs. Then, +we are able to look in the log messages and work out, for example, which code +is being slow and which device drivers are misbehaving. + Reading from this file will display what the mode is currently set to. Writing to this file will accept one of @@ -37,6 +48,8 @@ to. Writing to this file will accept one of 'platform' 'shutdown' 'reboot' + 'testproc' + 'test' It will only change to 'firmware' or 'platform' if the system supports it. diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index ab974ff..22e4c466 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -70,7 +70,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return #define PREFIX "ACPI: " -int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ +int acpi_noirq; /* skip ACPI IRQ initialization */ int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ int acpi_ht __initdata = 1; /* enable HT */ diff --git a/arch/um/include/sysdep-i386/barrier.h b/arch/um/include/sysdep-i386/barrier.h new file mode 100644 index 0000000..b58d52c --- /dev/null +++ b/arch/um/include/sysdep-i386/barrier.h @@ -0,0 +1,9 @@ +#ifndef __SYSDEP_I386_BARRIER_H +#define __SYSDEP_I386_BARRIER_H + +/* Copied from include/asm-i386 for use by userspace. i386 has the option + * of using mfence, but I'm just using this, which works everywhere, for now. + */ +#define mb() asm volatile("lock; addl $0,0(%esp)") + +#endif diff --git a/arch/um/include/sysdep-x86_64/barrier.h b/arch/um/include/sysdep-x86_64/barrier.h new file mode 100644 index 0000000..7b610be --- /dev/null +++ b/arch/um/include/sysdep-x86_64/barrier.h @@ -0,0 +1,7 @@ +#ifndef __SYSDEP_X86_64_BARRIER_H +#define __SYSDEP_X86_64_BARRIER_H + +/* Copied from include/asm-x86_64 for use by userspace. */ +#define mb() asm volatile("mfence":::"memory") + +#endif diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 51f0893..c692a19 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -7,7 +7,6 @@ #include <stdio.h> #include <errno.h> #include <signal.h> -#include <linux/unistd.h> #include <sys/mman.h> #include <sys/wait.h> #include <sys/mman.h> diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 6b81739..b897e85 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -15,6 +15,7 @@ #include "user.h" #include "signal_kern.h" #include "sysdep/sigcontext.h" +#include "sysdep/barrier.h" #include "sigcontext.h" #include "mode.h" #include "os.h" @@ -34,8 +35,12 @@ #define SIGALRM_BIT 2 #define SIGALRM_MASK (1 << SIGALRM_BIT) -static int signals_enabled = 1; -static int pending = 0; +/* These are used by both the signal handlers and + * block/unblock_signals. I don't want modifications cached in a + * register - they must go straight to memory. + */ +static volatile int signals_enabled = 1; +static volatile int pending = 0; void sig_handler(int sig, struct sigcontext *sc) { @@ -152,6 +157,12 @@ int change_sig(int signal, int on) void block_signals(void) { signals_enabled = 0; + /* This must return with signals disabled, so this barrier + * ensures that writes are flushed out before the return. + * This might matter if gcc figures out how to inline this and + * decides to shuffle this code into the caller. + */ + mb(); } void unblock_signals(void) @@ -171,9 +182,23 @@ void unblock_signals(void) */ signals_enabled = 1; + /* Setting signals_enabled and reading pending must + * happen in this order. + */ + mb(); + save_pending = pending; - if(save_pending == 0) + if(save_pending == 0){ + /* This must return with signals enabled, so + * this barrier ensures that writes are + * flushed out before the return. This might + * matter if gcc figures out how to inline + * this (unlikely, given its size) and decides + * to shuffle this code into the caller. + */ + mb(); return; + } pending = 0; diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index cb9ab54..9b34fe6 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -14,7 +14,7 @@ #include <sys/mman.h> #include <sys/user.h> #include <sys/time.h> -#include <asm/unistd.h> +#include <sys/syscall.h> #include <asm/types.h> #include "user.h" #include "sysdep/ptrace.h" diff --git a/arch/um/os-Linux/tls.c b/arch/um/os-Linux/tls.c index 9f7999f..16215b9 100644 --- a/arch/um/os-Linux/tls.c +++ b/arch/um/os-Linux/tls.c @@ -1,7 +1,7 @@ #include <errno.h> +#include <unistd.h> #include <sys/ptrace.h> #include <sys/syscall.h> -#include <unistd.h> #include <asm/ldt.h> #include "sysdep/tls.h" #include "uml-config.h" diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index c7b1dac..9eaee66 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3075,11 +3075,12 @@ end_io: if (maxsector) { sector_t sector = bio->bi_sector; - if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { + if (maxsector < nr_sectors || + maxsector - nr_sectors < sector) { /* - * This may well happen - partitions are not checked - * to make sure they are within the size of the - * whole device. + * This may well happen - partitions are not + * checked to make sure they are within the size + * of the whole device. */ handle_bad_sector(bio); goto end_io; diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index e5cfb1f..157fa81 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1867,7 +1867,7 @@ static int ipmi_pci_resume(struct pci_dev *pdev) static struct pci_device_id ipmi_pci_devices[] = { { PCI_DEVICE(PCI_HP_VENDOR_ID, PCI_MMC_DEVICE_ID) }, - { PCI_DEVICE_CLASS(PCI_ERMC_CLASSCODE, PCI_ERMC_CLASSCODE) } + { PCI_DEVICE_CLASS(PCI_ERMC_CLASSCODE, PCI_ERMC_CLASSCODE_MASK) } }; MODULE_DEVICE_TABLE(pci, ipmi_pci_devices); diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 4bde30b..75e9e38 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -230,34 +230,43 @@ static struct kobj_type ktype_memctrl = { */ static int edac_sysfs_memctrl_setup(void) { - int err=0; + int err = 0; debugf1("%s()\n", __func__); /* create the /sys/devices/system/edac directory */ err = sysdev_class_register(&edac_class); - if (!err) { - /* Init the MC's kobject */ - memset(&edac_memctrl_kobj, 0, sizeof (edac_memctrl_kobj)); - edac_memctrl_kobj.parent = &edac_class.kset.kobj; - edac_memctrl_kobj.ktype = &ktype_memctrl; + if (err) { + debugf1("%s() error=%d\n", __func__, err); + return err; + } - /* generate sysfs "..../edac/mc" */ - err = kobject_set_name(&edac_memctrl_kobj,"mc"); + /* Init the MC's kobject */ + memset(&edac_memctrl_kobj, 0, sizeof (edac_memctrl_kobj)); + edac_memctrl_kobj.parent = &edac_class.kset.kobj; + edac_memctrl_kobj.ktype = &ktype_memctrl; - if (!err) { - /* FIXME: maybe new sysdev_create_subdir() */ - err = kobject_register(&edac_memctrl_kobj); + /* generate sysfs "..../edac/mc" */ + err = kobject_set_name(&edac_memctrl_kobj,"mc"); - if (err) - debugf1("Failed to register '.../edac/mc'\n"); - else - debugf1("Registered '.../edac/mc' kobject\n"); - } - } else - debugf1("%s() error=%d\n", __func__, err); + if (err) + goto fail; + + /* FIXME: maybe new sysdev_create_subdir() */ + err = kobject_register(&edac_memctrl_kobj); + + if (err) { + debugf1("Failed to register '.../edac/mc'\n"); + goto fail; + } + debugf1("Registered '.../edac/mc' kobject\n"); + + return 0; + +fail: + sysdev_class_unregister(&edac_class); return err; } diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c index 2b0ea8b..753fe0e 100644 --- a/drivers/ide/pci/amd74xx.c +++ b/drivers/ide/pci/amd74xx.c @@ -75,6 +75,7 @@ static struct amd_ide_chip { { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, 0x50, AMD_UDMA_133 }, { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, 0x50, AMD_UDMA_133 }, { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, 0x50, AMD_UDMA_133 }, + { PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, 0x50, AMD_UDMA_133 }, { PCI_DEVICE_ID_AMD_CS5536_IDE, 0x40, AMD_UDMA_100 }, { 0 } }; @@ -491,7 +492,8 @@ static ide_pci_device_t amd74xx_chipsets[] __devinitdata = { /* 16 */ DECLARE_NV_DEV("NFORCE-MCP55"), /* 17 */ DECLARE_NV_DEV("NFORCE-MCP61"), /* 18 */ DECLARE_NV_DEV("NFORCE-MCP65"), - /* 19 */ DECLARE_AMD_DEV("AMD5536"), + /* 19 */ DECLARE_NV_DEV("NFORCE-MCP67"), + /* 20 */ DECLARE_AMD_DEV("AMD5536"), }; static int __devinit amd74xx_probe(struct pci_dev *dev, const struct pci_device_id *id) @@ -530,7 +532,8 @@ static struct pci_device_id amd74xx_pci_tbl[] = { { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 16 }, { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 17 }, { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 18 }, - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 19 }, + { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 19 }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 20 }, { 0, }, }; MODULE_DEVICE_TABLE(pci, amd74xx_pci_tbl); diff --git a/drivers/isdn/hysdn/hysdn_sched.c b/drivers/isdn/hysdn/hysdn_sched.c index 1fadf01..1875877 100644 --- a/drivers/isdn/hysdn/hysdn_sched.c +++ b/drivers/isdn/hysdn/hysdn_sched.c @@ -155,21 +155,17 @@ hysdn_tx_cfgline(hysdn_card *card, unsigned char *line, unsigned short chan) if (card->debug_flags & LOG_SCHED_ASYN) hysdn_addlog(card, "async tx-cfg chan=%d len=%d", chan, strlen(line) + 1); - spin_lock_irqsave(&card->hysdn_lock, flags); while (card->async_busy) { - sti(); if (card->debug_flags & LOG_SCHED_ASYN) hysdn_addlog(card, "async tx-cfg delayed"); msleep_interruptible(20); /* Timeout 20ms */ - if (!--cnt) { - spin_unlock_irqrestore(&card->hysdn_lock, flags); + if (!--cnt) return (-ERR_ASYNC_TIME); /* timed out */ - } - cli(); } /* wait for buffer to become free */ + spin_lock_irqsave(&card->hysdn_lock, flags); strcpy(card->async_data, line); card->async_len = strlen(line) + 1; card->async_channel = chan; @@ -177,30 +173,23 @@ hysdn_tx_cfgline(hysdn_card *card, unsigned char *line, unsigned short chan) /* now queue the task */ schedule_work(&card->irq_queue); - sti(); + spin_unlock_irqrestore(&card->hysdn_lock, flags); if (card->debug_flags & LOG_SCHED_ASYN) hysdn_addlog(card, "async tx-cfg data queued"); cnt++; /* short delay */ - cli(); while (card->async_busy) { - sti(); if (card->debug_flags & LOG_SCHED_ASYN) hysdn_addlog(card, "async tx-cfg waiting for tx-ready"); msleep_interruptible(20); /* Timeout 20ms */ - if (!--cnt) { - spin_unlock_irqrestore(&card->hysdn_lock, flags); + if (!--cnt) return (-ERR_ASYNC_TIME); /* timed out */ - } - cli(); } /* wait for buffer to become free again */ - spin_unlock_irqrestore(&card->hysdn_lock, flags); - if (card->debug_flags & LOG_SCHED_ASYN) hysdn_addlog(card, "async tx-cfg data send"); diff --git a/drivers/md/md.c b/drivers/md/md.c index 50ab4a9..d111356 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3200,6 +3200,7 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); + kobject_uevent(&mddev->gendisk->kobj, KOBJ_ONLINE); return 0; } @@ -3313,6 +3314,7 @@ static int do_md_stop(mddev_t * mddev, int mode) module_put(mddev->pers->owner); mddev->pers = NULL; + kobject_uevent(&mddev->gendisk->kobj, KOBJ_OFFLINE); if (mddev->ro) mddev->ro = 0; } diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index bbdba7b..46a9c35 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -44,12 +44,14 @@ */ #include <linux/kernel.h> +#include <linux/fs.h> #include <linux/module.h> +#include <linux/buffer_head.h> #include <linux/kprobes.h> -#include <linux/kallsyms.h> +#include <linux/list.h> #include <linux/init.h> -#include <linux/irq.h> #include <linux/interrupt.h> +#include <linux/hrtimer.h> #include <scsi/scsi_cmnd.h> #ifdef CONFIG_IDE @@ -116,16 +118,16 @@ static enum ctype cptype = NONE; static int count = DEFAULT_COUNT; module_param(recur_count, int, 0644); -MODULE_PARM_DESC(recur_count, "Recurcion level for the stack overflow test,\ - default is 10"); +MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test, "\ + "default is 10"); module_param(cpoint_name, charp, 0644); -MODULE_PARM_DESC(cpoint_name, "Crash Point, where kernel is to be crashed"); -module_param(cpoint_type, charp, 06444); -MODULE_PARM_DESC(cpoint_type, "Crash Point Type, action to be taken on\ - hitting the crash point"); -module_param(cpoint_count, int, 06444); -MODULE_PARM_DESC(cpoint_count, "Crash Point Count, number of times the \ - crash point is to be hit to trigger action"); +MODULE_PARM_DESC(cpoint_name, " Crash Point, where kernel is to be crashed"); +module_param(cpoint_type, charp, 0644); +MODULE_PARM_DESC(cpoint_type, " Crash Point Type, action to be taken on "\ + "hitting the crash point"); +module_param(cpoint_count, int, 0644); +MODULE_PARM_DESC(cpoint_count, " Crash Point Count, number of times the "\ + "crash point is to be hit to trigger action"); unsigned int jp_do_irq(unsigned int irq) { diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 146298a..c3c0626 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -281,7 +281,6 @@ spi_register_board_info(struct spi_board_info const *info, unsigned n) up(&board_lock); return 0; } -EXPORT_SYMBOL_GPL(spi_register_board_info); /* FIXME someone should add support for a __setup("spi", ...) that * creates board info from kernel command lines diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 976a691..7e056b9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1806,13 +1806,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } if ((rc < 0) || (smb_read_data == NULL)) { cFYI(1, ("Read error in readpages: %d", rc)); - /* clean up remaing pages off list */ - while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, struct page, - lru); - list_del(&page->lru); - page_cache_release(page); - } break; } else if (bytes_read > 0) { pSMBr = (struct smb_com_read_rsp *)smb_read_data; @@ -1831,13 +1824,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, this case is ok - if we are at server EOF we will hit it on next read */ - /* while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, - struct page, list); - list_del(&page->list); - page_cache_release(page); - } - break; */ + /* break; */ } } else { cFYI(1, ("No bytes read (%d) at offset %lld . " @@ -1845,14 +1832,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, bytes_read, offset)); /* BB turn off caching and do new lookup on file size at server? */ - while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, struct page, - lru); - list_del(&page->lru); - - /* BB removeme - replace with zero of page? */ - page_cache_release(page); - } break; } if (smb_read_data) { diff --git a/fs/compat.c b/fs/compat.c index 50624d4..8d0a001 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1835,9 +1835,12 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); - if (ret == 0 && tsp && !(current->personality & STICKY_TIMEOUTS)) { + if (tsp) { struct compat_timespec rts; + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + rts.tv_sec = timeout / HZ; rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); if (rts.tv_nsec >= NSEC_PER_SEC) { @@ -1846,8 +1849,19 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; - if (copy_to_user(tsp, &rts, sizeof(rts))) - ret = -EFAULT; + if (copy_to_user(tsp, &rts, sizeof(rts))) { +sticky: + /* + * If an application puts its timeval in read-only + * memory, we don't want the Linux-specific update to + * the timeval to cause a fault after the select has + * completed successfully. However, because we're not + * updating the timeval, we can't restart the system + * call. + */ + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + } } if (ret == -ERESTARTNOHAND) { diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f49f105..136175a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -134,7 +134,7 @@ int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, algified_name_len = (chaining_modifier_len + cipher_name_len + 3); (*algified_name) = kmalloc(algified_name_len, GFP_KERNEL); - if (!(algified_name)) { + if (!(*algified_name)) { rc = -ENOMEM; goto out; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2bb5ace..763a50d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -397,14 +397,14 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, err = -EIO; if (is_bad_inode(inode)) - goto clean_pages_up; + goto out; data.file = file; data.inode = inode; data.req = fuse_get_req(fc); err = PTR_ERR(data.req); if (IS_ERR(data.req)) - goto clean_pages_up; + goto out; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { @@ -413,10 +413,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, else fuse_put_request(fc, data.req); } - return err; - -clean_pages_up: - put_pages_list(pages); +out: return err; } diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 8d5963c..015640b 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -337,13 +337,6 @@ out: out_noerror: ret = 0; out_unlock: - /* unlock all pages, we can't do any I/O right now */ - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); - unlock_page(page); - page_cache_release(page); - } if (do_unlock) gfs2_holder_uninit(&gh); goto out; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e9d0770..81b8565 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -274,7 +274,7 @@ nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry) * any regular files anyway, just in case the directory was created by * a kernel from the future.... */ nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file); - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); status = vfs_rmdir(dir->d_inode, dentry); mutex_unlock(&dir->d_inode->i_mutex); return status; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 9041802..1724999 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1619,6 +1619,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "jmacd-8: reiserfs_fill_super: unable to read bitmap"); goto error; } + errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK SWARN(silent, s, "CONFIG_REISERFS_CHECK is set ON"); SWARN(silent, s, "- it is slow mode for debugging."); @@ -48,14 +48,21 @@ xattr_permission(struct inode *inode, const char *name, int mask) return 0; /* - * The trusted.* namespace can only accessed by a privilegued user. + * The trusted.* namespace can only be accessed by a privileged user. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); + /* In user.* namespace, only regular files and directories can have + * extended attributes. For sticky directories, only the owner and + * privileged user can write attributes. + */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { - if (!S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -EPERM; + if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && + (mask & MAY_WRITE) && (current->fsuid != inode->i_uid) && + !capable(CAP_FOWNER)) return -EPERM; } diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h index eac85ce..c6a0318 100644 --- a/include/asm-powerpc/systbl.h +++ b/include/asm-powerpc/systbl.h @@ -261,7 +261,7 @@ SYSX(sys_ni_syscall, ppc_fadvise64_64, ppc_fadvise64_64) PPC_SYS_SPU(rtas) OLDSYS(debug_setcontext) SYSCALL(ni_syscall) -SYSCALL(ni_syscall) +COMPAT_SYS(migrate_pages) COMPAT_SYS(mbind) COMPAT_SYS(get_mempolicy) COMPAT_SYS(set_mempolicy) diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h index 464a48c..b5fe932 100644 --- a/include/asm-powerpc/unistd.h +++ b/include/asm-powerpc/unistd.h @@ -276,7 +276,7 @@ #define __NR_rtas 255 #define __NR_sys_debug_setcontext 256 /* Number 257 is reserved for vserver */ -/* 258 currently unused */ +#define __NR_migrate_pages 258 #define __NR_mbind 259 #define __NR_get_mempolicy 260 #define __NR_set_mempolicy 261 diff --git a/include/linux/compat.h b/include/linux/compat.h index f155319..80b17f4 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -230,5 +230,9 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int compat_printk(const char *fmt, ...); extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80f39ca..24b6111 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -171,6 +171,8 @@ __attribute_const__ roundup_pow_of_two(unsigned long x) extern int printk_ratelimit(void); extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst); +extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msec); static inline void console_silent(void) { diff --git a/include/linux/pm.h b/include/linux/pm.h index 6b27e07..070394e 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -116,7 +116,9 @@ typedef int __bitwise suspend_disk_method_t; #define PM_DISK_PLATFORM ((__force suspend_disk_method_t) 2) #define PM_DISK_SHUTDOWN ((__force suspend_disk_method_t) 3) #define PM_DISK_REBOOT ((__force suspend_disk_method_t) 4) -#define PM_DISK_MAX ((__force suspend_disk_method_t) 5) +#define PM_DISK_TEST ((__force suspend_disk_method_t) 5) +#define PM_DISK_TESTPROC ((__force suspend_disk_method_t) 6) +#define PM_DISK_MAX ((__force suspend_disk_method_t) 7) struct pm_ops { suspend_disk_method_t pm_disk_mode; diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h index 61eef50..28967ed 100644 --- a/include/linux/ufs_fs.h +++ b/include/linux/ufs_fs.h @@ -908,7 +908,7 @@ struct ufs_super_block_third { __fs64 fs_csaddr; /* blk addr of cyl grp summary area */ __fs64 fs_pendingblocks;/* blocks in process of being freed */ __fs32 fs_pendinginodes;/*inodes in process of being freed */ - } fs_u2; + } __attribute__ ((packed)) fs_u2; } fs_un1; union { struct { @@ -124,6 +124,7 @@ void msg_exit_ns(struct ipc_namespace *ns) } mutex_unlock(&msg_ids(ns).mutex); + ipc_fini_ids(ns->ids[IPC_MSG_IDS]); kfree(ns->ids[IPC_MSG_IDS]); ns->ids[IPC_MSG_IDS] = NULL; } @@ -161,6 +161,7 @@ void sem_exit_ns(struct ipc_namespace *ns) } mutex_unlock(&sem_ids(ns).mutex); + ipc_fini_ids(ns->ids[IPC_SEM_IDS]); kfree(ns->ids[IPC_SEM_IDS]); ns->ids[IPC_SEM_IDS] = NULL; } @@ -116,6 +116,7 @@ void shm_exit_ns(struct ipc_namespace *ns) } mutex_unlock(&shm_ids(ns).mutex); + ipc_fini_ids(ns->ids[IPC_SHM_IDS]); kfree(ns->ids[IPC_SHM_IDS]); ns->ids[IPC_SHM_IDS] = NULL; } @@ -301,7 +301,7 @@ static int grow_ary(struct ipc_ids* ids, int newsize) */ rcu_assign_pointer(ids->entries, new); - ipc_rcu_putref(old); + __ipc_fini_ids(ids, old); return newsize; } @@ -83,6 +83,18 @@ void* ipc_rcu_alloc(int size); void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); +static inline void __ipc_fini_ids(struct ipc_ids *ids, + struct ipc_id_ary *entries) +{ + if (entries != &ids->nullentry) + ipc_rcu_putref(entries); +} + +static inline void ipc_fini_ids(struct ipc_ids *ids) +{ + __ipc_fini_ids(ids, ids->entries); +} + struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id); struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id); void ipc_lock_by_ptr(struct kern_ipc_perm *ipcp); diff --git a/kernel/compat.c b/kernel/compat.c index d4898aa..6952dd0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } + +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, + const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return sys_migrate_pages(pid, nr_bits + 1, old, new); +} #endif diff --git a/kernel/futex.c b/kernel/futex.c index b364e00..93ef30b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,6 +1507,13 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + static unsigned long printk_interval; + + if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { + printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " + "will be removed from the kernel in June 2007\n", + current->comm); + } ret = -EINVAL; if (!valid_signal(signal)) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d3a158a..b1fb786 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -71,7 +71,7 @@ static inline void platform_finish(void) static int prepare_processes(void) { - int error; + int error = 0; pm_prepare_console(); @@ -84,6 +84,12 @@ static int prepare_processes(void) goto thaw; } + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto thaw; + } + /* Free memory before shutting down devices. */ if (!(error = swsusp_shrink_memory())) return 0; @@ -120,13 +126,21 @@ int pm_suspend_disk(void) if (error) return error; + if (pm_disk_mode == PM_DISK_TESTPROC) + goto Thaw; + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { resume_console(); printk("Some devices failed to suspend\n"); - unprepare_processes(); - return error; + goto Thaw; + } + + if (pm_disk_mode == PM_DISK_TEST) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Done; } pr_debug("PM: snapshotting memory.\n"); @@ -143,16 +157,17 @@ int pm_suspend_disk(void) power_down(pm_disk_mode); else { swsusp_free(); - unprepare_processes(); - return error; + goto Thaw; } - } else + } else { pr_debug("PM: Image restored successfully.\n"); + } swsusp_free(); Done: device_resume(); resume_console(); + Thaw: unprepare_processes(); return error; } @@ -249,6 +264,8 @@ static const char * const pm_disk_modes[] = { [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", [PM_DISK_REBOOT] = "reboot", + [PM_DISK_TEST] = "test", + [PM_DISK_TESTPROC] = "testproc", }; /** @@ -303,17 +320,19 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) } } if (mode) { - if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) + if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT || + mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) { pm_disk_mode = mode; - else { + } else { if (pm_ops && pm_ops->enter && (mode == pm_ops->pm_disk_mode)) pm_disk_mode = mode; else error = -EINVAL; } - } else + } else { error = -EINVAL; + } pr_debug("PM: suspend-to-disk mode set to '%s'\n", pm_disk_modes[mode]); diff --git a/kernel/printk.c b/kernel/printk.c index f7d427e..6642655 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/syscalls.h> +#include <linux/jiffies.h> #include <asm/uaccess.h> @@ -1101,3 +1102,23 @@ int printk_ratelimit(void) printk_ratelimit_burst); } EXPORT_SYMBOL(printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. + */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { + *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0e53314..d7306d0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,7 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); +cond_syscall(compat_sys_migrate_pages); /* block-layer dependent */ cond_syscall(sys_bdflush); diff --git a/mm/migrate.c b/mm/migrate.c index ba2453f..b4979d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -952,7 +952,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, goto out; pm[i].node = node; - } + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ } /* End marker */ pm[nr_pages].node = MAX_NUMNODES; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b55bb35..bf2f6cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -853,7 +853,7 @@ again: pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); if (!pcp->count) { - pcp->count += rmqueue_bulk(zone, 0, + pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (unlikely(!pcp->count)) goto failed; diff --git a/mm/readahead.c b/mm/readahead.c index 1ba736a..23cb61a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -173,6 +173,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + /* Clean up the remaining pages */ + put_pages_list(pages); goto out; } @@ -883,7 +883,7 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + per_cpu(reap_node, cpu) = node; } static void next_reap_node(void) diff --git a/scripts/basic/docproc.c b/scripts/basic/docproc.c index 4ab6cbf..d6071cb 100644 --- a/scripts/basic/docproc.c +++ b/scripts/basic/docproc.c @@ -250,7 +250,7 @@ void intfunc(char * filename) { docfunctions(filename, NOFUNCTION); } void extfunc(char * filename) { docfunctions(filename, FUNCTION); } /* - * Document spåecific function(s) in a file. + * Document specific function(s) in a file. * Call kernel-doc with the following parameters: * kernel-doc -docbook -function function1 [-function function2] */ |