From 5070437cd99511f69ae561f2ab417142a47a85ec Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 21 Oct 2010 17:55:01 +0200 Subject: power_supply: Add gpio charger driver This patch adds a simple driver for chargers indicating their online status through a GPIO pin. Signed-off-by: Lars-Peter Clausen Signed-off-by: Anton Vorontsov --- include/linux/power/gpio-charger.h | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 include/linux/power/gpio-charger.h (limited to 'include') diff --git a/include/linux/power/gpio-charger.h b/include/linux/power/gpio-charger.h new file mode 100644 index 0000000..de1dfe0 --- /dev/null +++ b/include/linux/power/gpio-charger.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2010, Lars-Peter Clausen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LINUX_POWER_GPIO_CHARGER_H__ +#define __LINUX_POWER_GPIO_CHARGER_H__ + +#include +#include + +/** + * struct gpio_charger_platform_data - platform_data for gpio_charger devices + * @name: Name for the chargers power_supply device + * @type: Type of the charger + * @gpio: GPIO which is used to indicate the chargers status + * @gpio_active_low: Should be set to 1 if the GPIO is active low otherwise 0 + * @supplied_to: Array of battery names to which this chargers supplies power + * @num_supplicants: Number of entries in the supplied_to array + */ +struct gpio_charger_platform_data { + const char *name; + enum power_supply_type type; + + int gpio; + int gpio_active_low; + + char **supplied_to; + size_t num_supplicants; +}; + +#endif -- cgit v1.1 From 5b275ce27077d6463ca28c9671dce7c2c1f622e2 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Nov 2010 15:27:29 +0000 Subject: thermal: make ops constant And while touching that function definition do something about the disaster of formatting there. Signed-off-by: Alan Cox Acked-by: Zhang Rui Signed-off-by: Len Brown --- include/linux/thermal.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1de8b9eb..0662690 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -77,7 +77,7 @@ struct thermal_cooling_device { char type[THERMAL_NAME_LENGTH]; struct device device; void *devdata; - struct thermal_cooling_device_ops *ops; + const struct thermal_cooling_device_ops *ops; struct list_head node; }; @@ -114,7 +114,7 @@ struct thermal_zone_device { int last_temperature; bool passive; unsigned int forced_passive; - struct thermal_zone_device_ops *ops; + const struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; struct mutex lock; /* protect cooling devices list */ @@ -129,11 +129,8 @@ struct thermal_zone_device { }; struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, - struct - thermal_zone_device_ops - *, int tc1, int tc2, - int passive_freq, - int polling_freq); + const struct thermal_zone_device_ops *, int tc1, int tc2, + int passive_freq, int polling_freq); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, @@ -142,9 +139,7 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, - struct - thermal_cooling_device_ops - *); + const struct thermal_cooling_device_ops *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); #endif /* __THERMAL_H__ */ -- cgit v1.1 From 3e3198f1adda8e0fbd499bde806781237d6c841f Mon Sep 17 00:00:00 2001 From: Roman Tereshonkov Date: Wed, 3 Nov 2010 12:55:19 +0200 Subject: mtd: onenand: add option and variable for cache program feature A new option is added for cache program feature. A new variable ongoing is introduced for onenand_chip to handle the multi-command cache program operation. Signed-off-by: Roman Tereshonkov Acked-by: Kyungmin Park Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/onenand.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 0c8815b..6da3fe3 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -137,6 +137,14 @@ struct onenand_chip { void *bbm; void *priv; + + /* + * Shows that the current operation is composed + * of sequence of commands. For example, cache program. + * Such command status OnGo bit is checked at the end of + * sequence. + */ + unsigned int ongoing; }; /* @@ -171,6 +179,9 @@ struct onenand_chip { #define ONENAND_IS_2PLANE(this) (0) #endif +#define ONENAND_IS_CACHE_PROGRAM(this) \ + (this->options & ONENAND_HAS_CACHE_PROGRAM) + /* Check byte access in OneNAND */ #define ONENAND_CHECK_BYTE_ACCESS(addr) (addr & 0x1) @@ -181,6 +192,7 @@ struct onenand_chip { #define ONENAND_HAS_UNLOCK_ALL (0x0002) #define ONENAND_HAS_2PLANE (0x0004) #define ONENAND_HAS_4KB_PAGE (0x0008) +#define ONENAND_HAS_CACHE_PROGRAM (0x0010) #define ONENAND_SKIP_UNLOCK_CHECK (0x0100) #define ONENAND_PAGEBUF_ALLOC (0x1000) #define ONENAND_OOBBUF_ALLOC (0x2000) -- cgit v1.1 From 65ab220c30bc2120eaa39753cadec68616e3f906 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 8 Nov 2010 11:17:54 +0530 Subject: mtd: fsmc.h: including linux/io.h for definition of readl Signed-off-by: Viresh Kumar Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/fsmc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index 5d25567..e210b87 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -16,6 +16,7 @@ #ifndef __MTD_FSMC_H #define __MTD_FSMC_H +#include #include #include #include -- cgit v1.1 From cc31822250236ec173bb2aa149ebe2ba35405db2 Mon Sep 17 00:00:00 2001 From: Guillaume LECERF Date: Wed, 17 Nov 2010 12:35:50 +0100 Subject: mtd: cfi_fixup: remove unused 'param' parameter The 'param' parameter has never been used since its introduction, so simply remove it. Signed-off-by: Guillaume LECERF Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/cfi.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 4dd0c2c..a9baee6 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -527,8 +527,7 @@ struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t s struct cfi_fixup { uint16_t mfr; uint16_t id; - void (*fixup)(struct mtd_info *mtd, void* param); - void* param; + void (*fixup)(struct mtd_info *mtd); }; #define CFI_MFR_ANY 0xFFFF -- cgit v1.1 From 1534b8b09757190ce6e97fa97f9ad77c49082cd8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Nov 2010 15:02:21 -0800 Subject: mtd: fix nand kernel-doc warnings Warning(include/linux/mtd/nand.h:543): No description found for parameter 'badblockbits' Warning(drivers/mtd/nand/nand_bbt.c:1101): No description found for parameter 'mtd' Signed-off-by: Randy Dunlap Cc: David Woodhouse Cc: linux-mtd@lists.infradead.org Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/nand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 63e17d0..1f489b2 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -448,6 +448,8 @@ struct nand_buffers { * See the defines for further explanation. * @badblockpos: [INTERN] position of the bad block marker in the oob * area. + * @badblockbits: [INTERN] number of bits to left-shift the bad block + * number * @cellinfo: [INTERN] MLC/multichip data from chip ident * @numchips: [INTERN] number of physical chips * @chipsize: [INTERN] the size of one chip for multichip arrays -- cgit v1.1 From a7e93dcd9aacb3ef4acfcc4310577f3ae0741821 Mon Sep 17 00:00:00 2001 From: Roman Tereshonkov Date: Tue, 23 Nov 2010 14:17:17 +0200 Subject: mtd: fix master device identification for mtd repartition Function mtd_has_master renamed as mtd_is_partition to follow the function logic. The patch fixes the problem of checking the right mtd device for partition creation. To delete partition checking is not needed here so as it is done in mtd_del_partition. By master we consider the mtd device which does not belong to any partition. Signed-off-by: Roman Tereshonkov Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/partitions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 2b54316..4a0a8ba 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -89,7 +89,7 @@ static inline int mtd_has_cmdlinepart(void) { return 1; } static inline int mtd_has_cmdlinepart(void) { return 0; } #endif -int mtd_is_master(struct mtd_info *mtd); +int mtd_is_partition(struct mtd_info *mtd); int mtd_add_partition(struct mtd_info *master, char *name, long long offset, long long length); int mtd_del_partition(struct mtd_info *master, int partno); -- cgit v1.1 From 593cd8711221c9661dbf9beb2fb42fecca03e693 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 29 Nov 2010 13:52:19 +0100 Subject: mtd: FSMC NAND use the PrimeCell identifier macros The FSMC actually has a standard ARM PrimeCell ID register, and the "revision" part of that register contains the thing that the code is looking at. Reuse the infrastructure from the AMBA bus abstraction and rid local defines. Signed-off-by: Linus Walleij Signed-off-by: David Woodhouse --- include/linux/mtd/fsmc.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index e210b87..96e8e67 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -115,25 +115,6 @@ struct fsmc_regs { #define FSMC_THOLD_4 (4 << 16) #define FSMC_THIZ_1 (1 << 24) -/* peripid2 register definitions */ -#define FSMC_REVISION_MSK (0xf) -#define FSMC_REVISION_SHFT (0x4) - -#define FSMC_VER1 1 -#define FSMC_VER2 2 -#define FSMC_VER3 3 -#define FSMC_VER4 4 -#define FSMC_VER5 5 -#define FSMC_VER6 6 -#define FSMC_VER7 7 -#define FSMC_VER8 8 - -static inline uint32_t get_fsmc_version(struct fsmc_regs *regs) -{ - return (readl(®s->peripid2) >> FSMC_REVISION_SHFT) & - FSMC_REVISION_MSK; -} - /* * There are 13 bytes of ecc for every 512 byte block in FSMC version 8 * and it has to be read consecutively and immediately after the 512 -- cgit v1.1 From b5602e86432aaf0cc90dd207bf74e3a2bfb5078b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 29 Nov 2010 13:52:27 +0100 Subject: mtd: FSMC NAND fix obvious speling errors Fix spelling in the interface file. Signed-off-by: Linus Walleij Signed-off-by: David Woodhouse --- include/linux/mtd/fsmc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/fsmc.h b/include/linux/mtd/fsmc.h index 96e8e67..6987995 100644 --- a/include/linux/mtd/fsmc.h +++ b/include/linux/mtd/fsmc.h @@ -28,7 +28,7 @@ /* * The placement of the Command Latch Enable (CLE) and - * Address Latch Enable (ALE) is twised around in the + * Address Latch Enable (ALE) is twisted around in the * SPEAR310 implementation. */ #if defined(CONFIG_MACH_SPEAR310) @@ -63,7 +63,7 @@ struct fsmc_nor_bank_regs { /* ctrl_tim register definitions */ -struct fsms_nand_bank_regs { +struct fsmc_nand_bank_regs { uint32_t pc; uint32_t sts; uint32_t comm; @@ -79,7 +79,7 @@ struct fsms_nand_bank_regs { struct fsmc_regs { struct fsmc_nor_bank_regs nor_bank_regs[FSMC_MAX_NOR_BANKS]; uint8_t reserved_1[0x40 - 0x20]; - struct fsms_nand_bank_regs bank_regs[FSMC_MAX_NAND_BANKS]; + struct fsmc_nand_bank_regs bank_regs[FSMC_MAX_NAND_BANKS]; uint8_t reserved_2[0xfe0 - 0xc0]; uint32_t peripid0; /* 0xfe0 */ uint32_t peripid1; /* 0xfe4 */ -- cgit v1.1 From c9aa308fd5c373faeda588cfb02b04f116904613 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 7 Dec 2010 10:22:29 +0800 Subject: Add CPER PCIe error section structure and constants definition On some machine, PCIe error is reported via APEI (ACPI Platform Error Interface). The error data is passed from firmware to Linux via CPER PCIe error section structure. This patch adds CPER PCIe error section structure and constants definition. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- include/linux/cper.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index bf972f8..3104aaf 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -39,10 +39,12 @@ * Severity difinition for error_severity in struct cper_record_header * and section_severity in struct cper_section_descriptor */ -#define CPER_SEV_RECOVERABLE 0x0 -#define CPER_SEV_FATAL 0x1 -#define CPER_SEV_CORRECTED 0x2 -#define CPER_SEV_INFORMATIONAL 0x3 +enum { + CPER_SEV_RECOVERABLE, + CPER_SEV_FATAL, + CPER_SEV_CORRECTED, + CPER_SEV_INFORMATIONAL, +}; /* * Validation bits difinition for validation_bits in struct @@ -201,6 +203,47 @@ UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ 0xDF, 0xAA, 0x84, 0xEC) +#define CPER_PROC_VALID_TYPE 0x0001 +#define CPER_PROC_VALID_ISA 0x0002 +#define CPER_PROC_VALID_ERROR_TYPE 0x0004 +#define CPER_PROC_VALID_OPERATION 0x0008 +#define CPER_PROC_VALID_FLAGS 0x0010 +#define CPER_PROC_VALID_LEVEL 0x0020 +#define CPER_PROC_VALID_VERSION 0x0040 +#define CPER_PROC_VALID_BRAND_INFO 0x0080 +#define CPER_PROC_VALID_ID 0x0100 +#define CPER_PROC_VALID_TARGET_ADDRESS 0x0200 +#define CPER_PROC_VALID_REQUESTOR_ID 0x0400 +#define CPER_PROC_VALID_RESPONDER_ID 0x0800 +#define CPER_PROC_VALID_IP 0x1000 + +#define CPER_MEM_VALID_ERROR_STATUS 0x0001 +#define CPER_MEM_VALID_PHYSICAL_ADDRESS 0x0002 +#define CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK 0x0004 +#define CPER_MEM_VALID_NODE 0x0008 +#define CPER_MEM_VALID_CARD 0x0010 +#define CPER_MEM_VALID_MODULE 0x0020 +#define CPER_MEM_VALID_BANK 0x0040 +#define CPER_MEM_VALID_DEVICE 0x0080 +#define CPER_MEM_VALID_ROW 0x0100 +#define CPER_MEM_VALID_COLUMN 0x0200 +#define CPER_MEM_VALID_BIT_POSITION 0x0400 +#define CPER_MEM_VALID_REQUESTOR_ID 0x0800 +#define CPER_MEM_VALID_RESPONDER_ID 0x1000 +#define CPER_MEM_VALID_TARGET_ID 0x2000 +#define CPER_MEM_VALID_ERROR_TYPE 0x4000 + +#define CPER_PCIE_VALID_PORT_TYPE 0x0001 +#define CPER_PCIE_VALID_VERSION 0x0002 +#define CPER_PCIE_VALID_COMMAND_STATUS 0x0004 +#define CPER_PCIE_VALID_DEVICE_ID 0x0008 +#define CPER_PCIE_VALID_SERIAL_NUMBER 0x0010 +#define CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS 0x0020 +#define CPER_PCIE_VALID_CAPABILITY 0x0040 +#define CPER_PCIE_VALID_AER_INFO 0x0080 + +#define CPER_PCIE_SLOT_SHIFT 3 + /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS @@ -306,6 +349,41 @@ struct cper_sec_mem_err { __u8 error_type; }; +struct cper_sec_pcie { + __u64 validation_bits; + __u32 port_type; + struct { + __u8 minor; + __u8 major; + __u8 reserved[2]; + } version; + __u16 command; + __u16 status; + __u32 reserved; + struct { + __u16 vendor_id; + __u16 device_id; + __u8 class_code[3]; + __u8 function; + __u8 device; + __u16 segment; + __u8 bus; + __u8 secondary_bus; + __u16 slot; + __u8 reserved; + } device_id; + struct { + __u32 lower; + __u32 upper; + } serial_number; + struct { + __u16 secondary_status; + __u16 control; + } bridge; + __u8 capability[60]; + __u8 aer_info[96]; +}; + /* Reset to default packing */ #pragma pack() -- cgit v1.1 From 16f4232ce4d6855361b4eb56262f4a202295c978 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Wed, 8 Dec 2010 10:10:16 +0800 Subject: IPMI: Add one interface to get more info of low-level IPMI device The IPMI smi_watcher will be used to catch the IPMI interface as they come or go. In order to communicate with the correct IPMI device, it should be confirmed whether it is what we wanted especially on the system with multiple IPMI devices. But the new_smi callback function of smi_watcher provides very limited info(only the interface number and dev pointer) and there is no detailed info about the low level interface. For example: which mechansim registers the IPMI interface(ACPI, PCI, DMI and so on). This is to add one interface that can get more info of low-level IPMI device. For example: the ACPI device handle will be returned for the pnp_acpi IPMI device. Signed-off-by: Zhao Yakui Signed-off-by: Corey Minyard Signed-off-by: Len Brown --- include/linux/ipmi.h | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/ipmi_smi.h | 8 ++++++++ 2 files changed, 46 insertions(+) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 65aae34..045f2f2 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -454,6 +454,44 @@ unsigned int ipmi_addr_length(int addr_type); /* Validate that the given IPMI address is valid. */ int ipmi_validate_addr(struct ipmi_addr *addr, int len); +/* + * How did the IPMI driver find out about the device? + */ +enum ipmi_addr_src { + SI_INVALID = 0, SI_HOTMOD, SI_HARDCODED, SI_SPMI, SI_ACPI, SI_SMBIOS, + SI_PCI, SI_DEVICETREE, SI_DEFAULT +}; + +union ipmi_smi_info_union { + /* + * the acpi_info element is defined for the SI_ACPI + * address type + */ + struct { + void *acpi_handle; + } acpi_info; +}; + +struct ipmi_smi_info { + enum ipmi_addr_src addr_src; + + /* + * Base device for the interface. Don't forget to put this when + * you are done. + */ + struct device *dev; + + /* + * The addr_info provides more detailed info for some IPMI + * devices, depending on the addr_src. Currently only SI_ACPI + * info is provided. + */ + union ipmi_smi_info_union addr_info; +}; + +/* This is to get the private info of ipmi_smi_t */ +extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); + #endif /* __KERNEL__ */ diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 4b48318..906590a 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -39,6 +39,7 @@ #include #include #include +#include /* This files describes the interface for IPMI system management interface drivers to bind into the IPMI message handler. */ @@ -86,6 +87,13 @@ struct ipmi_smi_handlers { int (*start_processing)(void *send_info, ipmi_smi_t new_intf); + /* + * Get the detailed private info of the low level interface and store + * it into the structure of ipmi_smi_data. For example: the + * ACPI device handle will be returned for the pnp_acpi IPMI device. + */ + int (*get_smi_info)(void *send_info, struct ipmi_smi_info *data); + /* Called to enqueue an SMI message to be sent. This operation is not allowed to fail. If an error occurs, it should report back the error in a received message. It may -- cgit v1.1 From 7c96aef75949a56ec427fc6a2522dace2af33605 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Nov 2010 11:27:01 +1100 Subject: sunrpc: remove xpt_pool The xpt_pool field is only used for reporting BUGs. And it isn't used correctly. In particular, when it is cleared in svc_xprt_received before XPT_BUSY is cleared, there is no guarantee that either the compiler or the CPU might not re-order to two assignments, just setting xpt_pool to NULL after XPT_BUSY is cleared. If a different cpu were running svc_xprt_enqueue at this moment, it might see XPT_BUSY clear and then xpt_pool non-NULL, and so BUG. This could be fixed by calling smp_mb__before_clear_bit() before the clear_bit. However as xpt_pool isn't really used, it seems safest to simply remove xpt_pool. Another alternate would be to change the clear_bit to clear_bit_unlock, and the test_and_set_bit to test_and_set_bit_lock. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index aea0d438..c8f81da 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -63,7 +63,6 @@ struct svc_xprt { #define XPT_LISTENER 11 /* listening endpoint */ #define XPT_CACHE_AUTH 12 /* cache auth info */ - struct svc_pool *xpt_pool; /* current pool iff queued */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ struct mutex xpt_mutex; /* to serialize sending data */ -- cgit v1.1 From 04f4ad16b231abbfde34c762697ad035a3af0b5f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Dec 2010 09:51:13 -0500 Subject: nfsd4: implement secinfo_no_name Implementation of this operation is mandatory for NFSv4.1. Signed-off-by: J. Bruce Fields --- include/linux/nfs4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4925b22..26afa30 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -136,6 +136,9 @@ #define SEQ4_STATUS_CB_PATH_DOWN_SESSION 0x00000200 #define SEQ4_STATUS_BACKCHANNEL_FAULT 0x00000400 +#define NFS4_SECINFO_STYLE4_CURRENT_FH 0 +#define NFS4_SECINFO_STYLE4_PARENT 1 + #define NFS4_MAX_UINT64 (~(u64)0) /* An NFS4 sessions server must support at least NFS4_MAX_OPS operations. -- cgit v1.1 From c66ae9bb4dcaac78cc5e30d0ce7ff2bf3dcb48d9 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Mon, 13 Dec 2010 12:26:21 +0200 Subject: s3c_adc_battery: Add gpio_inverted field to pdata Add support for inverted gpio_charge_finished values. This change is necessary for H1940 support. Signed-off-by: Vasily Khoruzhick Signed-off-by: Anton Vorontsov --- include/linux/s3c_adc_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/s3c_adc_battery.h b/include/linux/s3c_adc_battery.h index dbce22f..fbe58b7 100644 --- a/include/linux/s3c_adc_battery.h +++ b/include/linux/s3c_adc_battery.h @@ -14,6 +14,7 @@ struct s3c_adc_bat_pdata { void (*disable_charger)(void); int gpio_charge_finished; + int gpio_inverted; const struct s3c_adc_bat_thresh *lut_noac; unsigned int lut_noac_cnt; -- cgit v1.1 From 00aaaef9a51a1a25c5d6d52ce510772f149a0eb0 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Nov 2010 15:46:54 +0800 Subject: PCI: MSI: Move MSI-X entry definition to pci_regs.h Then it can be used by others. Reviewed-by: Hidetoshi Seto Reviewed-by: Matthew Wilcox Signed-off-by: Sheng Yang Signed-off-by: Jesse Barnes --- include/linux/pci_regs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index af83076..b21d33e 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -309,6 +309,13 @@ #define PCI_MSIX_PBA 8 #define PCI_MSIX_FLAGS_BIRMASK (7 << 0) +/* MSI-X entry's format */ +#define PCI_MSIX_ENTRY_SIZE 16 +#define PCI_MSIX_ENTRY_LOWER_ADDR 0 +#define PCI_MSIX_ENTRY_UPPER_ADDR 4 +#define PCI_MSIX_ENTRY_DATA 8 +#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 + /* CompactPCI Hotswap Register */ #define PCI_CHSWP_CSR 2 /* Control and Status Register */ -- cgit v1.1 From 8d805286968811223cca002134ba3d81244d5313 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Nov 2010 15:46:55 +0800 Subject: PCI: Add mask bit definition for MSI-X table Then we can use it instead of magic number 1. Reviewed-by: Hidetoshi Seto Cc: Matthew Wilcox Signed-off-by: Sheng Yang Signed-off-by: Jesse Barnes --- include/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index b21d33e..d4f2c80 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -315,6 +315,7 @@ #define PCI_MSIX_ENTRY_UPPER_ADDR 4 #define PCI_MSIX_ENTRY_DATA 8 #define PCI_MSIX_ENTRY_VECTOR_CTRL 12 +#define PCI_MSIX_ENTRY_CTRL_MASKBIT 1 /* CompactPCI Hotswap Register */ -- cgit v1.1 From 2f671e2dbff6eb5ef4e2600adbec550c13b8fe72 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 6 Dec 2010 14:00:56 -0500 Subject: PCI: Disable ASPM if BIOS asks us to We currently refuse to touch the ASPM registers if the BIOS tells us that ASPM isn't supported. This can cause problems if the BIOS has (for any reason) enabled ASPM on some devices anyway. Change the code such that we explicitly clear ASPM if the FADT indicates that ASPM isn't supported, and make sure we tidy up appropriately on device removal in order to deal with the hotplug case. If ASPM is disabled because the BIOS doesn't hand over control then we won't touch the registers. Signed-off-by: Matthew Garrett Signed-off-by: Jesse Barnes --- include/linux/pci-aspm.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h index 91ba0b3..ce681051 100644 --- a/include/linux/pci-aspm.h +++ b/include/linux/pci-aspm.h @@ -27,6 +27,7 @@ extern void pcie_aspm_init_link_state(struct pci_dev *pdev); extern void pcie_aspm_exit_link_state(struct pci_dev *pdev); extern void pcie_aspm_pm_state_change(struct pci_dev *pdev); extern void pci_disable_link_state(struct pci_dev *pdev, int state); +extern void pcie_clear_aspm(void); extern void pcie_no_aspm(void); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) @@ -41,7 +42,9 @@ static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) static inline void pci_disable_link_state(struct pci_dev *pdev, int state) { } - +static inline void pcie_clear_aspm(void) +{ +} static inline void pcie_no_aspm(void) { } -- cgit v1.1 From 1d3c16a818e992c199844954d95c17fd7ce6cbba Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Tue, 30 Nov 2010 17:43:26 -0600 Subject: PCI: make pci_restore_state return void pci_restore_state only ever returns 0, thus there is no benefit in having it return any value. Also, a large majority of the callers do not check the return code of pci_restore_state. Make the pci_restore_state a void return and avoid the overhead. Acked-by: Mauro Carvalho Chehab Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- include/linux/pci.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 7454408..63cbadc 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -806,7 +806,7 @@ size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size); /* Power management related routines */ int pci_save_state(struct pci_dev *dev); -int pci_restore_state(struct pci_dev *dev); +void pci_restore_state(struct pci_dev *dev); int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); @@ -1168,10 +1168,8 @@ static inline int pci_save_state(struct pci_dev *dev) return 0; } -static inline int pci_restore_state(struct pci_dev *dev) -{ - return 0; -} +static inline void pci_restore_state(struct pci_dev *dev) +{ } static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state) { -- cgit v1.1 From 9b444b36fee16d2aaae9cc91ce594ecb15d922a9 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Wed, 17 Nov 2010 12:12:08 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Patsburg This patch adds an additional LPC Controller DeviceID for the Intel Patsburg PCH. Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- include/linux/pci_ids.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb845c16..d830106 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2468,7 +2468,8 @@ #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN 0x1c41 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX 0x1c5f #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS 0x1d22 -#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC 0x1d40 +#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0 0x1d40 +#define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1 0x1d41 #define PCI_DEVICE_ID_INTEL_82801AA_0 0x2410 #define PCI_DEVICE_ID_INTEL_82801AA_1 0x2411 #define PCI_DEVICE_ID_INTEL_82801AA_3 0x2413 -- cgit v1.1 From fe31e69740eddc7316071ed5165fed6703c8cd12 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 19 Dec 2010 15:57:16 +0100 Subject: PCI/PCIe: Clear Root PME Status bits early during system resume I noticed that PCI Express PMEs don't work on my Toshiba Portege R500 after the system has been woken up from a sleep state by a PME (through Wake-on-LAN). After some investigation it turned out that the BIOS didn't clear the Root PME Status bit in the root port that received the wakeup PME and since the Requester ID was also set in the port's Root Status register, any subsequent PMEs didn't trigger interrupts. This problem can be avoided by clearing the Root PME Status bits in all PCI Express root ports during early resume. For this purpose, add an early resume routine to the PCIe port driver and make this driver be always registered, even if pci_ports_disable is set (in which case the driver's only function is to provide the early resume callback). Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- include/linux/pci_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d4f2c80..5b7e6b1 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -504,6 +504,8 @@ #define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ #define PCI_EXP_RTSTA 32 /* Root Status */ +#define PCI_EXP_RTSTA_PME 0x10000 /* PME status */ +#define PCI_EXP_RTSTA_PENDING 0x20000 /* PME pending */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ #define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ -- cgit v1.1 From bdd5f05d91e8ae68075b812ce244c918d3d752cd Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 4 Jan 2011 13:31:45 -0500 Subject: SUNRPC: Remove more code when NFSD_DEPRECATED is not configured Signed-off-by: NeilBrown [bfields@redhat.com: moved svcauth_unix_purge outside ifdef's.] Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 6950c98..c2aebe8 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -255,10 +255,13 @@ static inline time_t get_expiry(char **bpp) return rv - boot.tv_sec; } +#ifdef CONFIG_NFSD_DEPRECATED static inline void sunrpc_invalidate(struct cache_head *h, struct cache_detail *detail) { h->expiry_time = seconds_since_boot() - 1; detail->nextcheck = seconds_since_boot(); } +#endif /* CONFIG_NFSD_DEPRECATED */ + #endif /* _LINUX_SUNRPC_CACHE_H_ */ -- cgit v1.1 From 9e701c610923aaeac8b38b9202a686d1cc9ee35d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 2 Jan 2011 21:56:36 -0500 Subject: svcrpc: simpler request dropping Currently we use -EAGAIN returns to determine when to drop a deferred request. On its own, that is error-prone, as it makes us treat -EAGAIN returns from other functions specially to prevent inadvertent dropping. So, use a flag on the request instead. Returning an error on request deferral is still required, to prevent further processing, but we no longer need worry that an error return on its own could result in a drop. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5a3085b..d45c482 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -269,6 +269,7 @@ struct svc_rqst { struct cache_req rq_chandle; /* handle passed to caches for * request delaying */ + bool rq_dropme; /* Catering to nfsd */ struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ -- cgit v1.1 From c45821d263a8a5109d69a9e8942b8d65bcd5f31a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 31 Oct 2010 00:04:44 -0400 Subject: locks: eliminate fl_mylease callback The nfs server only supports read delegations for now, so we don't care how conflicts are determined. All we care is that unlocks are recognized as matching the leases they are meant to remove. After the last patch, a comparison of struct files will work for that purpose. So we no longer need this callback. Signed-off-by: J. Bruce Fields --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 090f0ea..73ce69f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1059,7 +1059,6 @@ struct lock_manager_operations { int (*fl_grant)(struct file_lock *, struct file_lock *, int); void (*fl_release_private)(struct file_lock *); void (*fl_break)(struct file_lock *); - int (*fl_mylease)(struct file_lock *, struct file_lock *); int (*fl_change)(struct file_lock **, int); }; -- cgit v1.1 From 2ca72e17e5acb1052c35c9faba609c2289ce7a92 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 4 Jan 2011 17:37:15 -0500 Subject: nfsd4: move idmap and acl header files into fs/nfsd These are internal nfsd interfaces. Signed-off-by: J. Bruce Fields --- include/linux/nfs4_acl.h | 61 ------------------------------------------- include/linux/nfsd_idmap.h | 64 ---------------------------------------------- 2 files changed, 125 deletions(-) delete mode 100644 include/linux/nfs4_acl.h delete mode 100644 include/linux/nfsd_idmap.h (limited to 'include') diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h deleted file mode 100644 index c9c05a7..0000000 --- a/include/linux/nfs4_acl.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * include/linux/nfs4_acl.c - * - * Common NFSv4 ACL handling definitions. - * - * Copyright (c) 2002 The Regents of the University of Michigan. - * All rights reserved. - * - * Marius Aamodt Eriksen - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef LINUX_NFS4_ACL_H -#define LINUX_NFS4_ACL_H - -#include - -/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to - * fit in a page: */ -#define NFS4_ACL_MAX 170 - -struct nfs4_acl *nfs4_acl_new(int); -int nfs4_acl_get_whotype(char *, u32); -int nfs4_acl_write_who(int who, char *p); -int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, - uid_t who, u32 mask); - -#define NFS4_ACL_TYPE_DEFAULT 0x01 -#define NFS4_ACL_DIR 0x02 -#define NFS4_ACL_OWNER 0x04 - -struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, - struct posix_acl *, unsigned int flags); -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, - struct posix_acl **, unsigned int flags); - -#endif /* LINUX_NFS4_ACL_H */ diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h deleted file mode 100644 index d4a2ac1..0000000 --- a/include/linux/nfsd_idmap.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * include/linux/nfsd_idmap.h - * - * Mapping of UID to name and vice versa. - * - * Copyright (c) 2002, 2003 The Regents of the University of - * Michigan. All rights reserved. -> * - * Marius Aamodt Eriksen - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef LINUX_NFSD_IDMAP_H -#define LINUX_NFSD_IDMAP_H - -#include -#include - -/* XXX from linux/nfs_idmap.h */ -#define IDMAP_NAMESZ 128 - -#ifdef CONFIG_NFSD_V4 -int nfsd_idmap_init(void); -void nfsd_idmap_shutdown(void); -#else -static inline int nfsd_idmap_init(void) -{ - return 0; -} -static inline void nfsd_idmap_shutdown(void) -{ -} -#endif - -int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); - -#endif /* LINUX_NFSD_IDMAP_H */ -- cgit v1.1 From 91aa5fadb831e7b6ea473a526a6b49c6dc4819ce Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:31:04 +0000 Subject: ARM: PL08x: fix atomic_t usage and tx_submit() return value range The last_issued variable uses an atomic type, which is only incremented inside a protected region, and then read. Everywhere else only reads the value, so it isn't using atomic_t correctly, and it doesn't even need to. Moreover, the DMA engine code provides us with a variable for this already - chan.cookie. Use chan.cookie instead. Also, avoid negative dma_cookie_t values - negative returns from tx_submit() mean failure, yet in reality we always succeed. Restart from cookie 1, just like other DMA engine drivers do. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 521a0f8..4ae62b4 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -174,7 +174,6 @@ struct pl08x_dma_chan { struct pl08x_channel_data *cd; dma_addr_t runtime_addr; enum dma_data_direction runtime_direction; - atomic_t last_issued; dma_cookie_t lc; struct list_head desc_list; struct pl08x_txd *at; -- cgit v1.1 From 7cb72ad959b16ac594118977b7954a7d2ec7a052 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:35:28 +0000 Subject: ARM: PL08x: avoid 'void *' struct fields when we can type them properly Avoid using 'void *' struct fields when the structs are not defined in linux/amba/pl08x.h - instead, forward declare the struct names, and use these instead. This ensures we have proper typechecking. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 4ae62b4..3ecc20f 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -22,6 +22,9 @@ #include #include +struct pl08x_lli; +struct pl08x_driver_data; + /** * struct pl08x_channel_data - data structure to pass info between * platform and PL08x driver regarding channel configuration @@ -179,7 +182,7 @@ struct pl08x_dma_chan { struct pl08x_txd *at; unsigned long lockflags; spinlock_t lock; - void *host; + struct pl08x_driver_data *host; enum pl08x_dma_chan_state state; bool slave; struct pl08x_txd *waiting; -- cgit v1.1 From cace658572ba5d1075f3891e823130a66f3e330f Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:37:31 +0000 Subject: ARM: PL08x: use 'size_t' for lengths Use size_t for variables denoting lengths throughout, and use the 'z' qualifier for printing the value. For safety, add a BUG_ON() in pl08x_fill_lli_for_desc() to catch the remainder potentially becoming negative. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 3ecc20f..2c834ed 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -77,7 +77,7 @@ struct pl08x_bus_data { dma_addr_t addr; u8 maxwidth; u8 buswidth; - u32 fill_bytes; + size_t fill_bytes; }; /** @@ -113,7 +113,7 @@ struct pl08x_txd { enum dma_data_direction direction; struct pl08x_bus_data srcbus; struct pl08x_bus_data dstbus; - int len; + size_t len; dma_addr_t llis_bus; void *llis_va; struct pl08x_channel_data *cd; -- cgit v1.1 From 19524d77ec34faf58d313ba34fb755ef6e159216 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:39:13 +0000 Subject: ARM: PL08x: avoid duplicating registers in txd and phychan structures As we now have all the code accessing the phychan {csrc,cdst,clli,cctl, ccfg} members in one function, there's no point storing the data into the struct. Get rid of the struct members. Re-order the register dump in the dev_dbg() to reflect the order we write the registers to the DMA device. The txd {csrc,cdst,clli,cctl} values are duplicates of the lli[0] values, so there's no point duplicating these either. Program the DMAC registers directly from the lli[0] values. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 2c834ed..29d9745 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -95,11 +95,6 @@ struct pl08x_phy_chan { spinlock_t lock; int signal; struct pl08x_dma_chan *serving; - u32 csrc; - u32 cdst; - u32 clli; - u32 cctl; - u32 ccfg; }; /** @@ -118,14 +113,6 @@ struct pl08x_txd { void *llis_va; struct pl08x_channel_data *cd; bool active; - /* - * Settings to be put into the physical channel when we - * trigger this txd - */ - u32 csrc; - u32 cdst; - u32 clli; - u32 cctl; }; /** -- cgit v1.1 From 4983a04fd2562986360b646b378f267308bc22c0 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:39:33 +0000 Subject: ARM: PL08x: move ccfg into txd structure The ccfg register is used to configure the channel parameters - the type and direction of transfer, the flow control signal and IRQ mask enables. The type and direction of transfer is known in the relevent prep_* function where a txd is created. The IRQ mask enables are always set, and the flow control signals are always set when we start processing a txd according to phychan->signal. If we store the ccfg value in the txd structure, we can avoid modifying platform data - and even having it in platform data at all. So, remove it from platform data too. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 29d9745..8e74cb1 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -58,7 +58,6 @@ struct pl08x_channel_data { int max_signal; u32 muxval; u32 cctl; - u32 ccfg; dma_addr_t addr; bool circular_buffer; bool single; @@ -113,6 +112,11 @@ struct pl08x_txd { void *llis_va; struct pl08x_channel_data *cd; bool active; + /* + * Settings to be put into the physical channel when we + * trigger this txd. Other registers are in llis_va[0]. + */ + u32 ccfg; }; /** -- cgit v1.1 From 70b5ed6b6d72cd8b1a3d4b7b878a0dd132bec7ba Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:40:13 +0000 Subject: ARM: PL08x: move default cctl into txd structure Rather than modifying platform data while preparing a transfer, copy the cctl value into the txd structure and modify the value there. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 8e74cb1..8d90830 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -110,8 +110,9 @@ struct pl08x_txd { size_t len; dma_addr_t llis_bus; void *llis_va; - struct pl08x_channel_data *cd; bool active; + /* Default cctl value for LLIs */ + u32 cctl; /* * Settings to be put into the physical channel when we * trigger this txd. Other registers are in llis_va[0]. -- cgit v1.1 From 30749cb4a40f02a199640011e5ab5c5f60b8482e Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:41:13 +0000 Subject: ARM: PL08x: allow AHB master port selection to be configured Platforms need to be able to control which AHB master interface is used, as each AHB master interface may be asymetric. Allow the interfaces used for fetching LLIs, memory, and each peripheral to be configured individually. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 8d90830..f858651 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -25,6 +25,12 @@ struct pl08x_lli; struct pl08x_driver_data; +/* Bitmasks for selecting AHB ports for DMA transfers */ +enum { + PL08X_AHB1 = (1 << 0), + PL08X_AHB2 = (1 << 1) +}; + /** * struct pl08x_channel_data - data structure to pass info between * platform and PL08x driver regarding channel configuration @@ -51,6 +57,8 @@ struct pl08x_driver_data; * round round round) * @single: the device connected to this channel will request single * DMA transfers, not bursts. (Bursts are default.) + * @periph_buses: the device connected to this channel is accessible via + * these buses (use PL08X_AHB1 | PL08X_AHB2). */ struct pl08x_channel_data { char *bus_id; @@ -61,6 +69,7 @@ struct pl08x_channel_data { dma_addr_t addr; bool circular_buffer; bool single; + u8 periph_buses; }; /** @@ -193,8 +202,8 @@ struct pl08x_dma_chan { * less than zero, else it returns the allocated signal number * @put_signal: indicate to the platform that this physical signal is not * running any DMA transfer and multiplexing can be recycled - * @bus_bit_lli: Bit[0] of the address indicated which AHB bus master the - * LLI addresses are on 0/1 Master 1/2. + * @lli_buses: buses which LLIs can be fetched from: PL08X_AHB1 | PL08X_AHB2 + * @mem_buses: buses which memory can be accessed from: PL08X_AHB1 | PL08X_AHB2 */ struct pl08x_platform_data { struct pl08x_channel_data *slave_channels; @@ -202,6 +211,8 @@ struct pl08x_platform_data { struct pl08x_channel_data memcpy_channel; int (*get_signal)(struct pl08x_dma_chan *); void (*put_signal)(struct pl08x_dma_chan *); + u8 lli_buses; + u8 mem_buses; }; #ifdef CONFIG_AMBA_PL08X -- cgit v1.1 From d7244e9a27a3da27d62aabf560ee828d7991493e Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:43:35 +0000 Subject: ARM: PL08x: shrink srcbus/dstbus in txd structure We only need to store the dma address. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index f858651..4e9e718 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -114,8 +114,8 @@ struct pl08x_txd { struct dma_async_tx_descriptor tx; struct list_head node; enum dma_data_direction direction; - struct pl08x_bus_data srcbus; - struct pl08x_bus_data dstbus; + dma_addr_t src_addr; + dma_addr_t dst_addr; size_t len; dma_addr_t llis_bus; void *llis_va; -- cgit v1.1 From 15c17232fbd1f7687c740c3c26f9e7f337bd9e36 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:44:36 +0000 Subject: ARM: PL08x: rename 'desc_list' as 'pend_list' This 'desc_list' is actually a list of pending descriptors, so name it after its function (pending list) rather than what it contains (descriptors). Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 4e9e718..08a9024 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -158,7 +158,7 @@ enum pl08x_dma_chan_state { * @runtime_direction: current direction of this channel according to * runtime config * @lc: last completed transaction on this channel - * @desc_list: queued transactions pending on this channel + * @pend_list: queued transactions pending on this channel * @at: active transaction on this channel * @lockflags: sometimes we let a lock last between two function calls, * especially prep/submit, and then we need to store the IRQ flags @@ -179,7 +179,7 @@ struct pl08x_dma_chan { dma_addr_t runtime_addr; enum dma_data_direction runtime_direction; dma_cookie_t lc; - struct list_head desc_list; + struct list_head pend_list; struct pl08x_txd *at; unsigned long lockflags; spinlock_t lock; -- cgit v1.1 From 8087aacda040bdbf84940712d132ce80c30b9d5d Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:45:17 +0000 Subject: ARM: PL08x: introduce 'phychan_hold' to hold on to physical channels Introduce 'phychan_hold' to hold on to physical DMA channels while we're preparing a new descriptor for it. This will be incremented when we allocate a physical channel and set the MUX registers during the preparation of the TXD, and will only be decremented when the TXD is submitted. This prevents the physical channel being given up before the new TXD is placed on the queue. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 08a9024..95b76ea 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -151,6 +151,8 @@ enum pl08x_dma_chan_state { * struct pl08x_dma_chan - this structure wraps a DMA ENGINE channel * @chan: wrappped abstract channel * @phychan: the physical channel utilized by this channel, if there is one + * @phychan_hold: if non-zero, hold on to the physical channel even if we + * have no pending entries * @tasklet: tasklet scheduled by the IRQ to handle actual work etc * @name: name of channel * @cd: channel platform data @@ -173,6 +175,7 @@ enum pl08x_dma_chan_state { struct pl08x_dma_chan { struct dma_chan chan; struct pl08x_phy_chan *phychan; + int phychan_hold; struct tasklet_struct tasklet; char *name; struct pl08x_channel_data *cd; -- cgit v1.1 From c370e594efe2993620d24d41a78f325102e99d1c Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Mon, 3 Jan 2011 22:45:37 +0000 Subject: ARM: PL08x: fix locking between prepare function and submit function The PL08x driver holds on to the channel lock with interrupts disabled between the prepare and the subsequent submit API functions. This means that the locking state when the prepare function returns is dependent on whether it suceeeds or not. It did this to ensure that the physical channel wasn't released, and as it used to add the descriptor onto the pending list at prepare time rather than submit time. Now that we have reorganized the code to remove those reasons, we can now safely release the spinlock at the end of preparation and reacquire it in our submit function. Signed-off-by: Russell King Acked-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 95b76ea..933b4ed 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -162,9 +162,6 @@ enum pl08x_dma_chan_state { * @lc: last completed transaction on this channel * @pend_list: queued transactions pending on this channel * @at: active transaction on this channel - * @lockflags: sometimes we let a lock last between two function calls, - * especially prep/submit, and then we need to store the IRQ flags - * in the channel state, here * @lock: a lock for this channel data * @host: a pointer to the host (internal use) * @state: whether the channel is idle, paused, running etc @@ -184,7 +181,6 @@ struct pl08x_dma_chan { dma_cookie_t lc; struct list_head pend_list; struct pl08x_txd *at; - unsigned long lockflags; spinlock_t lock; struct pl08x_driver_data *host; enum pl08x_dma_chan_state state; -- cgit v1.1 From cf24dc85ff29a41abd8e73730e5feb22b2666bd3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 19 Feb 2010 15:39:52 +0100 Subject: mtd: OneNAND: add enable / disable methods to onenand_chip Add enable / disable methods called from get_device() / release_device(). These can be used, for example, to allow the driver to prevent the voltage regulator from being put to sleep while OneNAND is in use. Signed-off-by: Adrian Hunter Signed-off-by: David Woodhouse --- include/linux/mtd/onenand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 6da3fe3..ae418e4 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -118,6 +118,8 @@ struct onenand_chip { int (*chip_probe)(struct mtd_info *mtd); int (*block_markbad)(struct mtd_info *mtd, loff_t ofs); int (*scan_bbt)(struct mtd_info *mtd); + int (*enable)(struct mtd_info *mtd); + int (*disable)(struct mtd_info *mtd); struct completion complete; int irq; -- cgit v1.1 From 0e4ca7e5101e7f4054452b8d71c535eec64a187b Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Thu, 16 Dec 2010 23:42:14 +0100 Subject: mtd: add writebufsize field to mtd_info struct This field will be used to indicate the write buffer size of the MTD device. UBI will set it's minimal I/O unit size (min_io_size) to the indicated write buffer size. By this change we intend to fix failed recovery of UBIFS partitions we currently observe on NOR flash when mounting the partition after unclean unmount. Currently the min_io_size is set to mtd->writesize (which is 1 byte for NOR flash). But flash programming is often done from prepared write buffer containing multiple bytes and is performed in one programming operation which could be interrupted by a power cut or a system reset causing corrupted (partially written) areas in a flash sector. Knowing the size of potentially corrupted areas UBIFS scanning and recovery algorithms are able to perform successful recovery. In case of NOR flash minimal I/O size must be equal to the maximal size of the write buffer used by embedded flash programming algorithm. In case of NAND flash mtd->writebufsize should be equivalent to mtd->writesize. The subsequent patches will add mtd->writebufsize initialization where needed. Signed-off-by: Anatolij Gustschin Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/mtd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index fe8d77e..9d5306b 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -144,6 +144,17 @@ struct mtd_info { */ uint32_t writesize; + /* + * Size of the write buffer used by the MTD. MTD devices having a write + * buffer can write multiple writesize chunks at a time. E.g. while + * writing 4 * writesize bytes to a device with 2 * writesize bytes + * buffer the MTD driver can (but doesn't have to) do 2 writesize + * operations, but not 4. Currently, all NANDs have writebufsize + * equivalent to writesize (NAND page size). Some NOR flashes do have + * writebufsize greater than writesize. + */ + uint32_t writebufsize; + uint32_t oobsize; // Amount of OOB data per block (e.g. 16) uint32_t oobavail; // Available OOB bytes per block -- cgit v1.1 From 26fcaf60fe3861409eb4c455c5c0d0f00f599b08 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 7 Jan 2011 01:42:31 +0100 Subject: PM: Fix oops in suspend/hibernate code related to failing ioremap() When ioremap() fails (which might happen for some reason), we nicely oops in suspend_nvs_save() due to NULL dereference by memcpy() in there. Fail gracefully instead. Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/linux/suspend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2669751..acb7d91 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -262,7 +262,7 @@ static inline bool system_entering_hibernation(void) { return false; } extern int suspend_nvs_register(unsigned long start, unsigned long size); extern int suspend_nvs_alloc(void); extern void suspend_nvs_free(void); -extern void suspend_nvs_save(void); +extern int suspend_nvs_save(void); extern void suspend_nvs_restore(void); #else /* CONFIG_SUSPEND_NVS */ static inline int suspend_nvs_register(unsigned long a, unsigned long b) @@ -271,7 +271,7 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b) } static inline int suspend_nvs_alloc(void) { return 0; } static inline void suspend_nvs_free(void) {} -static inline void suspend_nvs_save(void) {} +static inline int suspend_nvs_save(void) {} static inline void suspend_nvs_restore(void) {} #endif /* CONFIG_SUSPEND_NVS */ -- cgit v1.1 From 976513dbfc1547c7b1822566923058655f0c32fd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jan 2011 01:43:44 +0100 Subject: PM / ACPI: Move NVS saving and restoring code to drivers/acpi The saving of the ACPI NVS area during hibernation and suspend and restoring it during the subsequent resume is entirely specific to ACPI, so move it to drivers/acpi and drop the CONFIG_SUSPEND_NVS configuration option which is redundant. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/linux/acpi.h | 9 +++++++++ include/linux/suspend.h | 17 ----------------- 2 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 67c91b4..fa7ed6a98 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -254,6 +254,15 @@ void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_ACPI_SLEEP +int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + struct acpi_osc_context { char *uuid_str; /* uuid string */ int rev; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index acb7d91..0e288e3 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -258,23 +258,6 @@ static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ -#ifdef CONFIG_SUSPEND_NVS -extern int suspend_nvs_register(unsigned long start, unsigned long size); -extern int suspend_nvs_alloc(void); -extern void suspend_nvs_free(void); -extern int suspend_nvs_save(void); -extern void suspend_nvs_restore(void); -#else /* CONFIG_SUSPEND_NVS */ -static inline int suspend_nvs_register(unsigned long a, unsigned long b) -{ - return 0; -} -static inline int suspend_nvs_alloc(void) { return 0; } -static inline void suspend_nvs_free(void) {} -static inline int suspend_nvs_save(void) {} -static inline void suspend_nvs_restore(void) {} -#endif /* CONFIG_SUSPEND_NVS */ - #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); -- cgit v1.1 From 7fa69baf29de8c77a6b32c054df2abb8f11f8aa4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Jan 2011 23:35:10 +0100 Subject: ACPI / PM: Drop special ACPI wakeup flags Drop special ACPI wakeup flags, wakeup.state.enabled and wakeup.flags.always_enabled, that aren't necessary any more after we've started to use standard device wakeup flags for handling ACPI wakeup devices. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 359ef11..f2499f5 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -242,20 +242,14 @@ struct acpi_device_perf { struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 run_wake:1; /* Run-Wake GPE devices */ - u8 always_enabled:1; /* Run-wake devices that are always enabled */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; -struct acpi_device_wakeup_state { - u8 enabled:1; -}; - struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct acpi_handle_list resources; - struct acpi_device_wakeup_state state; struct acpi_device_wakeup_flags flags; int prepare_count; int run_wake_count; -- cgit v1.1 From 5a344a505093dd65f82f338ffdb7208321b3630e Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Mon, 10 Jan 2011 16:35:45 +0800 Subject: ACPI: Reevaluate whether the T-state is supported or not after cpu is online/offline After one CPU is offlined, it is unnecessary to switch T-state for it. So it will be better that the throttling is disabled after the cpu is offline. At the same time after one cpu is online, we should check whether the T-state is supported and then set the corresponding T-state flag. Signed-off-by: Zhao Yakui Signed-off-by: Len Brown --- include/acpi/processor.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 1b62102..55192ac 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -324,6 +324,12 @@ int acpi_processor_tstate_has_changed(struct acpi_processor *pr); int acpi_processor_get_throttling_info(struct acpi_processor *pr); extern int acpi_processor_set_throttling(struct acpi_processor *pr, int state, bool force); +/* + * Reevaluate whether the T-state is invalid after one cpu is + * onlined/offlined. In such case the flags.throttling will be updated. + */ +extern void acpi_processor_reevaluate_tstate(struct acpi_processor *pr, + unsigned long action); extern const struct file_operations acpi_processor_throttling_fops; extern void acpi_processor_throttling_init(void); /* in processor_idle.c */ -- cgit v1.1 From 4976b4eb9d083f035aa97afec560c7e1c16c6afd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 4 Jan 2011 13:02:32 +0100 Subject: mac80211: add remain-on-channel docs Add documentation for the new callbacks that I forgot in the patch adding the callbacks. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/mac80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5b3fd5a..3ce8e1f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1753,6 +1753,16 @@ enum ieee80211_ampdu_mlme_action { * (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX). * * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant). + * + * @remain_on_channel: Starts an off-channel period on the given channel, must + * call back to ieee80211_ready_on_channel() when on that channel. Note + * that normal channel traffic is not stopped as this is intended for hw + * offload. Frames to transmit on the off-channel channel are transmitted + * normally except for the %IEEE80211_TX_CTL_TX_OFFCHAN flag. When the + * duration (which will always be non-zero) expires, the driver must call + * ieee80211_remain_on_channel_expired(). This callback may sleep. + * @cancel_remain_on_channel: Requests that an ongoing off-channel period is + * aborted before it expires. This callback may sleep. */ struct ieee80211_ops { int (*tx)(struct ieee80211_hw *hw, struct sk_buff *skb); -- cgit v1.1 From 610dbc980f7ad886313278ce946287f24b44cf55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 Jan 2011 22:36:44 +0100 Subject: mac80211: add missing docs for off-chan TX flag The flag is IEEE80211_TX_CTL_TX_OFFCHAN and I had added that in a previous patch but forgotten docs. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 3ce8e1f..62c0ce2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -337,6 +337,10 @@ struct ieee80211_bss_conf { * @IEEE80211_TX_CTL_LDPC: tells the driver to use LDPC for this frame * @IEEE80211_TX_CTL_STBC: Enables Space-Time Block Coding (STBC) for this * frame and selects the maximum number of streams that it can use. + * @IEEE80211_TX_CTL_TX_OFFCHAN: Marks this packet to be transmitted on + * the off-channel channel when a remain-on-channel offload is done + * in hardware -- normal packets still flow and are expected to be + * handled properly by the device. * * Note: If you have to add new flags to the enumeration, then don't * forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary. -- cgit v1.1 From f52555a4b2d229079155e6642ec09afa63d10cab Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 Jan 2011 22:36:45 +0100 Subject: cfg80211: add mesh join/leave callback docs When I made the patch to add mesh join/leave I didn't pay attention to docs because it was a proof of concept, and then when we actually did merge it I forgot -- add docs now. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bcc9f44..1322695 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1103,6 +1103,8 @@ struct cfg80211_pmksa { * @change_mpath: change a given mesh path * @get_mpath: get a mesh path for the given parameters * @dump_mpath: dump mesh path callback -- resume dump at index @idx + * @join_mesh: join the mesh network with the specified parameters + * @leave_mesh: leave the current mesh network * * @get_mesh_config: Get the current mesh configuration * -- cgit v1.1 From 45007fd590c6b099cec5d36ea7056b00f8b4916a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 Jan 2011 22:36:46 +0100 Subject: nl80211: add/fix mesh docs Some mesh attribute/command docs are missing or have errors in the name so they don't match, fix all of them. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/nl80211.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 2b89b71..821ffb9 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -148,6 +148,10 @@ * @NL80211_CMD_SET_MPATH: Set mesh path attributes for mesh path to * destination %NL80211_ATTR_MAC on the interface identified by * %NL80211_ATTR_IFINDEX. + * @NL80211_CMD_NEW_MPATH: Create a new mesh path for the destination given by + * %NL80211_ATTR_MAC via %NL80211_ATTR_MPATH_NEXT_HOP. + * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by + * %NL80211_ATTR_MAC. * @NL80211_CMD_NEW_PATH: Add a mesh path with given attributes to the * the interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_PATH: Remove a mesh path identified by %NL80211_ATTR_MAC @@ -612,7 +616,7 @@ enum nl80211_commands { * consisting of a nested array. * * @NL80211_ATTR_MESH_ID: mesh id (1-32 bytes). - * @NL80211_ATTR_PLINK_ACTION: action to perform on the mesh peer link. + * @NL80211_ATTR_STA_PLINK_ACTION: action to perform on the mesh peer link. * @NL80211_ATTR_MPATH_NEXT_HOP: MAC address of the next hop for a mesh path. * @NL80211_ATTR_MPATH_INFO: information about a mesh_path, part of mesh path * info given for %NL80211_CMD_GET_MPATH, nested attribute described at @@ -879,7 +883,9 @@ enum nl80211_commands { * See &enum nl80211_key_default_types. * * @NL80211_ATTR_MESH_SETUP: Optional mesh setup parameters. These cannot be - * changed once the mesh is active. + * changed once the mesh is active. + * @NL80211_ATTR_MESH_CONFIG: Mesh configuration parameters, a nested attribute + * containing attributes from &enum nl80211_meshconf_params. * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -1225,8 +1231,6 @@ enum nl80211_rate_info { * @NL80211_STA_INFO_INACTIVE_TIME: time since last activity (u32, msecs) * @NL80211_STA_INFO_RX_BYTES: total received bytes (u32, from this station) * @NL80211_STA_INFO_TX_BYTES: total transmitted bytes (u32, to this station) - * @__NL80211_STA_INFO_AFTER_LAST: internal - * @NL80211_STA_INFO_MAX: highest possible station info attribute * @NL80211_STA_INFO_SIGNAL: signal strength of last received PPDU (u8, dBm) * @NL80211_STA_INFO_TX_BITRATE: current unicast tx rate, nested attribute * containing info as possible, see &enum nl80211_sta_info_txrate. @@ -1236,6 +1240,11 @@ enum nl80211_rate_info { * @NL80211_STA_INFO_TX_RETRIES: total retries (u32, to this station) * @NL80211_STA_INFO_TX_FAILED: total failed packets (u32, to this station) * @NL80211_STA_INFO_SIGNAL_AVG: signal strength average (u8, dBm) + * @NL80211_STA_INFO_LLID: the station's mesh LLID + * @NL80211_STA_INFO_PLID: the station's mesh PLID + * @NL80211_STA_INFO_PLINK_STATE: peer link state for the station + * @__NL80211_STA_INFO_AFTER_LAST: internal + * @NL80211_STA_INFO_MAX: highest possible station info attribute */ enum nl80211_sta_info { __NL80211_STA_INFO_INVALID, @@ -1626,7 +1635,7 @@ enum nl80211_mntr_flags { * @NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME: The interval of time (in TUs) * that it takes for an HWMP information element to propagate across the mesh * - * @NL80211_MESHCONF_ROOTMODE: whether root mode is enabled or not + * @NL80211_MESHCONF_HWMP_ROOTMODE: whether root mode is enabled or not * * @NL80211_MESHCONF_ELEMENT_TTL: specifies the value of TTL field set at a * source mesh point for path selection elements. @@ -1678,6 +1687,7 @@ enum nl80211_meshconf_params { * element that vendors will use to identify the path selection methods and * metrics in use. * + * @NL80211_MESH_SETUP_ATTR_MAX: highest possible mesh setup attribute number * @__NL80211_MESH_SETUP_ATTR_AFTER_LAST: Internal use */ enum nl80211_mesh_setup_params { -- cgit v1.1 From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 9 Jan 2011 23:32:15 +0100 Subject: workqueue: relax lockdep annotation on flush_work() Currently, the lockdep annotation in flush_work() requires exclusive access on the workqueue the target work is queued on and triggers warning if a work is trying to flush another work on the same workqueue; however, this is no longer true as workqueues can now execute multiple works concurrently. This patch adds lock_map_acquire_read() and make process_one_work() hold read access to the workqueue while executing a work and start_flush_work() check for write access if concurrnecy level is one or the workqueue has a rescuer (as only one execution resource - the rescuer - is guaranteed to be available under memory pressure), and read access if higher. This better represents what's going on and removes spurious lockdep warnings which are triggered by fake dependency chain created through flush_work(). * Peter pointed out that flushing another work from a WQ_MEM_RECLAIM wq breaks forward progress guarantee under memory pressure. Condition check accordingly updated. Signed-off-by: Tejun Heo Reported-by: "Rafael J. Wysocki" Tested-by: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: stable@kernel.org --- include/linux/lockdep.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26..9f19430 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -522,12 +522,15 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) +# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) # else # define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) +# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) # endif # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) #else # define lock_map_acquire(l) do { } while (0) +# define lock_map_acquire_read(l) do { } while (0) # define lock_map_release(l) do { } while (0) #endif -- cgit v1.1 From 925268a06dc2b1ff7bfcc37419a6827a0e739639 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 Jan 2011 16:44:01 +0900 Subject: memory hotplug: one more lock on memory hotplug Now, memory_hotplug_(un)lock() is used for add/remove/offline pages for avoiding races with hibernation. But this should be held in online_pages(), too. It seems asymmetric. There are cases where one has to avoid a race with memory hotplug notifier and his own local code, and hotplug v.s. hotplug. This will add a generic solution for avoiding races. In other view, having lock here has no big impacts. online pages is tend to be done by udev script at el against each memory section one by one. Then, it's better to have lock here, too. Cc: # 2.6.37 Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Pekka Enberg --- include/linux/memory_hotplug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a..12b9eb5a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -161,6 +161,12 @@ extern void register_page_bootmem_info_node(struct pglist_data *pgdat); extern void put_page_bootmem(struct page *page); #endif +/* + * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug + * notifier will be called under this. 2) offline/online/add/remove memory + * will not run simultaneously. + */ + void lock_memory_hotplug(void); void unlock_memory_hotplug(void); -- cgit v1.1 From f07745325cbd93afc5d8bcf7539a063d33134075 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: xen: define gnttab_set_map_op/unmap_op Impact: hypercall definitions These functions populate the gnttab data structures used by the granttab map and unmap ops and are used in the backend drivers. Originally xen-unstable.hg 9625:c3bb51c443a7 [ Include Stefano's fix for phys_addr_t ] Signed-off-by: Ian Campbell Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/grant_table.h | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 9a73170..1821aa1 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -37,10 +37,16 @@ #ifndef __ASM_GNTTAB_H__ #define __ASM_GNTTAB_H__ -#include +#include + +#include #include + +#include #include +#include + /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ #define NR_GRANT_FRAMES 4 @@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); +static inline void +gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, + uint32_t flags, grant_ref_t ref, domid_t domid) +{ + if (flags & GNTMAP_contains_pte) + map->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + map->host_addr = __pa(addr); + else + map->host_addr = addr; + + map->flags = flags; + map->ref = ref; + map->dom = domid; +} + +static inline void +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, + uint32_t flags, grant_handle_t handle) +{ + if (flags & GNTMAP_contains_pte) + unmap->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + unmap->host_addr = __pa(addr); + else + unmap->host_addr = addr; + + unmap->handle = handle; + unmap->dev_bus_addr = 0; +} + int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, struct grant_entry **__shared); -- cgit v1.1 From ab31523c2fcac557226bac72cbdf5fafe01f9a26 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 14 Dec 2010 18:40:46 +0000 Subject: xen/gntdev: allow usermode to map granted pages The gntdev driver allows usermode to map granted pages from other domains. This is typically used to implement a Xen backend driver in user mode. Signed-off-by: Gerd Hoffmann Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/gntdev.h | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 include/xen/gntdev.h (limited to 'include') diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h new file mode 100644 index 0000000..eb23f41 --- /dev/null +++ b/include/xen/gntdev.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ -- cgit v1.1 From 289b777eac19c811b474593b4d2fd14e46340c23 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 10 Dec 2010 14:54:44 +0000 Subject: xen: introduce gnttab_map_refs and gnttab_unmap_refs gnttab_map_refs maps some grant refs and uses the new m2p override to set a proper m2p mapping for the granted pages. gnttab_unmap_refs unmaps the granted refs and removes th mappings from the m2p override. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/grant_table.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 1821aa1..b1fab6b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -155,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count); +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count); + #endif /* __ASM_GNTTAB_H__ */ -- cgit v1.1 From 1d1bc8f2074f0b728dfca2a3c16f2f5a3f298ffc Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 4 Oct 2010 23:12:59 -0400 Subject: nfsd4: support BIND_CONN_TO_SESSION Basic xdr and processing for BIND_CONN_TO_SESSION. This adds a connection to the list of connections associated with a session. Signed-off-by: J. Bruce Fields --- include/linux/nfs4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 26afa30..a591893 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -65,6 +65,9 @@ #define NFS4_CDFC4_FORE 0x1 #define NFS4_CDFC4_BACK 0x2 +#define NFS4_CDFC4_BOTH 0x3 +#define NFS4_CDFC4_FORE_OR_BOTH 0x3 +#define NFS4_CDFC4_BACK_OR_BOTH 0x7 #define NFS4_SET_TO_SERVER_TIME 0 #define NFS4_SET_TO_CLIENT_TIME 1 -- cgit v1.1 From d75faea330dbd1873c9094e9926ae306590c0998 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 30 Nov 2010 19:15:01 -0500 Subject: rpc: move sk_bc_xprt to svc_xprt This seems obviously transport-level information even if it's currently used only by the server socket code. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + include/linux/sunrpc/svcsock.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index c8f81da..7ad9751 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -79,6 +79,7 @@ struct svc_xprt { struct list_head xpt_users; /* callbacks on free */ struct net *xpt_net; + struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ }; static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 1b353a7..04dba23 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -28,7 +28,6 @@ struct svc_sock { /* private TCP part */ u32 sk_reclen; /* length of record */ u32 sk_tcplen; /* current read length */ - struct rpc_xprt *sk_bc_xprt; /* NFSv4.1 backchannel xprt */ }; /* -- cgit v1.1 From f0418aa4b1103f959d64dc18273efa04ee0140e9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 8 Dec 2010 13:48:19 -0500 Subject: rpc: allow xprt_class->setup to return a preexisting xprt This allows us to reuse the xprt associated with a server connection if one has already been set up. Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 89d10d2..bef0f53 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -321,6 +321,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_CLOSING (6) #define XPRT_CONNECTION_ABORT (7) #define XPRT_CONNECTION_CLOSE (8) +#define XPRT_INITIALIZED (9) static inline void xprt_set_connected(struct rpc_xprt *xprt) { -- cgit v1.1 From 4cb18728709683c91a5f6f8d5f337bfb498b089a Mon Sep 17 00:00:00 2001 From: "R.Durgadoss" Date: Wed, 27 Oct 2010 03:33:29 +0530 Subject: thermal: Add event notification to thermal framework This patch adds event notification support to the generic thermal sysfs framework in the kernel. The notification is in the form of a netlink event. Signed-off-by: R.Durgadoss Signed-off-by: Len Brown --- include/linux/thermal.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1de8b9eb..7846449 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -127,6 +127,37 @@ struct thermal_zone_device { struct thermal_hwmon_attr temp_crit; /* hwmon sys attr */ #endif }; +/* Adding event notification support elements */ +#define THERMAL_GENL_FAMILY_NAME "thermal_event" +#define THERMAL_GENL_VERSION 0x01 +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_group" + +enum events { + THERMAL_AUX0, + THERMAL_AUX1, + THERMAL_CRITICAL, + THERMAL_DEV_FAULT, +}; + +struct thermal_genl_event { + u32 orig; + enum events event; +}; +/* attributes of thermal_genl_family */ +enum { + THERMAL_GENL_ATTR_UNSPEC, + THERMAL_GENL_ATTR_EVENT, + __THERMAL_GENL_ATTR_MAX, +}; +#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) + +/* commands supported by the thermal_genl_family */ +enum { + THERMAL_GENL_CMD_UNSPEC, + THERMAL_GENL_CMD_EVENT, + __THERMAL_GENL_CMD_MAX, +}; +#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, struct @@ -146,5 +177,6 @@ struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, thermal_cooling_device_ops *); void thermal_cooling_device_unregister(struct thermal_cooling_device *); +extern int generate_netlink_event(u32 orig, enum events event); #endif /* __THERMAL_H__ */ -- cgit v1.1 From 3a37898d507794cfc68a092303e02651d3f01308 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 13 Dec 2010 13:36:15 +0800 Subject: ACPICA: Rename some function and variable names Some function and variable names are renamed to be consistent with ACPICA code base. acpi_raw_enable_gpe -> acpi_ev_add_gpe_reference acpi_raw_disable_gpe -> acpi_ev_remove_gpe_reference acpi_gpe_can_wake -> acpi_setup_gpe_for_wake acpi_gpe_wakeup -> acpi_set_gpe_wake_mask acpi_update_gpes -> acpi_update_all_gpes acpi_all_gpes_initialized -> acpi_gbl_all_gpes_initialized acpi_handler_info -> acpi_gpe_handler_info ... Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 6 +++--- include/acpi/actypes.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 53b7cfd..5f8ccf1 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -292,11 +292,11 @@ acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number); acpi_status acpi_disable_gpe(acpi_handle gpe_device, u32 gpe_number); -acpi_status acpi_gpe_can_wake(acpi_handle gpe_device, u32 gpe_number); +acpi_status acpi_setup_gpe_for_wake(acpi_handle gpe_device, u32 gpe_number); acpi_status acpi_clear_gpe(acpi_handle gpe_device, u32 gpe_number); -acpi_status acpi_gpe_wakeup(acpi_handle gpe_device, u32 gpe_number, u8 action); +acpi_status acpi_set_gpe_wake_mask(acpi_handle gpe_device, u32 gpe_number, u8 action); acpi_status acpi_get_gpe_status(acpi_handle gpe_device, @@ -315,7 +315,7 @@ acpi_install_gpe_block(acpi_handle gpe_device, acpi_status acpi_remove_gpe_block(acpi_handle gpe_device); -acpi_status acpi_update_gpes(void); +acpi_status acpi_update_all_gpes(void); /* * Resource interfaces diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 2b134b6..e738939 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -656,11 +656,11 @@ typedef u32 acpi_event_status; #define ACPI_GPE_MAX 0xFF #define ACPI_NUM_GPE 256 -/* Actions for acpi_gpe_wakeup, acpi_hw_low_set_gpe */ +/* Actions for acpi_set_gpe_wake_mask, acpi_hw_low_set_gpe */ #define ACPI_GPE_ENABLE 0 #define ACPI_GPE_DISABLE 1 -#define ACPI_GPE_COND_ENABLE 2 +#define ACPI_GPE_CONDITIONAL_ENABLE 2 /* * GPE info flags - Per GPE -- cgit v1.1 From 8b6cd8ad18def34bfc5045b2a0234329bf94cf78 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 13 Dec 2010 13:38:46 +0800 Subject: ACPICA: New GPE handler callback definition The new GPE handler callback has 2 additional parameters, gpe_device and gpe_number. typedef u32 (*acpi_gpe_handler) (acpi_handle gpe_device, u32 gpe_number, void *context); Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 4 ++-- include/acpi/actypes.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 5f8ccf1..b806e56 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -258,11 +258,11 @@ acpi_remove_address_space_handler(acpi_handle device, acpi_status acpi_install_gpe_handler(acpi_handle gpe_device, u32 gpe_number, - u32 type, acpi_event_handler address, void *context); + u32 type, acpi_gpe_handler address, void *context); acpi_status acpi_remove_gpe_handler(acpi_handle gpe_device, - u32 gpe_number, acpi_event_handler address); + u32 gpe_number, acpi_gpe_handler address); #ifdef ACPI_FUTURE_USAGE acpi_status acpi_install_exception_handler(acpi_exception_handler handler); diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index e738939..b9575ad 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -897,6 +897,9 @@ typedef void typedef u32(*acpi_event_handler) (void *context); typedef +u32 (*acpi_gpe_handler) (acpi_handle gpe_device, u32 gpe_number, void *context); + +typedef void (*acpi_notify_handler) (acpi_handle device, u32 value, void *context); typedef -- cgit v1.1 From bba63a296ffab20e08d9e8252d2f0d99050ac859 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 13 Dec 2010 13:39:17 +0800 Subject: ACPICA: Implicit notify support This feature provides an automatic device notification for wake devices when a wakeup GPE occurs and there is no corresponding GPE method or handler. Rather than ignoring such a GPE, an implicit AML Notify operation is performed on the parent device object. This feature is not part of the ACPI specification and is provided for Windows compatibility only. Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 6 ++++-- include/acpi/actypes.h | 37 +++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index b806e56..9de6a17 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -292,10 +292,12 @@ acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number); acpi_status acpi_disable_gpe(acpi_handle gpe_device, u32 gpe_number); -acpi_status acpi_setup_gpe_for_wake(acpi_handle gpe_device, u32 gpe_number); - acpi_status acpi_clear_gpe(acpi_handle gpe_device, u32 gpe_number); +acpi_status +acpi_setup_gpe_for_wake(acpi_handle parent_device, + acpi_handle gpe_device, u32 gpe_number); + acpi_status acpi_set_gpe_wake_mask(acpi_handle gpe_device, u32 gpe_number, u8 action); acpi_status diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index b9575ad..d7274ee 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -664,25 +664,26 @@ typedef u32 acpi_event_status; /* * GPE info flags - Per GPE - * +-------+---+-+-+ - * | 7:4 |3:2|1|0| - * +-------+---+-+-+ - * | | | | - * | | | +--- Interrupt type: edge or level triggered - * | | +----- GPE can wake the system - * | +-------- Type of dispatch:to method, handler, or none - * +-------------- + * +-------+-+-+---+ + * | 7:4 |3|2|1:0| + * +-------+-+-+---+ + * | | | | + * | | | +-- Type of dispatch:to method, handler, notify, or none + * | | +----- Interrupt type: edge or level triggered + * | +------- Is a Wake GPE + * +------------ */ -#define ACPI_GPE_XRUPT_TYPE_MASK (u8) 0x01 -#define ACPI_GPE_LEVEL_TRIGGERED (u8) 0x01 -#define ACPI_GPE_EDGE_TRIGGERED (u8) 0x00 +#define ACPI_GPE_DISPATCH_NONE (u8) 0x00 +#define ACPI_GPE_DISPATCH_METHOD (u8) 0x01 +#define ACPI_GPE_DISPATCH_HANDLER (u8) 0x02 +#define ACPI_GPE_DISPATCH_NOTIFY (u8) 0x03 +#define ACPI_GPE_DISPATCH_MASK (u8) 0x03 -#define ACPI_GPE_CAN_WAKE (u8) 0x02 +#define ACPI_GPE_LEVEL_TRIGGERED (u8) 0x04 +#define ACPI_GPE_EDGE_TRIGGERED (u8) 0x00 +#define ACPI_GPE_XRUPT_TYPE_MASK (u8) 0x04 -#define ACPI_GPE_DISPATCH_MASK (u8) 0x0C -#define ACPI_GPE_DISPATCH_HANDLER (u8) 0x04 -#define ACPI_GPE_DISPATCH_METHOD (u8) 0x08 -#define ACPI_GPE_DISPATCH_NOT_USED (u8) 0x00 +#define ACPI_GPE_CAN_WAKE (u8) 0x08 /* * Flags for GPE and Lock interfaces @@ -954,6 +955,10 @@ u32 (*acpi_interface_handler) (acpi_string interface_name, u32 supported); #define ACPI_INTERRUPT_NOT_HANDLED 0x00 #define ACPI_INTERRUPT_HANDLED 0x01 +/* GPE handler return values */ + +#define ACPI_REENABLE_GPE 0x80 + /* Length of 32-bit EISAID values when converted back to a string */ #define ACPI_EISAID_STRING_SIZE 8 /* Includes null terminator */ -- cgit v1.1 From a0fcdb237fcd4eaa7e5009b28ef5be07415f287d Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 13 Dec 2010 13:39:26 +0800 Subject: ACPICA: Global event handler The global event handler is called whenever a general purpose or fixed ACPI event occurs. Also update Linux OSL to collect events counter with global event handler. Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 4 ++++ include/acpi/actypes.h | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 9de6a17..e8142d5 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -229,6 +229,10 @@ acpi_status acpi_install_initialization_handler(acpi_init_handler handler, u32 function); acpi_status +acpi_install_global_event_handler(ACPI_GBL_EVENT_HANDLER handler, + void *context); + +acpi_status acpi_install_fixed_event_handler(u32 acpi_event, acpi_event_handler handler, void *context); diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index d7274ee..939a431 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -895,6 +895,14 @@ typedef void /* * Various handlers and callback procedures */ +typedef +void (*ACPI_GBL_EVENT_HANDLER) (u32 event_type, + acpi_handle device, + u32 event_number, void *context); + +#define ACPI_EVENT_TYPE_GPE 0 +#define ACPI_EVENT_TYPE_FIXED 1 + typedef u32(*acpi_event_handler) (void *context); typedef -- cgit v1.1 From 1ae5ec903f71c0ffa583ec54d17415892036ee18 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 27 Dec 2010 09:07:43 +0800 Subject: ACPICA: Update version to 20101209 Version 20101209. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index e8142d5..241b8a0 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -47,7 +47,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20101013 +#define ACPI_CA_VERSION 0x20101209 #include "actypes.h" #include "actbl.h" -- cgit v1.1 From 25eed40720fc9005c63a1f436e5f8a78836c26ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 25 Nov 2010 00:09:15 +0100 Subject: ACPI / PM: Add function for updating device power state consistently Add function acpi_bus_update_power() for reading the actual power state of an ACPI device and updating its device->power.state field in such a way that its power resources' reference counters will remain consistent with that field. For this purpose introduce __acpi_bus_set_power() setting the power state of an ACPI device without updating its device->power.state field and make acpi_bus_set_power() and acpi_bus_update_power() use it (acpi_bus_set_power() retains the current behavior for now). Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 359ef11..5d2c4c5 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -330,6 +330,7 @@ acpi_status acpi_bus_get_status_handle(acpi_handle handle, int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_get_power(acpi_handle handle, int *state); int acpi_bus_set_power(acpi_handle handle, int state); +int acpi_bus_update_power(acpi_handle handle, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); bool acpi_bus_can_wakeup(acpi_handle handle); #ifdef CONFIG_ACPI_PROC_EVENT -- cgit v1.1 From 488a76c52606199100adf09c8eb7cbedbd94e9d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 25 Nov 2010 00:11:24 +0100 Subject: ACPI / Fan: Rework the handling of power resources Use the new function acpi_bus_update_power() for manipulating power resources used by ACPI fan devices, which allows them to be put into the right state during initialization and resume. Consequently, remove the flags.force_power_state field from struct acpi_device, which is not necessary any more. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5d2c4c5..8912580 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -149,8 +149,7 @@ struct acpi_device_flags { u32 power_manageable:1; u32 performance_manageable:1; u32 wake_capable:1; /* Wakeup(_PRW) supported? */ - u32 force_power_state:1; - u32 reserved:22; + u32 reserved:23; }; /* File System */ -- cgit v1.1 From f6767dcf2a4f6e62960912d0affec1e15a246191 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Dec 2010 23:44:39 +0100 Subject: ACPI / PM: Drop acpi_bus_get_power() There are no more users of acpi_bus_get_power(), so it can be dropped. Moreover, it should be dropped, because it modifies the device->power.state field of an ACPI device without updating the reference counters of the device's power resources, which is wrong. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 8912580..673a3f4 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -327,7 +327,6 @@ void acpi_bus_data_handler(acpi_handle handle, void *context); acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); -int acpi_bus_get_power(acpi_handle handle, int *state); int acpi_bus_set_power(acpi_handle handle, int state); int acpi_bus_update_power(acpi_handle handle, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); -- cgit v1.1 From d57d09a480e1db38eeee7629c81289b00f338a15 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Jan 2011 23:41:27 +0100 Subject: ACPI: Drop device flag wake_capable The wake_capable ACPI device flag is not necessary, because it is only used in scan.c for recording the information that _PRW is present for the given device. That information is only used by acpi_add_single_object() to decide whether or not to call acpi_bus_get_wakeup_device_flags(), so the flag may be dropped if the _PRW check is moved to acpi_bus_get_wakeup_device_flags(). Moreover, acpi_bus_get_wakeup_device_flags() always returns 0, so it really should be void. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 20b05cd..78ca429 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -148,8 +148,7 @@ struct acpi_device_flags { u32 suprise_removal_ok:1; u32 power_manageable:1; u32 performance_manageable:1; - u32 wake_capable:1; /* Wakeup(_PRW) supported? */ - u32 reserved:23; + u32 reserved:24; }; /* File System */ -- cgit v1.1 From d247632c08c674864d438733280422ddb7130ff8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:34:59 -0500 Subject: cpuidle: delete NOP CPUIDLE_FLAG_POLL it serves no purpose Signed-off-by: Len Brown --- include/linux/cpuidle.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1be416b..dee3285 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -48,7 +48,6 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ #define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ -#define CPUIDLE_FLAG_POLL (0x10) /* no latency, no savings */ #define CPUIDLE_FLAG_SHALLOW (0x20) /* low latency, minimal savings */ #define CPUIDLE_FLAG_BALANCED (0x40) /* medium latency, moderate savings */ #define CPUIDLE_FLAG_DEEP (0x80) /* high latency, large savings */ -- cgit v1.1 From 642f11c53f17aee991d97d14e6922172849ef227 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:45:00 -0500 Subject: cpuidle: delete unused CPUIDLE_FLAG_SHALLOW, BALANCED, DEEP definitions Signed-off-by: Len Brown --- include/linux/cpuidle.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index dee3285..c252953 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -48,9 +48,6 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ #define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ -#define CPUIDLE_FLAG_SHALLOW (0x20) /* low latency, minimal savings */ -#define CPUIDLE_FLAG_BALANCED (0x40) /* medium latency, moderate savings */ -#define CPUIDLE_FLAG_DEEP (0x80) /* high latency, large savings */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ #define CPUIDLE_FLAG_TLB_FLUSHED (0x200) /* tlb will be flushed */ -- cgit v1.1 From 956d033fb2eb3f8818260cdf01644bf4dc1a9e33 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:51:20 -0500 Subject: cpuidle: CPUIDLE_FLAG_TLB_FLUSHED is specific to intel_idle Signed-off-by: Len Brown --- include/linux/cpuidle.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index c252953..6be722c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -49,7 +49,6 @@ struct cpuidle_state { #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ #define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ -#define CPUIDLE_FLAG_TLB_FLUSHED (0x200) /* tlb will be flushed */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) -- cgit v1.1 From 5392083748a340f68052c0b83a7ce28b10324972 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 12 Jan 2011 02:56:15 -0500 Subject: cpuidle: CPUIDLE_FLAG_CHECK_BM is omap3_idle specific Signed-off-by: Len Brown --- include/linux/cpuidle.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 6be722c..36719ea 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -47,7 +47,6 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ -#define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ #define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) -- cgit v1.1 From f00c9e44ad1a9660fe8cd3ca15b6cd9497172eab Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Sep 2010 17:38:58 +0200 Subject: quota: Fix deadlock during path resolution As Al Viro pointed out path resolution during Q_QUOTAON calls to quotactl is prone to deadlocks. We hold s_umount semaphore for reading during the path resolution and resolution itself may need to acquire the semaphore for writing when e. g. autofs mountpoint is passed. Solve the problem by performing the resolution before we get hold of the superblock (and thus s_umount semaphore). The whole thing is complicated by the fact that some filesystems (OCFS2) ignore the path argument. So to distinguish between filesystem which want the path and which do not we introduce new .quota_on_meta callback which does not get the path. OCFS2 then uses this callback instead of old .quota_on. CC: Al Viro CC: Christoph Hellwig CC: Ted Ts'o CC: Joel Becker Signed-off-by: Jan Kara --- include/linux/quota.h | 5 ++++- include/linux/quotaops.h | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/quota.h b/include/linux/quota.h index 94c1f03..9a85412 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -322,9 +322,12 @@ struct dquot_operations { qsize_t *(*get_reserved_space) (struct inode *); }; +struct path; + /* Operations handling requests from userspace */ struct quotactl_ops { - int (*quota_on)(struct super_block *, int, int, char *); + int (*quota_on)(struct super_block *, int, int, struct path *); + int (*quota_on_meta)(struct super_block *, int, int); int (*quota_off)(struct super_block *, int); int (*quota_sync)(struct super_block *, int, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 223b14c..eb354f6 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -76,11 +76,9 @@ int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); -int dquot_quota_on(struct super_block *sb, int type, int format_id, - char *path); int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags); -int dquot_quota_on_path(struct super_block *sb, int type, int format_id, +int dquot_quota_on(struct super_block *sb, int type, int format_id, struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); -- cgit v1.1 From 2fc72c7b84002ffb3c66918e2a7b0ee607d8b5aa Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 12 Jan 2011 20:25:08 +0100 Subject: netfilter: fix compilation when conntrack is disabled but tproxy is enabled The IPv6 tproxy patches split IPv6 defragmentation off of conntrack, but failed to update the #ifdef stanzas guarding the defragmentation related fields and code in skbuff and conntrack related code in nf_defrag_ipv6.c. This patch adds the required #ifdefs so that IPv6 tproxy can truly be used without connection tracking. Original report: http://marc.info/?l=linux-netdev&m=129010118516341&w=2 Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: KOVACS Krisztian Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 15 +++++++++++++++ include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 10 ---------- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 10 ++++++++++ 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 20ec0a64..bf221d6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -255,6 +255,11 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif +#if defined(CONFIG_NF_DEFRAG_IPV4) || defined(CONFIG_NF_DEFRAG_IPV4_MODULE) || \ + defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE) +#define NET_SKBUFF_NF_DEFRAG_NEEDED 1 +#endif + /** * struct sk_buff - socket buffer * @next: Next buffer in list @@ -362,6 +367,8 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED struct sk_buff *nfct_reasm; #endif #ifdef CONFIG_BRIDGE_NETFILTER @@ -2057,6 +2064,8 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) if (nfct) atomic_inc(&nfct->use); } +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED static inline void nf_conntrack_get_reasm(struct sk_buff *skb) { if (skb) @@ -2085,6 +2094,8 @@ static inline void nf_reset(struct sk_buff *skb) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb->nfct); skb->nfct = NULL; +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED nf_conntrack_put_reasm(skb->nfct_reasm); skb->nfct_reasm = NULL; #endif @@ -2101,6 +2112,8 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) dst->nfct = src->nfct; nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED dst->nfct_reasm = src->nfct_reasm; nf_conntrack_get_reasm(src->nfct_reasm); #endif @@ -2114,6 +2127,8 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED nf_conntrack_put_reasm(dst->nfct_reasm); #endif #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index 1ee717e..a4c9936 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -7,16 +7,6 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; -extern int nf_ct_frag6_init(void); -extern void nf_ct_frag6_cleanup(void); -extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); -extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, - struct net_device *in, - struct net_device *out, - int (*okfn)(struct sk_buff *)); - -struct inet_frags_ctl; - #include extern struct ctl_table nf_ct_ipv6_sysctl_table[]; diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 94dd54d..fd79c9a 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -3,4 +3,14 @@ extern void nf_defrag_ipv6_enable(void); +extern int nf_ct_frag6_init(void); +extern void nf_ct_frag6_cleanup(void); +extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); +extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, + struct net_device *out, + int (*okfn)(struct sk_buff *)); + +struct inet_frags_ctl; + #endif /* _NF_DEFRAG_IPV6_H */ -- cgit v1.1 From 6fed05c9c9812b5882bc708f4da4fa8d5df2875c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jan 2011 22:03:20 +0100 Subject: ACPI / PM: Fix build problems for !CONFIG_ACPI related to NVS rework The recent rework of the NVS saving/restoring code introduced two build issues for !CONFIG_ACPI, a warning in drivers/acpi/internal.h and an error in arch/x86/kernel/e820.c. Fix them by providing suitable static inline definitions of the relevant functions. Signed-off-by: Rafael J. Wysocki Acked-by: Randy Dunlap Signed-off-by: Len Brown --- include/linux/acpi.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fa7ed6a98..eb176bb 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -254,15 +254,6 @@ void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_ACPI_SLEEP -int suspend_nvs_register(unsigned long start, unsigned long size); -#else -static inline int suspend_nvs_register(unsigned long a, unsigned long b) -{ - return 0; -} -#endif - struct acpi_osc_context { char *uuid_str; /* uuid string */ int rev; @@ -361,4 +352,14 @@ static inline int acpi_table_parse(char *id, return -1; } #endif /* !CONFIG_ACPI */ + +#ifdef CONFIG_ACPI_SLEEP +int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + #endif /*_LINUX_ACPI_H*/ -- cgit v1.1 From da995a8aee044bc5d0847e19e351cd48a2cb8bcc Mon Sep 17 00:00:00 2001 From: Aleksey Senin Date: Thu, 2 Dec 2010 11:44:49 +0000 Subject: IB/mlx4: Handle protocol field in multicast table The newest device firmware stores IB vs. Ethernet protocol in two bits in members_count field of multicast group table (0: Infiniband, 1: Ethernet). When changing the QP members count for a multicast group, it important not to reset this information. When calling multicast attach first time, the protocol type should be specified. In this patch we always set it IB, but in the future we will handle Ethernet too. When looking for a QP, the protocol type shoud be checked too. Signed-off-by: Aleksey Senin Signed-off-by: Roland Dreier --- include/linux/mlx4/device.h | 10 ++++++++-- include/linux/mlx4/driver.h | 6 +----- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a7b15bc..0492146 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -144,6 +144,11 @@ enum { MLX4_STAT_RATE_OFFSET = 5 }; +enum mlx4_protocol { + MLX4_PROTOCOL_IB, + MLX4_PROTOCOL_EN, +}; + enum { MLX4_MTT_FLAG_PRESENT = 1 }; @@ -500,8 +505,9 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_mcast_loopback); -int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); + int block_mcast_loopback, enum mlx4_protocol protocol); +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol protocol); int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index); void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index); diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index f407cd4..e1eebf7 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -34,6 +34,7 @@ #define MLX4_DRIVER_H #include +#include struct mlx4_dev; @@ -44,11 +45,6 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_PORT_REINIT, }; -enum mlx4_protocol { - MLX4_PROTOCOL_IB, - MLX4_PROTOCOL_EN, -}; - struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); -- cgit v1.1 From 838b4dc6d8a6bd83a93077ebc6873712c65bf85e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 12 Jan 2011 15:42:32 +0000 Subject: sched: remove unused backlog in RED stats The RED statistics structure includes backlog field which is not set or used by any code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/red.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/red.h b/include/net/red.h index 995108e..3319f16 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -97,7 +97,6 @@ struct red_stats { u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ u32 other; /* Drops due to drop() calls */ - u32 backlog; }; struct red_parms { -- cgit v1.1 From 84c89557a302e18414a011cc52b1abd034860743 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: dm ioctl: allow rename to fill empty uuid Allow the uuid of a mapped device to be set after device creation. Previously the uuid (which is optional) could only be set by DM_DEV_CREATE. If no uuid was supplied it could not be set later. Sometimes it's necessary to create the device before the uuid is known, and in such cases the uuid must be filled in after the creation. This patch extends DM_DEV_RENAME to accept a uuid accompanied by a new flag DM_UUID_FLAG. This can only be done once and if no uuid was previously supplied. It cannot be used to change an existing uuid. DM_VERSION_MINOR is also bumped to 19 to indicate this interface extension is available. Signed-off-by: Peter Jones Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- include/linux/dm-ioctl.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab36..e453e11 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 +#define DM_VERSION_MINOR 19 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit v1.1 From 86a54a4802df10d23ccd655e2083e812fe990243 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: dm log userspace: add version number to comms This patch adds a 'version' field to the 'dm_ulog_request' structure. The 'version' field is taken from a portion of the unused 'padding' field in the 'dm_ulog_request' structure. This was done to avoid changing the size of the structure and possibly disrupting backwards compatibility. The version number will help notify user-space daemons when a change has been made to the kernel/userspace log API. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- include/linux/dm-log-userspace.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2..eeace7d 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ -- cgit v1.1 From 9c4376de98719d2768dd919553843de34bb094a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:58 +0000 Subject: dm: use non reentrant workqueues if equivalent kmirrord_wq, kcopyd_work and md->wq are created per dm instance and serve only a single work item from the dm instance, so non-reentrant workqueues would provide the same ordering guarantees as ordered ones while allowing CPU affinity and use of the workqueues for other purposes. Switch them to non-reentrant workqueues. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- include/linux/dm-ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index e453e11..78bbf47 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -268,8 +268,8 @@ enum { #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.1 From 9d357b0787bb3c91835d5e658c3bda178f9ca419 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: dm: introduce target callbacks and congestion callback DM currently implements congestion checking by checking on congestion in each component device. For raid456 we need to also check if the stripe cache is congested. Add per-target congestion checker callback support. Extending the target_callbacks structure with additional callback functions allows for establishing multiple callbacks per-target (a callback is also needed for unplug). Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- include/linux/device-mapper.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022..4b1c63d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,12 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -269,6 +275,11 @@ int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); /* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + +/* * Finally call this to make the table ready for use. */ int dm_table_complete(struct dm_table *t); -- cgit v1.1 From 99d03c141b40914b67d63c9d23b8da4386422ed7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: dm: per target unplug callback support Add per-target unplug callback support. Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- include/linux/device-mapper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4b1c63d..272496d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -197,6 +197,7 @@ struct dm_target { struct dm_target_callbacks { struct list_head list; int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); }; int dm_register_target(struct target_type *t); -- cgit v1.1 From 8d661f1e462d50bd83de87ee628aaf820ce3c66c Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Tue, 11 Jan 2011 16:14:24 -0800 Subject: ieee80211: correct IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK macro It is defined in include/linux/ieee80211.h. As per IEEE spec. bit6 to bit15 in block ack parameter represents buffer size. So the bitmask should be 0xFFC0. Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao Cc: stable@kernel.org Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6042228..294169e 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -959,7 +959,7 @@ struct ieee80211_ht_info { /* block-ack parameters */ #define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 #define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C -#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFA0 +#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0 #define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 #define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 -- cgit v1.1 From d8a3515e2a9523f8ed56d1f4537d16338bda2bc2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Jan 2011 17:26:46 -0800 Subject: Revert "gpiolib: annotate gpio-intialization with __must_check" This reverts commit 0fdae42d361bbb431ca0ab0efed5126a94821177, which wasn't really supposed to go in, and causes lots of annoying warnings. Quoth Andrew: "Complete brainfart - I meant to drop that patch ages ago." Quoth Greg: "Ick, yeah, that patch isn't ok to go in as-is, all of the callers need to be fixed up first, which is what I thought we had agreed on..." Reported-by: Stephen Rothwell Acked-by: Andrew Morton Acked-by: Greg KH Signed-off-by: Linus Torvalds --- include/asm-generic/gpio.h | 10 +++++----- include/linux/gpio.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 6098cae..ff5c660 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. */ -extern int __must_check gpio_request(unsigned gpio, const char *label); +extern int gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int __must_check gpio_direction_input(unsigned gpio); -extern int __must_check gpio_direction_output(unsigned gpio, int value); +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int __must_check gpio_request_array(struct gpio *array, size_t num); +extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f79d67f..4b47ed9 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int __must_check gpio_request(unsigned gpio, const char *label) +static inline int gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num) WARN_ON(1); } -static inline int __must_check gpio_direction_input(unsigned gpio) +static inline int gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int __must_check gpio_direction_output(unsigned gpio, int value) +static inline int gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } -- cgit v1.1 From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2011 15:45:38 -0800 Subject: irq: use per_cpu kstat_irqs Use modern per_cpu API to increment {soft|hard}irq counters, and use per_cpu allocation for (struct irq_desc)->kstats_irq instead of an array. This gives better SMP/NUMA locality and saves few instructions per irq. With small nr_cpuids values (8 for example), kstats_irq was a small array (less than L1_CACHE_BYTES), potentially source of false sharing. In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly kstat_irqs_all[NR_IRQS][NR_CPUS] array. Note: we still populate kstats_irq for all possible irqs in early_irq_init(). We probably could use on-demand allocations. (Code included in alloc_descs()). Problem is not all IRQS are used with a prior alloc_descs() call. kstat_irqs_this_cpu() is not used anymore, remove it. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Andi Kleen Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irqdesc.h | 2 +- include/linux/kernel_stat.h | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68c..6a64c6f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba..0cce2db 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) -- cgit v1.1 From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e2321..0740253 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ -- cgit v1.1 From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:41 -0800 Subject: mm: page allocator: adjust the per-cpu counter threshold when memory is low Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when a threshold is above a threshold. On large CPU systems, the difference between the estimate and real value of NR_FREE_PAGES can be very high. The system can get into a case where pages are allocated far below the min watermark potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortately, as reported by Shaohua Li this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty in heavy memory pressure by a factor that depends on the workload and the machine but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps but the event is not expected to be frequent - in Shaohua's test case, there was one recorded sleep and wake event at least. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd, when deciding whether it is really safe to go back to sleep in sleeping_prematurely() and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function but limiting how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced. The following report is on the percentage of time spent cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date). Signed-off-by: Mel Gorman Reported-by: Shaohua Li Reviewed-by: Christoph Lameter Tested-by: Nicolas Bareil Cc: David Rientjes Cc: Kyle McMartin Cc: [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 +++------- include/linux/vmstat.h | 5 +++++ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24eb..4890662 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -661,7 +655,9 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37..e4cc21c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); +void restore_pgdat_percpu_threshold(pg_data_t *pgdat); #else /* CONFIG_SMP */ /* @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif -- cgit v1.1 From b44129b30652c8771db2265939bb8b463724043d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:43 -0800 Subject: mm: vmstat: use a single setter function and callback for adjusting percpu thresholds reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors due to counter drift. The functions duplicate some code so this patch replaces them with a single set_pgdat_percpu_threshold() that takes a callback function to calculate the desired threshold as a parameter. [akpm@linux-foundation.org: readability tweak] [kosaki.motohiro@jp.fujitsu.com: set_pgdat_percpu_threshold(): don't use for_each_online_cpu] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4cc21c..833e676 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); -void restore_pgdat_percpu_threshold(pg_data_t *pgdat); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state -static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } -static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } +#define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } #endif -- cgit v1.1 From 71927e84e0aebfbe5a91565c3b207af25a4e9162 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 13 Jan 2011 15:45:46 -0800 Subject: writeback: trace wakeup event for background writeback This tracks when balance_dirty_pages() tries to wakeup the flusher thread for background writeback (if it was not started already). Suggested-by: Christoph Hellwig Signed-off-by: Wu Fengguang Cc: Jan Kara Cc: Johannes Weiner Cc: Dave Chinner Cc: Jan Engelhardt Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 89a2b2d..4e249b9 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \ TP_ARGS(bdi)) DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_wake_background); DEFINE_WRITEBACK_EVENT(writeback_wake_thread); DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); -- cgit v1.1 From b7aba6984dc048503b69c2a885098cdd430832bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:54 -0800 Subject: mm: compaction: add trace events for memory compaction activity In preparation for a patches promoting the use of memory compaction over lumpy reclaim, this patch adds trace points for memory compaction activity. Using them, we can monitor the scanning activity of the migration and free page scanners as well as the number and success rates of pages passed to page migration. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 74 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 include/trace/events/compaction.h (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h new file mode 100644 index 0000000..388bcdd --- /dev/null +++ b/include/trace/events/compaction.h @@ -0,0 +1,74 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM compaction + +#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_COMPACTION_H + +#include +#include +#include "gfpflags.h" + +DECLARE_EVENT_CLASS(mm_compaction_isolate_template, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken), + + TP_STRUCT__entry( + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_taken) + ), + + TP_fast_assign( + __entry->nr_scanned = nr_scanned; + __entry->nr_taken = nr_taken; + ), + + TP_printk("nr_scanned=%lu nr_taken=%lu", + __entry->nr_scanned, + __entry->nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +TRACE_EVENT(mm_compaction_migratepages, + + TP_PROTO(unsigned long nr_migrated, + unsigned long nr_failed), + + TP_ARGS(nr_migrated, nr_failed), + + TP_STRUCT__entry( + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_failed) + ), + + TP_fast_assign( + __entry->nr_migrated = nr_migrated; + __entry->nr_failed = nr_failed; + ), + + TP_printk("nr_migrated=%lu nr_failed=%lu", + __entry->nr_migrated, + __entry->nr_failed) +); + + +#endif /* _TRACE_COMPACTION_H */ + +/* This part must be outside protection */ +#include -- cgit v1.1 From ee64fc9354e515a79c7232cfde65c88ec627308b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:55 -0800 Subject: mm: vmscan: convert lumpy_mode into a bitmask Currently lumpy_mode is an enum and determines if lumpy reclaim is off, syncronous or asyncronous. In preparation for using compaction instead of lumpy reclaim, this patch converts the flags into a bitmap. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c255fcc..be76429 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, -- cgit v1.1 From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:56 -0800 Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults. Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim whe compaction is available called reclaim/compaction. The basic operation is very simple - instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and then compaction is later by either kswapd (compact_zone_order()) or direct compaction (__alloc_pages_direct_compact()). [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: use conventional task_struct naming] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 ++++++++++++++ include/linux/kernel.h | 7 +++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac5155..2592883 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask) +{ + return 0; +} + static inline void defer_compaction(struct zone *zone) { } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac70..5a9d905 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD -- cgit v1.1 From 77f1fe6b08b13a87391549c8a820ddc817b6f50e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:57 -0800 Subject: mm: migration: allow migration to operate asynchronously and avoid synchronous compaction in the faster path Migration synchronously waits for writeback if the initial passes fails. Callers of memory compaction do not necessarily want this behaviour if the caller is latency sensitive or expects that synchronous migration is not going to have a significantly better success rate. This patch adds a sync parameter to migrate_pages() allowing the caller to indicate if wait_on_page_writeback() is allowed within migration or not. For reclaim/compaction, try_to_compact_pages() is first called asynchronously, direct reclaim runs and then try_to_compact_pages() is called synchronously as there is a greater expectation that it'll succeed. [akpm@linux-foundation.org: build/merge fix] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 ++++++---- include/linux/migrate.h | 12 ++++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 2592883..72cba40 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,10 +21,11 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, bool sync); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -57,7 +58,8 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { return COMPACT_CONTINUE; } @@ -68,7 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, bool sync) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527f..fa31902 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } -- cgit v1.1 From 7f0f24967b0349798803260b2e4bf347cffa1990 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:58 -0800 Subject: mm: migration: cleanup migrate_pages API by matching types for offlining and sync With the introduction of the boolean sync parameter, the API looks a little inconsistent as offlining is still an int. Convert offlining to a bool for the sake of being tidy. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa31902..e39aeec 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,10 +13,10 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int fail_migrate_page(struct address_space *, @@ -35,10 +35,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } -- cgit v1.1 From f3a310bc4e5ce7e55e1c8e25c31e63af017f3e50 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: vmscan: rename lumpy_mode to reclaim_mode With compaction being used instead of lumpy reclaim, the name lumpy_mode and associated variables is a bit misleading. Rename lumpy_mode to reclaim_mode which is a better fit. There is no functional change. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index be76429..ea422aa 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, -- cgit v1.1 From e5a5623b28198aa91ea71ee5d3846757fc76bc87 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: remove unused get_vm_area_node get_vm_area_node() is unused in the kernel and can thus be removed. Signed-off-by: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f6..cb73c755 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -90,9 +90,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, -- cgit v1.1 From ec3f64fc9c196a304c4b7db3e1ff56d640628509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:01 -0800 Subject: mm: remove gfp mask from pcpu_get_vm_areas pcpu_get_vm_areas() only uses GFP_KERNEL allocations, so remove the gfp_t formal and use the mask internally. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index cb73c755..c7348b8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -117,7 +117,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif -- cgit v1.1 From d0a21265dfb5fa8ae54e90d0fb6d1c215b10a28a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:02 -0800 Subject: mm: unify module_alloc code for vmalloc Four architectures (arm, mips, sparc, x86) use __vmalloc_area() for module_init(). Much of the code is duplicated and can be generalized in a globally accessible function, __vmalloc_node_range(). __vmalloc_node() now calls into __vmalloc_node_range() with a range of [VMALLOC_START, VMALLOC_END) for functionally equivalent behavior. Each architecture may then use __vmalloc_node_range() directly to remove the duplication of code. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Russell King Cc: Ralf Baechle Cc: "David S. Miller" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c7348b8..4ed6fcd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, -- cgit v1.1 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0740253..f23b5bb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -634,6 +634,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations -- cgit v1.1 From 212260aa07135b327752dc02625c68cf4ce04caf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:46:06 -0800 Subject: mm: clear PageError bit in msync & fsync Temporary IO failures, eg. due to loss of both multipath paths, can permanently leave the PageError bit set on a page, resulting in msync or fsync returning -EIO over and over again, even if IO is now getting to the disk correctly. We already clear the AS_ENOSPC and AS_IO bits in mapping->flags in the filemap_fdatawait_range function. Also clearing the PageError bit on the page allows subsequent msync or fsync calls on this file to return without an error, if the subsequent IO succeeds. Unfortunately data written out in the msync or fsync call that returned -EIO can still get lost, because the page dirty bit appears to not get restored on IO error. However, the alternative could be potentially all of memory filling up with uncleanable dirty pages, hanging the system, so there is no nice choice here... Signed-off-by: Rik van Riel Acked-by: Valerie Aurora Acked-by: Jeff Layton Cc: Theodore Ts'o Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c46..4805fde 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,7 +198,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) -- cgit v1.1 From 110d74a921f4d272b47ef6104fcf937df808f4c8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:11 -0800 Subject: mm: add FOLL_MLOCK follow_page flag. Move the code to mlock pages from __mlock_vma_pages_range() to follow_page(). This allows __mlock_vma_pages_range() to not have to break down work into 16-page batches. An additional motivation for doing this within the present patch series is that it'll make it easier for a later chagne to drop mmap_sem when blocking on disk (we'd like to be able to resume at the page that was read from disk instead of at the start of a 16-page batch). Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451..9ade803 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1415,6 +1415,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); -- cgit v1.1 From 088e54658f559a13c2b988086512d76fe9e8f846 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:16 -0800 Subject: mm: remove likely() from mapping_unevictable() The mapping_unevictable() has a likely() around the mapping parameter. This mapping parameter comes from page_mapping() which has an unlikely() that the page will be set as PAGE_MAPPING_ANON, and if so, it will return NULL. One would think that this unlikely() means that the mapping returned by page_mapping() would not be NULL, but where page_mapping() is used just above mapping_unevictable(), that unlikely() is incorrect most of the time. This means that the "likely(mapping)" in mapping_unevictable() is incorrect most of the time. Running the annotated branch profiler on my main box which runs firefox, evolution, xchat and is part of my distcc farm, I had this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 12872836 1269443893 98 mapping_unevictable pagemap.h 51 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The mapping_unevictable() is also a static inline but seems to be used only once in my setup. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. Perhaps this is enough to remove the unlikely from page_mapping() as well. Signed-off-by: Steven Rostedt Reviewed-by: KOSAKI Motohiro Acked-by: Nick Piggin Acked-by: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2d1ffe3..9c66e99 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) static inline int mapping_unevictable(struct address_space *mapping) { - if (likely(mapping)) + if (mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -- cgit v1.1 From e20e87795834f2f14cb53baf657b91d9c39f92c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:17 -0800 Subject: mm: remove unlikely() from page_mapping() page_mapping() has a unlikely that the mapping has PAGE_MAPPING_ANON set. But running the annotated branch profiler on a normal desktop system doing vairous tasks (xchat, evolution, firefox, distcc), it is not really that unlikely that the mapping here will have the PAGE_MAPPING_ANON flag set: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. With this much of an error, it's best to simply remove the unlikely and have the compiler and branch prediction figure this out. Signed-off-by: Steven Rostedt Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ade803..57e11b2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,7 +657,7 @@ static inline struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); if (unlikely(PageSwapCache(page))) mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) + else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } -- cgit v1.1 From 9950474883e027e6e728cbcff25f7f2bf0c96530 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:20 -0800 Subject: mm: kswapd: stop high-order balancing when any suitable zone is balanced Simon Kirby reported the following problem We're seeing cases on a number of servers where cache never fully grows to use all available memory. Sometimes we see servers with 4 GB of memory that never seem to have less than 1.5 GB free, even with a constantly-active VM. In some cases, these servers also swap out while this happens, even though they are constantly reading the working set into memory. We have been seeing this happening for a long time; I don't think it's anything recent, and it still happens on 2.6.36. After some debugging work by Simon, Dave Hansen and others, the prevaling theory became that kswapd is reclaiming order-3 pages requested by SLUB too aggressive about it. There are two apparent problems here. On the target machine, there is a small Normal zone in comparison to DMA32. As kswapd tries to balance all zones, it would continually try reclaiming for Normal even though DMA32 was balanced enough for callers. The second problem is that sleeping_prematurely() does not use the same logic as balance_pgdat() when deciding whether to sleep or not. This keeps kswapd artifically awake. A number of tests were run and the figures from previous postings will look very different for a few reasons. One, the old figures were forcing my network card to use GFP_ATOMIC in attempt to replicate Simon's problem. Second, I previous specified slub_min_order=3 again in an attempt to reproduce Simon's problem. In this posting, I'm depending on Simon to say whether his problem is fixed or not and these figures are to show the impact to the ordinary cases. Finally, the "vmscan" figures are taken from /proc/vmstat instead of the tracepoints. There is less information but recording is less disruptive. The first test of relevance was postmark with a process running in the background reading a large amount of anonymous memory in blocks. The objective was to vaguely simulate what was happening on Simon's machine and it's memory intensive enough to have kswapd awake. POSTMARK traceonly kanyzone Transactions per second: 156.00 ( 0.00%) 153.00 (-1.96%) Data megabytes read per second: 21.51 ( 0.00%) 21.52 ( 0.05%) Data megabytes written per second: 29.28 ( 0.00%) 29.11 (-0.58%) Files created alone per second: 250.00 ( 0.00%) 416.00 (39.90%) Files create/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) Files deleted alone per second: 520.00 ( 0.00%) 420.00 (-23.81%) Files delete/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 16.58 17.4 Total Elapsed Time (seconds) 218.48 222.47 VMstat Reclaim Statistics: vmscan Direct reclaims 0 4 Direct reclaim pages scanned 0 203 Direct reclaim pages reclaimed 0 184 Kswapd pages scanned 326631 322018 Kswapd pages reclaimed 312632 309784 Kswapd low wmark quickly 1 4 Kswapd high wmark quickly 122 475 Kswapd skip congestion_wait 1 0 Pages activated 700040 705317 Pages deactivated 212113 203922 Pages written 9875 6363 Total pages scanned 326631 322221 Total pages reclaimed 312632 309968 %age total pages scanned/reclaimed 95.71% 96.20% %age total pages scanned/written 3.02% 1.97% proc vmstat: Faults Major Faults 300 254 Minor Faults 645183 660284 Page ins 493588 486704 Page outs 4960088 4986704 Swap ins 1230 661 Swap outs 9869 6355 Performance is mildly affected because kswapd is no longer doing as much work and the background memory consumer process is getting in the way. Note that kswapd scanned and reclaimed fewer pages as it's less aggressive and overall fewer pages were scanned and reclaimed. Swap in/out is particularly reduced again reflecting kswapd throwing out fewer pages. The slight performance impact is unfortunate here but it looks like a direct result of kswapd being less aggressive. As the bug report is about too many pages being freed by kswapd, it may have to be accepted for now. The second test is a streaming IO benchmark that was previously used by Johannes to show regressions in page reclaim. MICRO traceonly kanyzone User/Sys Time Running Test (seconds) 29.29 28.87 Total Elapsed Time (seconds) 492.18 488.79 VMstat Reclaim Statistics: vmscan Direct reclaims 2128 1460 Direct reclaim pages scanned 2284822 1496067 Direct reclaim pages reclaimed 148919 110937 Kswapd pages scanned 15450014 16202876 Kswapd pages reclaimed 8503697 8537897 Kswapd low wmark quickly 3100 3397 Kswapd high wmark quickly 1860 7243 Kswapd skip congestion_wait 708 801 Pages activated 9635 9573 Pages deactivated 1432 1271 Pages written 223 1130 Total pages scanned 17734836 17698943 Total pages reclaimed 8652616 8648834 %age total pages scanned/reclaimed 48.79% 48.87% %age total pages scanned/written 0.00% 0.01% proc vmstat: Faults Major Faults 165 221 Minor Faults 9655785 9656506 Page ins 3880 7228 Page outs 37692940 37480076 Swap ins 0 69 Swap outs 19 15 Again fewer pages are scanned and reclaimed as expected and this time the test completed faster. Note that kswapd is hitting its watermarks faster (low and high wmark quickly) which I expect is due to kswapd reclaiming fewer pages. I also ran fs-mark, iozone and sysbench but there is nothing interesting to report in the figures. Performance is not significantly changed and the reclaim statistics look reasonable. Tgis patch: When the allocator enters its slow path, kswapd is woken up to balance the node. It continues working until all zones within the node are balanced. For order-0 allocations, this makes perfect sense but for higher orders it can have unintended side-effects. If the zone sizes are imbalanced, kswapd may reclaim heavily within a smaller zone discarding an excessive number of pages. The user-visible behaviour is that kswapd is awake and reclaiming even though plenty of pages are free from a suitable zone. This patch alters the "balance" logic for high-order reclaim allowing kswapd to stop if any suitable zone becomes balanced to reduce the number of pages it reclaims from other zones. kswapd still tries to ensure that order-0 watermarks for all zones are met before sleeping. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4890662..dad3612 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,6 +639,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -654,7 +655,7 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, -- cgit v1.1 From a826e422420b461a6247137c292ff83c4800354a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:31 -0800 Subject: thp: mm: define MADV_HUGEPAGE Define MADV_HUGEPAGE. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Arnd Bergmann Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/mman-common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 3da9e27..e91392f 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -45,6 +45,8 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 -- cgit v1.1 From e9da73d67729b58bba256123e2b4651e0d8a01ac Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: compound_lock Add a new compound_lock() needed to serialize put_page against __split_huge_page_refcount(). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 12 +++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57e11b2..3b1754a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -305,6 +306,39 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +static inline void compound_lock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_lock(PG_compound_lock, &page->flags); +#endif +} + +static inline void compound_unlock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_unlock(PG_compound_lock, &page->flags); +#endif +} + +static inline unsigned long compound_lock_irqsave(struct page *page) +{ + unsigned long uninitialized_var(flags); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + local_irq_save(flags); + compound_lock(page); +#endif + return flags; +} + +static inline void compound_unlock_irqrestore(struct page *page, + unsigned long flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + compound_unlock(page); + local_irq_restore(flags); +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4805fde..5a743cc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -108,6 +108,9 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + PG_compound_lock, +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -397,6 +400,12 @@ static inline void __ClearPageTail(struct page *page) #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) +#else +#define __PG_COMPOUND_LOCK 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -406,7 +415,8 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. -- cgit v1.1 From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: alter compound get_page/put_page Alter compound get_page/put_page to keep references on subpages too, in order to allow __split_huge_page_refcount to split an hugepage even while subpages have been pinned by one of the get_user_pages() variants. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b1754a..a2718e1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -353,9 +353,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) -- cgit v1.1 From 14fd403f2146f740942d78af4e0ee59396ad8eab Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:37 -0800 Subject: thp: export maybe_mkwrite huge_memory.c needs it too when it fallbacks in copying hugepages into regular fragmented pages if hugepage allocation fails during COW. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2718e1..6bef67d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,6 +430,19 @@ static inline void set_compound_order(struct page *page, unsigned long order) } /* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have -- cgit v1.1 From 5f6e8da70a289d403975907371ce5738c726ad3f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:40 -0800 Subject: thp: special pmd_trans_* functions These returns 0 at compile time when the config option is disabled, to allow gcc to eliminate the transparent hugepage function calls at compile time without additional #ifdefs (only the export of those functions have to be visible to gcc but they won't be required at link time and huge_memory.o can be not built at all). _PAGE_BIT_UNUSED1 is never used for pmd, only on pte. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6f3c6ae..0ab2cd2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -348,6 +348,17 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); #endif +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + return 0; +} +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return 0; +} +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ -- cgit v1.1 From e2cda322648122dc400c85ada80eaddbc612ef6a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:40 -0800 Subject: thp: add pmd mangling generic functions Some are needed to build but not actually used on archs not supporting transparent hugepages. Others like pmdp_clear_flush are used by x86 too. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 214 ++++++++++++++++++++++++++++++------------ 1 file changed, 154 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 0ab2cd2..f1eddf7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,67 +5,108 @@ #ifdef CONFIG_MMU #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -/* - * Largely same as above, but only sets the access flags (dirty, - * accessed, and writable). Furthermore, we know it always gets set - * to a "more permissive" setting, which allows most architectures - * to optimize this. We return whether the PTE actually changed, which - * in turn instructs the caller to do things like update__mmu_cache. - * This used to be done in the caller, but sparc needs minor faults to - * force that call on sun4c so we changed this macro slightly - */ -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ -({ \ - int __changed = !pte_same(*(__ptep), __entry); \ - if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ - flush_tlb_page(__vma, __address); \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - int r = 1; \ - if (!pte_young(__pte)) \ - r = 0; \ - else \ - set_pte_at((__vma)->vm_mm, (__address), \ - (__ptep), pte_mkold(__pte)); \ - r; \ -}) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + int r = 1; + if (!pte_young(pte)) + r = 0; + else + set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); + return r; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + int r = 1; + if (!pmd_young(pmd)) + r = 0; + else + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); + return r; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young; \ - __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ - if (__young) \ - flush_tlb_page(__vma, __address); \ - __young; \ -}) +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define ptep_get_and_clear(__mm, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - pte_clear((__mm), (__address), (__ptep)); \ - __pte; \ +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + pte_clear(mm, address, ptep); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + pmd_clear(mm, address, pmdp); + return pmd; }) +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return __pmd(0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ - __pte; \ -}) +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + int full) +{ + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + return pte; +} #endif /* @@ -74,20 +115,25 @@ * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL -#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ -do { \ - pte_clear((__mm), (__address), (__ptep)); \ -} while (0) +static inline void pte_clear_not_present_full(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, + int full) +{ + pte_clear(mm, address, ptep); +} #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ - flush_tlb_page(__vma, __address); \ - __pte; \ -}) +extern pte_t ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t old_pmd = *pmdp; + set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (pte_val(A) == pte_val(B)) +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + return pte_val(pte_a) == pte_val(pte_b); +} +#endif + +#ifndef __HAVE_ARCH_PMD_SAME +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return pmd_val(pmd_a) == pmd_val(pmd_b); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PAGE_TEST_DIRTY @@ -357,6 +444,13 @@ static inline int pmd_trans_splitting(pmd_t pmd) { return 0; } +#ifndef __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + BUG(); + return 0; +} +#endif /* __HAVE_ARCH_PMD_WRITE */ #endif #endif /* !__ASSEMBLY__ */ -- cgit v1.1 From 8ac1f8320a0073f28cf9e0491af4cd98f504f92a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:43 -0800 Subject: thp: pte alloc trans splitting pte alloc routines must wait for split_huge_page if the pmd is not present and not null (i.e. pmd_trans_splitting). The additional branches are optimized away at compile time by pmd_trans_splitting if the config option is off. However we must pass the vma down in order to know the anon_vma lock to wait for. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bef67d..14ddd98 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,7 +1131,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1200,16 +1201,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); -- cgit v1.1 From 91a4ee2670e0ee2b0630a0eb24fb9a87ea3acd0a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: add pmd mmu_notifier helpers Add mmu notifier helpers to handle pmd huge operations. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 43dcfbd..cbfab1e 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -243,6 +243,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __pte; \ }) +#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ +({ \ + pmd_t __pmd; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ + __pmd; \ +}) + +#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ +({ \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + pmdp_splitting_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ +}) + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -254,6 +280,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ @@ -305,7 +342,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) } #define ptep_clear_flush_young_notify ptep_clear_flush_young +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define pmdp_clear_flush_notify pmdp_clear_flush +#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.1 From 4e6af67e970a2ac287739a4c526c857b5bda27ec Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: clear page compound split_huge_page must transform a compound page to a regular page and needs ClearPageCompound. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Reviewed-by: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5a743cc..4835cae 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -347,7 +347,7 @@ static inline void set_page_writeback(struct page *page) * tests can be used in performance sensitive paths. PageCompound is * generally not used in hot code paths. */ -__PAGEFLAG(Head, head) +__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) __PAGEFLAG(Tail, tail) static inline int PageCompound(struct page *page) @@ -355,6 +355,13 @@ static inline int PageCompound(struct page *page) return page->flags & ((1L << PG_head) | (1L << PG_tail)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON(!PageHead(page)); + ClearPageHead(page); +} +#endif #else /* * Reduce page flag use as much as possible by overlapping @@ -392,6 +399,14 @@ static inline void __ClearPageTail(struct page *page) page->flags &= ~PG_head_tail_mask; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); + clear_bit(PG_compound, &page->flags); +} +#endif + #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_MMU -- cgit v1.1 From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increase the size of the mm struct a bit but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page to avoid decrasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if something we need it to learn handling pmd_trans_huge natively rather being capable of rollback). When split_huge_page runs a pte is needed to succeed the split, to map the newly splitted regular pages with a regular pte. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one liner. The memory waste of those preallocated ptes is negligible and so it is worth it. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a..26bc4e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -310,6 +310,9 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ +#endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; }; -- cgit v1.1 From 47ad8475c000141eacb3ecda5e5ce4b43a9cd04d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: clear_copy_huge_page Move the copy/clear_huge_page functions to common code to share between hugetlb.c and huge_memory.c. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 14ddd98..cc6ab10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1589,5 +1589,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.1 From 936a5fe6e6148c0b3ea0d792b903847d9b9931a1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:48 -0800 Subject: thp: kvm mmu transparent hugepage support This should work for both hugetlbfs and transparent hugepages. [akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability] Signed-off-by: Andrea Arcangeli Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4835cae..907f160 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int PageTransCompound(struct page *page) +{ + return PageCompound(page); +} +#else +static inline int PageTransCompound(struct page *page) +{ + return 0; +} +#endif + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else -- cgit v1.1 From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: _GFP_NO_KSWAPD Transparent hugepage allocations must be allowed not to invoke kswapd or any other kind of indirect reclaim (especially when the defrag sysfs is control disabled). It's unacceptable to swap out anonymous pages (potentially anonymous transparent hugepages) in order to create new transparent hugepages. This is true for the MADV_HUGEPAGE areas too (swapping out a kvm virtual machine and so having it suffer an unbearable slowdown, so another one with guest physical memory marked MADV_HUGEPAGE can run 30% faster if it is running memory intensive workloads, makes no sense). If a transparent hugepage allocation fails the slowdown is minor and there is total fallback, so kswapd should never be asked to swapout memory to allow the high order allocation to succeed. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfc..49d2356 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. @@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ -- cgit v1.1 From 71e3aac0724ffe8918992d76acfe3aad7d8724a5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core Lately I've been working to make KVM use hugepages transparently without the usual restrictions of hugetlbfs. Some of the restrictions I'd like to see removed: 1) hugepages have to be swappable or the guest physical memory remains locked in RAM and can't be paged out to swap 2) if a hugepage allocation fails, regular pages should be allocated instead and mixed in the same vma without any failure and without userland noticing 3) if some task quits and more hugepages become available in the buddy, guest physical memory backed by regular pages should be relocated on hugepages automatically in regions under madvise(MADV_HUGEPAGE) (ideally event driven by waking up the kernel deamon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes not null) 4) avoidance of reservation and maximization of use of hugepages whenever possible. Reservation (needed to avoid runtime fatal faliures) may be ok for 1 machine with 1 database with 1 database cache with 1 database cache size known at boot time. It's definitely not feasible with a virtualization hypervisor usage like RHEV-H that runs an unknown number of virtual machines with an unknown size of each virtual machine with an unknown amount of pagecache that could be potentially useful in the host for guest not using O_DIRECT (aka cache=off). hugepages in the virtualization hypervisor (and also in the guest!) are much more important than in a regular host not using virtualization, becasue with NPT/EPT they decrease the tlb-miss cacheline accesses from 24 to 19 in case only the hypervisor uses transparent hugepages, and they decrease the tlb-miss cacheline accesses from 19 to 15 in case both the linux hypervisor and the linux guest both uses this patch (though the guest will limit the addition speedup to anonymous regions only for now...). Even more important is that the tlb miss handler is much slower on a NPT/EPT guest than for a regular shadow paging or no-virtualization scenario. So maximizing the amount of virtual memory cached by the TLB pays off significantly more with NPT/EPT than without (even if there would be no significant speedup in the tlb-miss runtime). The first (and more tedious) part of this work requires allowing the VM to handle anonymous hugepages mixed with regular pages transparently on regular anonymous vmas. This is what this patch tries to achieve in the least intrusive possible way. We want hugepages and hugetlb to be used in a way so that all applications can benefit without changes (as usual we leverage the KVM virtualization design: by improving the Linux VM at large, KVM gets the performance boost too). The most important design choice is: always fallback to 4k allocation if the hugepage allocation fails! This is the _very_ opposite of some large pagecache patches that failed with -EIO back then if a 64k (or similar) allocation failed... Second important decision (to reduce the impact of the feature on the existing pagetable handling code) is that at any time we can split an hugepage into 512 regular pages and it has to be done with an operation that can't fail. This way the reliability of the swapping isn't decreased (no need to allocate memory when we are short on memory to swap) and it's trivial to plug a split_huge_page* one-liner where needed without polluting the VM. Over time we can teach mprotect, mremap and friends to handle pmd_trans_huge natively without calling split_huge_page*. The fact it can't fail isn't just for swap: if split_huge_page would return -ENOMEM (instead of the current void) we'd need to rollback the mprotect from the middle of it (ideally including undoing the split_vma) which would be a big change and in the very wrong direction (it'd likely be simpler not to call split_huge_page at all and to teach mprotect and friends to handle hugepages instead of rolling them back from the middle). In short the very value of split_huge_page is that it can't fail. The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and incremental and it'll just be an "harmless" addition later if this initial part is agreed upon. It also should be noted that locking-wise replacing regular pages with hugepages is going to be very easy if compared to what I'm doing below in split_huge_page, as it will only happen when page_count(page) matches page_mapcount(page) if we can take the PG_lock and mmap_sem in write mode. collapse_huge_page will be a "best effort" that (unlike split_huge_page) can fail at the minimal sign of trouble and we can try again later. collapse_huge_page will be similar to how KSM works and the madvise(MADV_HUGEPAGE) will work similar to madvise(MADV_MERGEABLE). The default I like is that transparent hugepages are used at page fault time. This can be changed with /sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set to three values "always", "madvise", "never" which mean respectively that hugepages are always used, or only inside madvise(MADV_HUGEPAGE) regions, or never used. /sys/kernel/mm/transparent_hugepage/defrag instead controls if the hugepage allocation should defrag memory aggressively "always", only inside "madvise" regions, or "never". The pmd_trans_splitting/pmd_trans_huge locking is very solid. The put_page (from get_user_page users that can't use mmu notifier like O_DIRECT) that runs against a __split_huge_page_refcount instead was a pain to serialize in a way that would result always in a coherent page count for both tail and head. I think my locking solution with a compound_lock taken only after the page_first is valid and is still a PageHead should be safe but it surely needs review from SMP race point of view. In short there is no current existing way to serialize the O_DIRECT final put_page against split_huge_page_refcount so I had to invent a new one (O_DIRECT loses knowledge on the mapping status by the time gup_fast returns so...). And I didn't want to impact all gup/gup_fast users for now, maybe if we change the gup interface substantially we can avoid this locking, I admit I didn't think too much about it because changing the gup unpinning interface would be invasive. If we ignored O_DIRECT we could stick to the existing compound refcounting code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM (and any other mmu notifier user) would call it without FOLL_GET (and if FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the current task mmu notifier list yet). But O_DIRECT is fundamental for decent performance of virtualized I/O on fast storage so we can't avoid it to solve the race of put_page against split_huge_page_refcount to achieve a complete hugepage feature for KVM. Swap and oom works fine (well just like with regular pages ;). MMU notifier is handled transparently too, with the exception of the young bit on the pmd, that didn't have a range check but I think KVM will be fine because the whole point of hugepages is that EPT/NPT will also use a huge pmd when they notice gup returns pages with PageCompound set, so they won't care of a range and there's just the pmd young bit to check in that case. NOTE: in some cases if the L2 cache is small, this may slowdown and waste memory during COWs because 4M of memory are accessed in a single fault instead of 8k (the payoff is that after COW the program can run faster). So we might want to switch the copy_huge_page (and clear_huge_page too) to not temporal stores. I also extensively researched ways to avoid this cache trashing with a full prefault logic that would cow in 8k/16k/32k/64k up to 1M (I can send those patches that fully implemented prefault) but I concluded they're not worth it and they add an huge additional complexity and they remove all tlb benefits until the full hugepage has been faulted in, to save a little bit of memory and some cache during app startup, but they still don't improve substantially the cache-trashing during startup if the prefault happens in >4k chunks. One reason is that those 4k pte entries copied are still mapped on a perfectly cache-colored hugepage, so the trashing is the worst one can generate in those copies (cow of 4k page copies aren't so well colored so they trashes less, but again this results in software running faster after the page fault). Those prefault patches allowed things like a pte where post-cow pages were local 4k regular anon pages and the not-yet-cowed pte entries were pointing in the middle of some hugepage mapped read-only. If it doesn't payoff substantially with todays hardware it will payoff even less in the future with larger l2 caches, and the prefault logic would blot the VM a lot. If one is emebdded transparent_hugepage can be disabled during boot with sysfs or with the boot commandline parameter transparent_hugepage=0 (or transparent_hugepage=2 to restrict hugepages inside madvise regions) that will ensure not a single hugepage is allocated at boot time. It is simple enough to just disable transparent hugepage globally and let transparent hugepages be allocated selectively by applications in the MADV_HUGEPAGE region (both at page fault time, and if enabled with the collapse_huge_page too through the kernel daemon). This patch supports only hugepages mapped in the pmd, archs that have smaller hugepages will not fit in this patch alone. Also some archs like power have certain tlb limits that prevents mixing different page size in the same regions so they will not fit in this framework that requires "graceful fallback" to basic PAGE_SIZE in case of physical memory fragmentation. hugetlbfs remains a perfect fit for those because its software limits happen to match the hardware limits. hugetlbfs also remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped to be found not fragmented after a certain system uptime and that would be very expensive to defragment with relocation, so requiring reservation. hugetlbfs is the "reservation way", the point of transparent hugepages is not to have any reservation at all and maximizing the use of cache and hugepages at all times automatically. Some performance result: vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largep ages3 memset page fault 1566023 memset tlb miss 453854 memset second tlb miss 453321 random access tlb miss 41635 random access second tlb miss 41658 vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3 memset page fault 1566471 memset tlb miss 453375 memset second tlb miss 453320 random access tlb miss 41636 random access second tlb miss 41637 vmx andrea # ./largepages3 memset page fault 1566642 memset tlb miss 453417 memset second tlb miss 453313 random access tlb miss 41630 random access second tlb miss 41647 vmx andrea # ./largepages3 memset page fault 1566872 memset tlb miss 453418 memset second tlb miss 453315 random access tlb miss 41618 random access second tlb miss 41659 vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage vmx andrea # ./largepages3 memset page fault 2182476 memset tlb miss 460305 memset second tlb miss 460179 random access tlb miss 44483 random access second tlb miss 44186 vmx andrea # ./largepages3 memset page fault 2182791 memset tlb miss 460742 memset second tlb miss 459962 random access tlb miss 43981 random access second tlb miss 43988 ============ #include #include #include #include #define SIZE (3UL*1024*1024*1024) int main() { char *p = malloc(SIZE), *p2; struct timeval before, after; gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset page fault %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); return 0; } ============ Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 ++ include/linux/huge_mm.h | 118 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 4 ++ include/linux/mm_inline.h | 11 ++++- include/linux/page-flags.h | 21 ++++++++ include/linux/rmap.h | 2 + include/linux/swap.h | 2 + 7 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 include/linux/huge_mm.h (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 49d2356..d95082c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -109,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 0000000..9301824 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,118 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. \ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index cc6ab10..78adec4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -111,6 +111,9 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if BITS_PER_LONG > 32 +#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ +#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -243,6 +246,7 @@ struct inode; * files which need it (119 of them) */ #include +#include /* * Methods to modify the page usage count. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b87..650f31e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -20,14 +20,21 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); + list_add(&page->lru, head); __inc_zone_state(zone, NR_LRU_BASE + l); mem_cgroup_add_lru_list(page, l); } static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + +static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 907f160..4ca1241 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -410,11 +410,32 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. + */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + static inline int PageTransCompound(struct page *page) { return PageCompound(page); } + #else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + static inline int PageTransCompound(struct page *page) { return 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0d..e9fd04c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e7..4d55932 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); -- cgit v1.1 From 0af4e98b6b095c74588af04872f83d333c958c32 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: madvise(MADV_HUGEPAGE) Add madvise MADV_HUGEPAGE to mark regions that are important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9301824..d9ab70d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -97,6 +97,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif +extern int hugepage_madvise(unsigned long *vm_flags); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -113,6 +114,11 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +static inline int hugepage_madvise(unsigned long *vm_flags) +{ + BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.1 From 500d65d471018d9a13b0d51b7e141ed2a3555c1d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: pmd_trans_huge migrate bugcheck No pmd_trans_huge should ever materialize in migration ptes areas, because we split the hugepage before migration ptes are instantiated. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 78adec4..2ec5138 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); -- cgit v1.1 From 79134171df238171daa4c024a42b77b401ccb00b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: transparent hugepage vmstat Add hugepage stat information to /proc/vmstat and /proc/meminfo. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dad3612..02ecb01 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* -- cgit v1.1 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (this is indipendent of the defrag logic that will have to make new hugepages available) The fundamental reason why khugepaged is unavoidable, is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmd in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow, it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time) so it seems a good tradeoff. In addition to the hugepages being released by other process releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselfs to be long lived mappings (khugepaged scan is slow so short lived mappings have low probability to run into khugepaged if compared to long lived mappings). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + 3 files changed, 68 insertions(+) create mode 100644 include/linux/khugepaged.h (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d9ab70d..43a694e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,6 +25,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 0000000..552f318 --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if (khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f23b5bb..d747f94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) -- cgit v1.1 From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: remove PG_buddy PG_buddy can be converted to _mapcount == -2. So the PG_compound_lock can be added to page->flags without overflowing (because of the sparse section bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y. This also has to move the memory hotplug code from _mapcount to lru.next to avoid any risk of clashes. We can't use lru.next for PG_buddy removal, but memory hotplug can use lru.next even more easily than the mapcount instead. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 14 +++++++++----- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/page-flags.h | 7 +------ 3 files changed, 31 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a..24376fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. */ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ec5138..7ab7d2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,6 +397,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4ca1241..0db8037 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... * - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -233,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -461,7 +456,7 @@ static inline int PageTransCompound(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ __PG_COMPOUND_LOCK) -- cgit v1.1 From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:01 -0800 Subject: thp: add x86 32bit support Add support for transparent hugepages to x86 32bit. Share the same VM_ bitflag for VM_MAPPED_COPY. mm/nommu.c will never support transparent hugepages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ab7d2b..9c2695b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -111,9 +115,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ -#if BITS_PER_LONG > 32 -#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ -#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) -- cgit v1.1 From 0ca1634d4143c3579273ca53b993df19f5c98e92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:02 -0800 Subject: thp: mincore transparent hugepage support Handle transparent huge page pmd entries natively instead of splitting them into subpages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 43a694e..25125fb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, -- cgit v1.1 From cd7548ab360c462118568eebb8c6da3bc303b02e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: mprotect: transparent huge page support Natively handle huge pmds when changing page tables on behalf of mprotect(). I left out update_mmu_cache() because we do not need it on x86 anyway but more importantly the interface works on ptes, not pmds. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 25125fb..c590b08 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -22,6 +22,8 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, -- cgit v1.1 From 0bbbc0b33d141f78a0d9218a54a47f50621220d3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: add numa awareness to hugepage allocations It's mostly a matter of replacing alloc_pages with alloc_pages_vma after introducing alloc_pages_vma. khugepaged needs special handling as the allocation has to happen inside collapse_huge_page where the vma is known and an error has to be returned to the outer loop to sleep alloc_sleep_millisecs in case of failure. But it retains the more efficient logic of handling allocation failures in khugepaged in case of CONFIG_NUMA=n. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d95082c..a3b148a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -- cgit v1.1 From 94fcc585fb85ad7b059c70872489b50044d401f3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: avoid breaking huge pmd invariants in case of vma_adjust failures An huge pmd can only be mapped if the corresponding 2M virtual range is fully contained in the vma. At times the VM calls split_vma twice, if the first split_vma succeeds and the second fail, the first split_vma remains in effect and it's not rolled back. For split_vma or vma_adjust to fail an allocation failure is needed so it's a very unlikely event (the out of memory killer would normally fire before any allocation failure is visible to kernel and userland and if an out of memory condition happens it's unlikely to happen exactly here). Nevertheless it's safer to ensure that no huge pmd can be left around if the vma is adjusted in a way that can't fit hugepages anymore at the new vm_start/vm_end address. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c590b08..8275952 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,19 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #error "hugepages can't be allocated by the buddy allocator" #endif extern int hugepage_madvise(unsigned long *vm_flags); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -125,6 +138,12 @@ static inline int hugepage_madvise(unsigned long *vm_flags) BUG(); return 0; } +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.1 From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: mmu_notifier_test_young For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow paging EPT minor fault we relay on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is full backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either probably but I thought it was worth it and this makes it reliable. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cbfab1e..cc2e7df 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -62,6 +62,16 @@ struct mmu_notifier_ops { unsigned long address); /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. */ @@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { -- cgit v1.1 From 5a03b051ed87e72b959f32a86054e1142ac4cf55 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:11 -0800 Subject: thp: use compaction in kswapd for GFP_ATOMIC order > 0 This takes advantage of memory compaction to properly generate pages of order > 0 if regular page reclaim fails and priority level becomes more severe and we don't reach the proper watermarks. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 72cba40..dfa2ed4 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -25,7 +28,8 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync); + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -70,9 +74,10 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync) + gfp_t gfp_mask, bool sync, + int compact_mode) { - return 0; + return COMPACT_CONTINUE; } static inline void defer_compaction(struct zone *zone) -- cgit v1.1 From 2c888cfbc1b45508a44763d85ba2e8ac43faff5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: fix anon memory statistics with transparent hugepages Count each transparent hugepage as HPAGE_PMD_NR pages in the LRU statistics, so the Active(anon) and Inactive(anon) statistics in /proc/meminfo are correct. Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++++++++ include/linux/mm_inline.h | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8275952..9b48c24d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,11 +117,19 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define hpage_nr_pages(x) 1 + #define transparent_hugepage_enabled(__vma) 0 #define transparent_hugepage_flags 0UL diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 650f31e..8f7d247 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test @@ -24,7 +26,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { list_add(&page->lru, head); - __inc_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -38,7 +40,7 @@ static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -73,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } -- cgit v1.1 From 37c2ac7872a9387542616f658d20ac25f5bdb32e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: compound_trans_order Read compound_trans_order safe. Noop for CONFIG_TRANSPARENT_HUGEPAGE=n. Signed-off-by: Andrea Arcangeli Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2695b..ce97a2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,6 +450,20 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; -- cgit v1.1 From 1ddd6db43a08cba56c7ee920800980862086f1c3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: mm: define MADV_NOHUGEPAGE Define MADV_NOHUGEPAGE. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/mman-common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index e91392f..787abbb 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -46,6 +46,7 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ /* compatibility flags */ #define MAP_FILE 0 -- cgit v1.1 From a664b2d8555c659127bf8fe049a58449d394a707 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: madvise(MADV_NOHUGEPAGE) Add madvise MADV_NOHUGEPAGE to mark regions that are not important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 14 ++++++++------ include/linux/khugepaged.h | 7 ++++--- include/linux/mm.h | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9b48c24d..a8b7e42 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -52,10 +52,12 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_SIZE HPAGE_SIZE #define transparent_hugepage_enabled(__vma) \ - (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1< MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags); +extern int hugepage_madvise(unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -141,7 +143,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags) +static inline int hugepage_madvise(unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 552f318..6b394f0 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -38,9 +38,10 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline int khugepaged_enter(struct vm_area_struct *vma) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if (khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index ce97a2b..956a355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -83,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ -- cgit v1.1 From 60ab3244ec85c44276c585a2a20d3750402e1cf4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:18 -0800 Subject: thp: khugepaged: make khugepaged aware about madvise MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after mmap and before touching the memory. While this is enough for most usages, it's little effort to make madvise more dynamic at runtime on an existing mapping by making khugepaged aware about madvise. MADV_HUGEPAGE: register in khugepaged immediately without waiting a page fault (that may not ever happen if all pages are already mapped and the "enabled" knob was set to madvise during the initial page faults). MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged to stop collapsing pages where not needed. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Andrea Arcangeli Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8b7e42..bddfba1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -105,7 +105,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags, int advice); +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -143,7 +144,8 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags, int advice) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { BUG(); return 0; -- cgit v1.1 From 22e5c47ee238abe636655c3862ed28d6eb084ad4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:20 -0800 Subject: thp: add compound_trans_head() helper Cleanup some code with common compound_trans_head helper. Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Johannes Weiner Cc: Marcelo Tosatti Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bddfba1..8e6c8c4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,23 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -144,6 +161,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +#define compound_trans_head(page) compound_head(page) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { -- cgit v1.1 From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:47:21 -0800 Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for anonymous pages, as introduced by git commit 989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page migraton"). The point of the RCU protection there is part of getting a stable reference to anon_vma and is only held for anon pages as file pages are locked which is sufficient protection against freeing. However, while a file page's mapping is being migrated, the radix tree is double checked to ensure it is the expected page. This uses radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held triggering the following warning. [ 173.674290] =================================================== [ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ] [ 173.676016] --------------------------------------------------- [ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection! [ 173.676016] [ 173.676016] other info that might help us debug this: [ 173.676016] [ 173.676016] [ 173.676016] rcu_scheduler_active = 1, debug_locks = 0 [ 173.676016] 1 lock held by hugeadm/2899: [ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [] migrate_page_move_mapping+0x40/0x1ab [ 173.676016] [ 173.676016] stack backtrace: [ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild [ 173.676016] Call Trace: [ 173.676016] [] ? printk+0x14/0x1b [ 173.676016] [] lockdep_rcu_dereference+0x7d/0x86 [ 173.676016] [] migrate_page_move_mapping+0xca/0x1ab [ 173.676016] [] migrate_page+0x23/0x39 [ 173.676016] [] buffer_migrate_page+0x22/0x107 [ 173.676016] [] ? buffer_migrate_page+0x0/0x107 [ 173.676016] [] move_to_new_page+0x9a/0x1ae [ 173.676016] [] migrate_pages+0x1e7/0x2fa This patch introduces radix_tree_deref_slot_protected() which calls rcu_dereference_protected(). Users of it must pass in the mapping->tree_lock that is protecting this dereference. Holding the tree lock protects against parallel updaters of the radix tree meaning that rcu_dereference_protected is allowable. [akpm@linux-foundation.org: remove unneeded casts] Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Milton Miller Cc: Nick Piggin Cc: Wu Fengguang Cc: [2.6.37.early] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5..23241c2 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slot(void **pslot) } /** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + +/** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required -- cgit v1.1 From db16d5ec1f87f17511599bc77857dd1662b5a22f Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:35 -0800 Subject: memcg: add page_cgroup flags for dirty page tracking This patchset provides the ability for each cgroup to have independent dirty page limits. Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) page cache used by a cgroup. So, in case of multiple cgroup writers, they will not be able to consume more than their designated share of dirty pages and will be forced to perform write-out if they cross that limit. The patches are based on a series proposed by Andrea Righi in Mar 2010. Overview: - Add page_cgroup flags to record when pages are dirty, in writeback, or nfs unstable. - Extend mem_cgroup to record the total number of pages in each of the interesting dirty states (dirty, writeback, unstable_nfs). - Add dirty parameters similar to the system-wide /proc/sys/vm/dirty_* limits to mem_cgroup. The mem_cgroup dirty parameters are accessible via cgroupfs control files. - Consider both system and per-memcg dirty limits in page writeback when deciding to queue background writeback or block for foreground writeback. Known shortcomings: - When a cgroup dirty limit is exceeded, then bdi writeback is employed to writeback dirty inodes. Bdi writeback considers inodes from any cgroup, not just inodes contributing dirty pages to the cgroup exceeding its limit. - When memory.use_hierarchy is set, then dirty limits are disabled. This is a implementation detail. An enhanced implementation is needed to check the chain of parents to ensure that no dirty limit is exceeded. Performance data: - A page fault microbenchmark workload was used to measure performance, which can be called in read or write mode: f = open(foo. $cpu) truncate(f, 4096) alarm(60) while (1) { p = mmap(f, 4096) if (write) *p = 1 else x = *p munmap(p) } - The workload was called for several points in the patch series in different modes: - s_read is a single threaded reader - s_write is a single threaded writer - p_read is a 16 thread reader, each operating on a different file - p_write is a 16 thread writer, each operating on a different file - Measurements were collected on a 16 core non-numa system using "perf stat --repeat 3". The -a option was used for parallel (p_*) runs. - All numbers are page fault rate (M/sec). Higher is better. - To compare the performance of a kernel without non-memcg compare the first and last rows, neither has memcg configured. The first row does not include any of these memcg patches. - To compare the performance of using memcg dirty limits, compare the baseline (2nd row titled "w/ memcg") with the the code and memcg enabled (2nd to last row titled "all patches"). root_cgroup child_cgroup s_read s_write p_read p_write s_read s_write p_read p_write mmotm w/o memcg 0.428 0.390 0.429 0.388 mmotm w/ memcg 0.411 0.378 0.391 0.362 0.412 0.377 0.385 0.363 all patches 0.384 0.360 0.370 0.348 0.381 0.363 0.368 0.347 all patches 0.431 0.402 0.427 0.395 w/o memcg This patch: Add additional flags to page_cgroup to track dirty pages within a mem_cgroup. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrea Righi Signed-off-by: Greg Thelen Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b02195d..fdb5a92 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -40,6 +40,9 @@ enum { PCG_USED, /* this object is in use. */ PCG_ACCT_LRU, /* page has been accounted for */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_FILE_DIRTY, /* page is dirty */ + PCG_FILE_WRITEBACK, /* page is under writeback */ + PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ PCG_MIGRATION, /* under page migration */ }; @@ -59,6 +62,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } +#define TESTSETPCGFLAG(uname, lname) \ +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ + { return test_and_set_bit(PCG_##lname, &pc->flags); } + /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -78,6 +85,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(FileDirty, FILE_DIRTY) +CLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTPCGFLAG(FileDirty, FILE_DIRTY) +TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTSETPCGFLAG(FileDirty, FILE_DIRTY) + +SETPCGFLAG(FileWriteback, FILE_WRITEBACK) +CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) +TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) + +SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) + SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit v1.1 From 2a7106f2cb0768d00fe8c1eb42a754a7d8518f08 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:37 -0800 Subject: memcg: create extensible page stat update routines Replace usage of the mem_cgroup_update_file_mapped() memcg statistic update routine with two new routines: * mem_cgroup_inc_page_stat() * mem_cgroup_dec_page_stat() As before, only the file_mapped statistic is managed. However, these more general interfaces allow for new statistics to be more easily added. New statistics are added with memcg dirty page accounting. Signed-off-by: Greg Thelen Signed-off-by: Andrea Righi Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762..067115c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. */ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } -- cgit v1.1 From dbd4ea78f002df283c95d9774837041735fa1bf9 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 13 Jan 2011 15:47:38 -0800 Subject: memcg: add lock to synchronize page accounting and migration Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize the page accounting and migration code. This reworks the locking scheme of _update_stat() and _move_account() by adding new lock bit PCG_MOVE_LOCK, which is always taken under IRQ disable. 1. If pages are being migrated from a memcg, then updates to that memcg page statistics are protected by grabbing PCG_MOVE_LOCK using move_lock_page_cgroup(). In an upcoming commit, memcg dirty page accounting will be updating memcg page accounting (specifically: num writeback pages) from IRQ context (softirq). Avoid a deadlocking nested spin lock attempt by disabling irq on the local processor when grabbing the PCG_MOVE_LOCK. 2. lock for update_page_stat is used only for avoiding race with move_account(). So, IRQ awareness of lock_page_cgroup() itself is not a problem. The problem is between mem_cgroup_update_page_stat() and mem_cgroup_move_account_page(). Trade-off: * Changing lock_page_cgroup() to always disable IRQ (or local_bh) has some impacts on performance and I think it's bad to disable IRQ when it's not necessary. * adding a new lock makes move_account() slower. Score is here. Performance Impact: moving a 8G anon process. Before: real 0m0.792s user 0m0.000s sys 0m0.780s After: real 0m0.854s user 0m0.000s sys 0m0.842s This score is bad but planned patches for optimization can reduce this impact. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Greg Thelen Reviewed-by: Minchan Kim Acked-by: Daisuke Nishimura Cc: Andrea Righi Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index fdb5a92..5b0c971 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_FILE_DIRTY, /* page is dirty */ PCG_FILE_WRITEBACK, /* page is under writeback */ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ - PCG_MIGRATION, /* under page migration */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -117,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -130,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; -- cgit v1.1 From 50de1dd967d4ba3b8a90ebe7a4f5feca24191317 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:43 -0800 Subject: memcg: fix memory migration of shmem swapcache In the current implementation mem_cgroup_end_migration() decides whether the page migration has succeeded or not by checking "oldpage->mapping". But if we are tring to migrate a shmem swapcache, the page->mapping of it is NULL from the begining, so the check would be invalid. As a result, mem_cgroup_end_migration() assumes the migration has succeeded even if it's not, so "newpage" would be freed while it's not uncharged. This patch fixes it by passing mem_cgroup_end_migration() the result of the page migration. Signed-off-by: Daisuke Nishimura Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Minchan Kim Reviewed-by: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 067115c..6a576f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. @@ -251,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } -- cgit v1.1 From 9ce137eee4febaabca81143be07d4205d2bd52d4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 11 Jan 2011 14:07:12 -0500 Subject: nfsd: don't support msnfs export option We've long had these pointless #ifdef MSNFS's sprinkled throughout the code--pointless because MSNFS is always defined (and we give no config option to make that easy to change). So we could just remove the ifdef's and compile the resulting code unconditionally. But as long as we're there: why not just rip out this code entirely? The only purpose is to implement the "msnfs" export option which turns on Windows-like behavior in some cases, and: - the export option isn't documented anywhere; - the userland utilities (which would need to be able to parse "msnfs" in an export file) don't support it; - I don't know how to maintain this, as I don't know what the proper behavior is; and - google shows no evidence that anyone has ever used this. Signed-off-by: J. Bruce Fields --- include/linux/nfsd/export.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 8ae78a6..bd31615 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -35,7 +35,7 @@ #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ -#define NFSEXP_MSNFS 0x1000 /* do silly things that MS clients expect */ +#define NFSEXP_MSNFS 0x1000 /* do silly things that MS clients expect; no longer supported */ #define NFSEXP_FSID 0x2000 #define NFSEXP_CROSSMOUNT 0x4000 #define NFSEXP_NOACL 0x8000 /* reserved for possible ACL related use */ -- cgit v1.1 From 2c6755988afc003a0332406a134fb6a1626f9b28 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 14 Jan 2011 02:36:43 +0000 Subject: fs: hlist UP debug fixup Po-Yu Chuang noticed that hlist_bl_set_first could crash on a UP system when LIST_BL_LOCKMASK is 0, because LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); always evaulates to true. Fix the expression, and also avoid a dependency between bit spinlock implementation and list bl code (list code shouldn't know anything except that bit 0 is set when adding and removing elements). Eventually if a good use case comes up, we might use this list to store 1 or more arbitrary bits of data, so it really shouldn't be tied to locking either, but for now they are helpful for debugging. Signed-off-by: Nick Piggin --- include/linux/list_bl.h | 5 +++-- include/linux/rculist_bl.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 9ee97e7..b2adbb4 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -16,7 +16,7 @@ * some fast and compact auxiliary data. */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#if defined(CONFIG_SMP) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL @@ -62,7 +62,8 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != + LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index b872b49..cf1244f 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -11,7 +11,8 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != + LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -- cgit v1.1 From 1ac9ad1394fa542ac7ae0dc943ee3cda678799fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jan 2011 12:13:14 +0000 Subject: net: remove dev_txq_stats_fold() After recent changes, (percpu stats on vlan/tunnels...), we dont need anymore per struct netdev_queue tx_bytes/tx_packets/tx_dropped counters. Only remaining users are ixgbe, sch_teql, gianfar & macvlan : 1) ixgbe can be converted to use existing tx_ring counters. 2) macvlan incremented txq->tx_dropped, it can use the dev->stats.tx_dropped counter. 3) sch_teql : almost revert ab35cd4b8f42 (Use net_device internal stats) Now we have ndo_get_stats64(), use it, even for "unsigned long" fields (No need to bring back a struct net_device_stats) 4) gianfar adds a stats structure per tx queue to hold tx_bytes/tx_packets This removes a lockdep warning (and possible lockup) in rndis gadget, calling dev_get_stats() from hard IRQ context. Ref: http://www.spinics.net/lists/netdev/msg149202.html Reported-by: Neil Jones Signed-off-by: Eric Dumazet CC: Jarek Poplawski CC: Alexander Duyck CC: Jeff Kirsher CC: Sandeep Gopalpet CC: Michal Nazarewicz Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be4957c..d971346 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -520,9 +520,6 @@ struct netdev_queue { * please use this field instead of dev->trans_start */ unsigned long trans_start; - u64 tx_bytes; - u64 tx_packets; - u64 tx_dropped; } ____cacheline_aligned_in_smp; static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) @@ -2265,8 +2262,6 @@ extern void dev_load(struct net *net, const char *name); extern void dev_mcast_init(void); extern struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); -extern void dev_txq_stats_fold(const struct net_device *dev, - struct rtnl_link_stats64 *stats); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; -- cgit v1.1 From 78d07369462e9feeaa5db301b0aa70e9dcb40b48 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 13 Jan 2011 11:51:03 +0000 Subject: ipsec: update MAX_AH_AUTH_LEN to support sha512 icv_truncbits is set to 256 for sha512, so update MAX_AH_AUTH_LEN to 64. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ah.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ah.h b/include/net/ah.h index be7798d..ca95b98 100644 --- a/include/net/ah.h +++ b/include/net/ah.h @@ -4,7 +4,7 @@ #include /* This is the maximum truncated ICV length that we know of. */ -#define MAX_AH_AUTH_LEN 16 +#define MAX_AH_AUTH_LEN 64 struct crypto_ahash; -- cgit v1.1 From 51e7eed79c41180919ff94942895ba38467d9ad4 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 12 Jan 2011 22:14:56 +0000 Subject: etherdevice.h: Add is_unicast_ether_addr function From a check for !is_multicast_ether_addr it is not always obvious that we're checking for a unicast address. So add this helper function to make those code paths easier to read. Signed-off-by: Tobias Klauser Acked-by: Chris Metcalf Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index bec8b82..ab68f78 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -99,6 +99,17 @@ static inline int is_broadcast_ether_addr(const u8 *addr) } /** + * is_unicast_ether_addr - Determine if the Ethernet address is unicast + * @addr: Pointer to a six-byte array containing the Ethernet address + * + * Return true if the address is a unicast address. + */ +static inline int is_unicast_ether_addr(const u8 *addr) +{ + return !is_multicast_ether_addr(addr); +} + +/** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * -- cgit v1.1 From bba958783b1b4cb0a9420f4e11082467132a334c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 14 Jan 2011 15:57:47 +0900 Subject: mmc: sh_mmcif: Convert to __raw_xxx() I/O accessors. When using the I/O accessors in raw mode from the boot stubs we don't want to bother with any of the complexity associated with readl/writel and friends. Furthermore, utilization within the context of the host driver itself is all performed on an ioremapped window, so using the __raw variants there doesn't pose any problem either. If and when barriers need to be added in the future, these will need to be explicitly written out, but this is so far not a concern for any of the affected CPUs in question. This fixes up the link error introduced by the ARM tree via its barrier refactoring: arch/arm/boot/compressed/mmcif-sh7372.o: In function `mmcif_loader': mmcif-sh7372.c:(.text+0x9e8): undefined reference to `outer_cache Following the change in: http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=6275/1 Reported-by: Simon Horman Signed-off-by: Paul Mundt --- include/linux/mmc/sh_mmcif.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index bf17350..38d3930 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -94,12 +94,12 @@ struct sh_mmcif_plat_data { static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) { - return readl(addr + reg); + return __raw_readl(addr + reg); } static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) { - writel(val, addr + reg); + __raw_writel(val, addr + reg); } #define SH_MMCIF_BBS 512 /* boot block size */ -- cgit v1.1 From 412dc11d3fd01f96fdf4a8cbfbc5584a17dab7c8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Nov 2010 18:01:41 +0000 Subject: mfd: Add WM8326 support The WM8326 is a high performance variant of the WM832x series with no software visible differences. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- include/linux/mfd/wm831x/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h index a1239c4..903280d 100644 --- a/include/linux/mfd/wm831x/core.h +++ b/include/linux/mfd/wm831x/core.h @@ -245,6 +245,7 @@ enum wm831x_parent { WM8320 = 0x8320, WM8321 = 0x8321, WM8325 = 0x8325, + WM8326 = 0x8326, }; struct wm831x { -- cgit v1.1 From 4c90aa94f6b3e33f57faaf19ef9819195dff61d3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 26 Nov 2010 17:19:34 +0000 Subject: mfd: Provide pm_runtime_no_callbacks flag in cell data Allow MFD cells to have pm_runtime_no_callbacks() called on them during registration. This causes the runtime PM framework to ignore them, allowing use of runtime PM to suspend the device as a whole even if not all drivers for the MFD can usefully implement runtime PM. For example, RTCs are likely to run continuously regardless of the power state of the system. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- include/linux/mfd/core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index 5582ab3..835996e 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -47,6 +47,12 @@ struct mfd_cell { /* don't check for resource conflicts */ bool ignore_resource_conflicts; + + /* + * Disable runtime PM callbacks for this subdevice - see + * pm_runtime_no_callbacks(). + */ + bool pm_runtime_no_callbacks; }; extern int mfd_add_devices(struct device *parent, int id, -- cgit v1.1 From e098aded79f24e2024139e82f778ff9db6dc142a Mon Sep 17 00:00:00 2001 From: Mattias Wallin Date: Thu, 2 Dec 2010 15:40:31 +0100 Subject: mfd: ab8500-core ioresources irq for subdrivers added This patch adds the ioresources used by subdrivers to retrieve their interrupt. Signed-off-by: Mattias Wallin Signed-off-by: Samuel Ortiz --- include/linux/mfd/ab8500.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h index 85cf2c28..1ad1696 100644 --- a/include/linux/mfd/ab8500.h +++ b/include/linux/mfd/ab8500.h @@ -91,8 +91,8 @@ #define AB8500_INT_ID_DET_R4F 93 #define AB8500_INT_USB_CHG_DET_DONE 94 #define AB8500_INT_USB_CH_TH_PROT_F 96 -#define AB8500_INT_USB_CH_TH_PROP_R 97 -#define AB8500_INT_MAIN_CH_TH_PROP_F 98 +#define AB8500_INT_USB_CH_TH_PROT_R 97 +#define AB8500_INT_MAIN_CH_TH_PROT_F 98 #define AB8500_INT_MAIN_CH_TH_PROT_R 99 #define AB8500_INT_USB_CHARGER_NOT_OKF 103 -- cgit v1.1 From cdd137c9c86c201ddb7f42ec978d2da45e7b7a17 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Thu, 23 Dec 2010 17:53:36 +0900 Subject: mfd: MAX8998/LP3974 hibernation support This patch makes the driver to save and restore register values for hibernation. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Samuel Ortiz --- include/linux/mfd/max8998-private.h | 2 ++ include/linux/mfd/max8998.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/max8998-private.h b/include/linux/mfd/max8998-private.h index 7363dea..effa5d3 100644 --- a/include/linux/mfd/max8998-private.h +++ b/include/linux/mfd/max8998-private.h @@ -159,10 +159,12 @@ struct max8998_dev { u8 irq_masks_cur[MAX8998_NUM_IRQ_REGS]; u8 irq_masks_cache[MAX8998_NUM_IRQ_REGS]; int type; + bool wakeup; }; int max8998_irq_init(struct max8998_dev *max8998); void max8998_irq_exit(struct max8998_dev *max8998); +int max8998_irq_resume(struct max8998_dev *max8998); extern int max8998_read_reg(struct i2c_client *i2c, u8 reg, u8 *dest); extern int max8998_bulk_read(struct i2c_client *i2c, u8 reg, int count, diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index f8c9f88..686a744 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -88,6 +88,7 @@ struct max8998_platform_data { int buck1_set1; int buck1_set2; int buck2_set3; + bool wakeup; }; #endif /* __LINUX_MFD_MAX8998_H */ -- cgit v1.1 From 337ce5d1c5759644cea6c47220ce7e84f0398362 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Tue, 4 Jan 2011 14:17:39 +0900 Subject: mfd: Support LP3974 RTC The first releases of LP3974 have a large delay in RTC registers, which requires 2 seconds of delay after writing to a rtc register (recommended by National Semiconductor's engineers) before reading it. If "rtc_delay" field of the platform data is true, the rtc driver assumes that such delays are required. Although we have not seen LP3974s without requiring such delays, we assume that such LP3974s will be released soon (or they have done so already) and they are supported by "lp3974" without setting "rtc_delay" at the platform data. This patch adds delays with msleep when writing values to RTC registers if the platform data has rtc_delay set. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Reviewed-by: Mark Brown Signed-off-by: Samuel Ortiz --- include/linux/mfd/max8998.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index 686a744..c22b9a4 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -76,6 +76,9 @@ struct max8998_regulator_data { * @buck1_set1: BUCK1 gpio pin 1 to set output voltage * @buck1_set2: BUCK1 gpio pin 2 to set output voltage * @buck2_set3: BUCK2 gpio pin to set output voltage + * @wakeup: Allow to wake up from suspend + * @rtc_delay: LP3974 RTC chip bug that requires delay after a register + * write before reading it. */ struct max8998_platform_data { struct max8998_regulator_data *regulators; @@ -89,6 +92,7 @@ struct max8998_platform_data { int buck1_set2; int buck2_set3; bool wakeup; + bool rtc_delay; }; #endif /* __LINUX_MFD_MAX8998_H */ -- cgit v1.1 From 735a3d9efdc5aeebe201008e6655b235c7f02aeb Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Tue, 11 Jan 2011 12:20:05 +0100 Subject: regulator: Support MAX8998/LP3974 DVS-GPIO The previous driver did not support BUCK1-DVS3, BUCK1-DVS4, and BUCK2-DVS2 modes. This patch adds such modes and an option to block setting buck1/2 voltages out of the preset values. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Acked-by: Mark Brown Signed-off-by: Samuel Ortiz --- include/linux/mfd/max8998.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index c22b9a4..61daa16 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -70,12 +70,20 @@ struct max8998_regulator_data { * @num_regulators: number of regultors used * @irq_base: base IRQ number for max8998, required for IRQs * @ono: power onoff IRQ number for max8998 - * @buck1_max_voltage1: BUCK1 maximum alowed voltage register 1 - * @buck1_max_voltage2: BUCK1 maximum alowed voltage register 2 - * @buck2_max_voltage: BUCK2 maximum alowed voltage + * @buck_voltage_lock: Do NOT change the values of the following six + * registers set by buck?_voltage?. The voltage of BUCK1/2 cannot + * be other than the preset values. + * @buck1_voltage1: BUCK1 DVS mode 1 voltage register + * @buck1_voltage2: BUCK1 DVS mode 2 voltage register + * @buck1_voltage3: BUCK1 DVS mode 3 voltage register + * @buck1_voltage4: BUCK1 DVS mode 4 voltage register + * @buck2_voltage1: BUCK2 DVS mode 1 voltage register + * @buck2_voltage2: BUCK2 DVS mode 2 voltage register * @buck1_set1: BUCK1 gpio pin 1 to set output voltage * @buck1_set2: BUCK1 gpio pin 2 to set output voltage + * @buck1_default_idx: Default for BUCK1 gpio pin 1, 2 * @buck2_set3: BUCK2 gpio pin to set output voltage + * @buck2_default_idx: Default for BUCK2 gpio pin. * @wakeup: Allow to wake up from suspend * @rtc_delay: LP3974 RTC chip bug that requires delay after a register * write before reading it. @@ -85,12 +93,18 @@ struct max8998_platform_data { int num_regulators; int irq_base; int ono; - int buck1_max_voltage1; - int buck1_max_voltage2; - int buck2_max_voltage; + bool buck_voltage_lock; + int buck1_voltage1; + int buck1_voltage2; + int buck1_voltage3; + int buck1_voltage4; + int buck2_voltage1; + int buck2_voltage2; int buck1_set1; int buck1_set2; + int buck1_default_idx; int buck2_set3; + int buck2_default_idx; bool wakeup; bool rtc_delay; }; -- cgit v1.1 From 92d50a4132977b932ed830fa58c05deeb5c524f0 Mon Sep 17 00:00:00 2001 From: Mattias Wallin Date: Tue, 7 Dec 2010 11:20:47 +0100 Subject: mfd: ab8500-core chip version cut 2.0 support This patch adds support for chip version 2.0 or cut 2.0. One new interrupt latch register - latch 12 - is introduced. Signed-off-by: Mattias Wallin Acked-by: Linus Walleij Signed-off-by: Samuel Ortiz --- include/linux/mfd/ab8500.h | 53 ++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h index 1ad1696..37f56b7 100644 --- a/include/linux/mfd/ab8500.h +++ b/include/linux/mfd/ab8500.h @@ -74,30 +74,37 @@ #define AB8500_INT_ACC_DETECT_21DB_F 37 #define AB8500_INT_ACC_DETECT_21DB_R 38 #define AB8500_INT_GP_SW_ADC_CONV_END 39 -#define AB8500_INT_BTEMP_LOW 72 -#define AB8500_INT_BTEMP_LOW_MEDIUM 73 -#define AB8500_INT_BTEMP_MEDIUM_HIGH 74 -#define AB8500_INT_BTEMP_HIGH 75 -#define AB8500_INT_USB_CHARGER_NOT_OK 81 -#define AB8500_INT_ID_WAKEUP_R 82 -#define AB8500_INT_ID_DET_R1R 84 -#define AB8500_INT_ID_DET_R2R 85 -#define AB8500_INT_ID_DET_R3R 86 -#define AB8500_INT_ID_DET_R4R 87 -#define AB8500_INT_ID_WAKEUP_F 88 -#define AB8500_INT_ID_DET_R1F 90 -#define AB8500_INT_ID_DET_R2F 91 -#define AB8500_INT_ID_DET_R3F 92 -#define AB8500_INT_ID_DET_R4F 93 -#define AB8500_INT_USB_CHG_DET_DONE 94 -#define AB8500_INT_USB_CH_TH_PROT_F 96 -#define AB8500_INT_USB_CH_TH_PROT_R 97 -#define AB8500_INT_MAIN_CH_TH_PROT_F 98 -#define AB8500_INT_MAIN_CH_TH_PROT_R 99 -#define AB8500_INT_USB_CHARGER_NOT_OKF 103 +#define AB8500_INT_ADP_SOURCE_ERROR 72 +#define AB8500_INT_ADP_SINK_ERROR 73 +#define AB8500_INT_ADP_PROBE_PLUG 74 +#define AB8500_INT_ADP_PROBE_UNPLUG 75 +#define AB8500_INT_ADP_SENSE_OFF 76 +#define AB8500_INT_USB_PHY_POWER_ERR 78 +#define AB8500_INT_USB_LINK_STATUS 79 +#define AB8500_INT_BTEMP_LOW 80 +#define AB8500_INT_BTEMP_LOW_MEDIUM 81 +#define AB8500_INT_BTEMP_MEDIUM_HIGH 82 +#define AB8500_INT_BTEMP_HIGH 83 +#define AB8500_INT_USB_CHARGER_NOT_OK 89 +#define AB8500_INT_ID_WAKEUP_R 90 +#define AB8500_INT_ID_DET_R1R 92 +#define AB8500_INT_ID_DET_R2R 93 +#define AB8500_INT_ID_DET_R3R 94 +#define AB8500_INT_ID_DET_R4R 95 +#define AB8500_INT_ID_WAKEUP_F 96 +#define AB8500_INT_ID_DET_R1F 98 +#define AB8500_INT_ID_DET_R2F 99 +#define AB8500_INT_ID_DET_R3F 100 +#define AB8500_INT_ID_DET_R4F 101 +#define AB8500_INT_USB_CHG_DET_DONE 102 +#define AB8500_INT_USB_CH_TH_PROT_F 104 +#define AB8500_INT_USB_CH_TH_PROT_R 105 +#define AB8500_INT_MAIN_CH_TH_PROT_F 106 +#define AB8500_INT_MAIN_CH_TH_PROT_R 107 +#define AB8500_INT_USB_CHARGER_NOT_OKF 111 -#define AB8500_NR_IRQS 104 -#define AB8500_NUM_IRQ_REGS 13 +#define AB8500_NR_IRQS 112 +#define AB8500_NUM_IRQ_REGS 14 /** * struct ab8500 - ab8500 internal structure -- cgit v1.1 From 32385c7cf60a78375b63afc4f02001df84dfd1a0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 14 Jan 2011 13:12:45 +0000 Subject: kernel: fix hlist_bl again __d_rehash is dereferencing an almost-NULL pointer on my ARM926. CONFIG_SMP=n and CONFIG_DEBUG_SPINLOCK=y. The faulting instruction is: strne r3, [r2, #4] and as can be seen from the register dump below, r2 is 0x00000001, hence the faulting 0x00000005 address. __d_rehash is essentially: spin_lock_bucket(b); entry->d_flags &= ~DCACHE_UNHASHED; hlist_bl_add_head_rcu(&entry->d_hash, &b->head); spin_unlock_bucket(b); which is: bit_spin_lock(0, (unsigned long *)&b->head.first); entry->d_flags &= ~DCACHE_UNHASHED; hlist_bl_add_head_rcu(&entry->d_hash, &b->head); __bit_spin_unlock(0, (unsigned long *)&b->head.first); bit_spin_lock(0, ptr) sets bit 0 of *ptr, in this case b->head.first if CONFIG_SMP or CONFIG_DEBUG_SPINLOCK is set: #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) while (unlikely(test_and_set_bit_lock(bitnum, addr))) { while (test_bit(bitnum, addr)) { preempt_enable(); cpu_relax(); preempt_disable(); } } #endif So, b->head.first starts off NULL, and becomes a non-NULL (address 1). hlist_bl_add_head_rcu() does this: static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; It is the store to first->pprev which is faulting. hlist_bl_first(): static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } but: #if defined(CONFIG_SMP) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif So, we have one piece of code which sets bit 0 of addresses, and another bit of code which doesn't clear it before dereferencing the pointer if !CONFIG_SMP && CONFIG_DEBUG_SPINLOCK. With the patch below, I can again sucessfully boot the kernel on my Versatile PB/926 platform. Signed-off-by: Russell King --- include/linux/list_bl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index b2adbb4..5bad17d1 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -16,7 +16,7 @@ * some fast and compact auxiliary data. */ -#if defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL -- cgit v1.1 From 359ab9f5b154cbd807a11e22792235f0f36b0cd5 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Fri, 14 Jan 2011 14:46:11 +0900 Subject: power_supply: Add MAX17042 Fuel Gauge Driver The MAX17042 is a fuel gauge with an I2C interface for lithium-ion betteries. Unlike its predecessor MAX17040, MAX17042 uses 16bit registers. Besides, MAX17042 has much more features than MAX17040; e.g., a thermistor, current and current accumulation measurement, battery internal resistance estimate, average values of measurement, and others. This patch implements a driver for MAX17042. In this initial release, we have implemented the most basic features of a fuel gauge: measure the battery capacity and voltage. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Anton Vorontsov --- include/linux/power/max17042_battery.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/linux/power/max17042_battery.h (limited to 'include') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h new file mode 100644 index 0000000..7995deb --- /dev/null +++ b/include/linux/power/max17042_battery.h @@ -0,0 +1,30 @@ +/* + * Fuel gauge driver for Maxim 17042 / 8966 / 8997 + * Note that Maxim 8966 and 8997 are mfd and this is its subdevice. + * + * Copyright (C) 2011 Samsung Electronics + * MyungJoo Ham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __MAX17042_BATTERY_H_ +#define __MAX17042_BATTERY_H_ + +struct max17042_platform_data { + bool enable_current_sense; +}; + +#endif /* __MAX17042_BATTERY_H_ */ -- cgit v1.1 From 836cb711ad7960e52625b24195d6e70b79ab0816 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 14 Jan 2011 14:56:53 +0900 Subject: Revert update for dirty_ratio for memcg. The flags added by commit db16d5ec1f87f17511599bc77857dd1662b5a22f has no user now. We believe we'll use it soon but considering patch reviewing, the change itself should be folded into incoming set of "dirty ratio for memcg" patches. So, it's better to drop this change from current mainline tree. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Greg Thelen Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 5b0c971..6d6cb7a 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -42,9 +42,6 @@ enum { /* flags for mem_cgroup and file and I/O status */ PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ - PCG_FILE_DIRTY, /* page is dirty */ - PCG_FILE_WRITEBACK, /* page is under writeback */ - PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ /* No lock in page_cgroup */ PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; @@ -65,10 +62,6 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } -#define TESTSETPCGFLAG(uname, lname) \ -static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ - { return test_and_set_bit(PCG_##lname, &pc->flags); } - /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -88,22 +81,6 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) -SETPCGFLAG(FileDirty, FILE_DIRTY) -CLEARPCGFLAG(FileDirty, FILE_DIRTY) -TESTPCGFLAG(FileDirty, FILE_DIRTY) -TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) -TESTSETPCGFLAG(FileDirty, FILE_DIRTY) - -SETPCGFLAG(FileWriteback, FILE_WRITEBACK) -CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) -TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) - -SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) -TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) - SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit v1.1 From 323b7fe8f8f6d5ac6214382cf30e8b3a80b265c9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 14 Jan 2011 09:34:29 +0100 Subject: include/gpio.h: remove remaining __must_check-annotiations Commit 5f829e405ec4e96f711165a4a7b55c271d4363e2 (gpiolib: add missing functions to generic fallback) also introduced two. Signed-off-by: Wolfram Sang Cc: Greg KH Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gpio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 4b47ed9..32720ba 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -35,13 +35,13 @@ static inline int gpio_request(unsigned gpio, const char *label) return -ENOSYS; } -static inline int __must_check gpio_request_one(unsigned gpio, +static inline int gpio_request_one(unsigned gpio, unsigned long flags, const char *label) { return -ENOSYS; } -static inline int __must_check gpio_request_array(struct gpio *array, size_t num) +static inline int gpio_request_array(struct gpio *array, size_t num) { return -ENOSYS; } -- cgit v1.1 From c66ac9db8d4ad9994a02b3e933ea2ccc643e1fe5 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 17 Dec 2010 11:11:26 -0800 Subject: [SCSI] target: Add LIO target core v4.0.0-rc6 LIO target is a full featured in-kernel target framework with the following feature set: High-performance, non-blocking, multithreaded architecture with SIMD support. Advanced SCSI feature set: * Persistent Reservations (PRs) * Asymmetric Logical Unit Assignment (ALUA) * Protocol and intra-nexus multiplexing, load-balancing and failover (MC/S) * Full Error Recovery (ERL=0,1,2) * Active/active task migration and session continuation (ERL=2) * Thin LUN provisioning (UNMAP and WRITE_SAMExx) Multiprotocol target plugins Storage media independence: * Virtualization of all storage media; transparent mapping of IO to LUNs * No hard limits on number of LUNs per Target; maximum LUN size ~750 TB * Backstores: SATA, SAS, SCSI, BluRay, DVD, FLASH, USB, ramdisk, etc. Standards compliance: * Full compliance with IETF (RFC 3720) * Full implementation of SPC-4 PRs and ALUA Significant code cleanups done by Christoph Hellwig. [jejb: fix up for new block bdev exclusive interface. Minor fixes from Randy Dunlap and Dan Carpenter.] Signed-off-by: Nicholas A. Bellinger Signed-off-by: James Bottomley --- include/target/configfs_macros.h | 147 +++++ include/target/target_core_base.h | 937 +++++++++++++++++++++++++++ include/target/target_core_configfs.h | 52 ++ include/target/target_core_device.h | 61 ++ include/target/target_core_fabric_configfs.h | 106 +++ include/target/target_core_fabric_lib.h | 28 + include/target/target_core_fabric_ops.h | 100 +++ include/target/target_core_tmr.h | 43 ++ include/target/target_core_tpg.h | 35 + include/target/target_core_transport.h | 351 ++++++++++ 10 files changed, 1860 insertions(+) create mode 100644 include/target/configfs_macros.h create mode 100644 include/target/target_core_base.h create mode 100644 include/target/target_core_configfs.h create mode 100644 include/target/target_core_device.h create mode 100644 include/target/target_core_fabric_configfs.h create mode 100644 include/target/target_core_fabric_lib.h create mode 100644 include/target/target_core_fabric_ops.h create mode 100644 include/target/target_core_tmr.h create mode 100644 include/target/target_core_tpg.h create mode 100644 include/target/target_core_transport.h (limited to 'include') diff --git a/include/target/configfs_macros.h b/include/target/configfs_macros.h new file mode 100644 index 0000000..7fe7460 --- /dev/null +++ b/include/target/configfs_macros.h @@ -0,0 +1,147 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs_macros.h - extends macros for configfs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * Based on kobject.h: + * Copyright (c) 2002-2003 Patrick Mochel + * Copyright (c) 2002-2003 Open Source Development Labs + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Added CONFIGFS_EATTR() macros from original configfs.h macros + * Copright (C) 2008-2009 Nicholas A. Bellinger + * + * Please read Documentation/filesystems/configfs.txt before using the + * configfs interface, ESPECIALLY the parts about reference counts and + * item destructors. + */ + +#ifndef _CONFIGFS_MACROS_H_ +#define _CONFIGFS_MACROS_H_ + +#include + +/* + * Users often need to create attribute structures for their configurable + * attributes, containing a configfs_attribute member and function pointers + * for the show() and store() operations on that attribute. If they don't + * need anything else on the extended attribute structure, they can use + * this macro to define it. The argument _name isends up as + * 'struct _name_attribute, as well as names of to CONFIGFS_ATTR_OPS() below. + * The argument _item is the name of the structure containing the + * struct config_item or struct config_group structure members + */ +#define CONFIGFS_EATTR_STRUCT(_name, _item) \ +struct _name##_attribute { \ + struct configfs_attribute attr; \ + ssize_t (*show)(struct _item *, char *); \ + ssize_t (*store)(struct _item *, const char *, size_t); \ +} + +/* + * With the extended attribute structure, users can use this macro + * (similar to sysfs' __ATTR) to make defining attributes easier. + * An example: + * #define MYITEM_EATTR(_name, _mode, _show, _store) \ + * struct myitem_attribute childless_attr_##_name = \ + * __CONFIGFS_EATTR(_name, _mode, _show, _store) + */ +#define __CONFIGFS_EATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { \ + .ca_name = __stringify(_name), \ + .ca_mode = _mode, \ + .ca_owner = THIS_MODULE, \ + }, \ + .show = _show, \ + .store = _store, \ +} +/* Here is a readonly version, only requiring a show() operation */ +#define __CONFIGFS_EATTR_RO(_name, _show) \ +{ \ + .attr = { \ + .ca_name = __stringify(_name), \ + .ca_mode = 0444, \ + .ca_owner = THIS_MODULE, \ + }, \ + .show = _show, \ +} + +/* + * With these extended attributes, the simple show_attribute() and + * store_attribute() operations need to call the show() and store() of the + * attributes. This is a common pattern, so we provide a macro to define + * them. The argument _name is the name of the attribute defined by + * CONFIGFS_ATTR_STRUCT(). The argument _item is the name of the structure + * containing the struct config_item or struct config_group structure member. + * The argument _item_member is the actual name of the struct config_* struct + * in your _item structure. Meaning my_structure->some_config_group. + * ^^_item^^^^^ ^^_item_member^^^ + * This macro expects the attributes to be named "struct _attribute". + */ +#define CONFIGFS_EATTR_OPS_TO_FUNC(_name, _item, _item_member) \ +static struct _item *to_##_name(struct config_item *ci) \ +{ \ + return (ci) ? container_of(to_config_group(ci), struct _item, \ + _item_member) : NULL; \ +} + +#define CONFIGFS_EATTR_OPS_SHOW(_name, _item) \ +static ssize_t _name##_attr_show(struct config_item *item, \ + struct configfs_attribute *attr, \ + char *page) \ +{ \ + struct _item *_item = to_##_name(item); \ + struct _name##_attribute * _name##_attr = \ + container_of(attr, struct _name##_attribute, attr); \ + ssize_t ret = 0; \ + \ + if (_name##_attr->show) \ + ret = _name##_attr->show(_item, page); \ + return ret; \ +} + +#define CONFIGFS_EATTR_OPS_STORE(_name, _item) \ +static ssize_t _name##_attr_store(struct config_item *item, \ + struct configfs_attribute *attr, \ + const char *page, size_t count) \ +{ \ + struct _item *_item = to_##_name(item); \ + struct _name##_attribute * _name##_attr = \ + container_of(attr, struct _name##_attribute, attr); \ + ssize_t ret = -EINVAL; \ + \ + if (_name##_attr->store) \ + ret = _name##_attr->store(_item, page, count); \ + return ret; \ +} + +#define CONFIGFS_EATTR_OPS(_name, _item, _item_member) \ + CONFIGFS_EATTR_OPS_TO_FUNC(_name, _item, _item_member); \ + CONFIGFS_EATTR_OPS_SHOW(_name, _item); \ + CONFIGFS_EATTR_OPS_STORE(_name, _item); + +#define CONFIGFS_EATTR_OPS_RO(_name, _item, _item_member) \ + CONFIGFS_EATTR_OPS_TO_FUNC(_name, _item, _item_member); \ + CONFIGFS_EATTR_OPS_SHOW(_name, _item); + +#endif /* _CONFIGFS_MACROS_H_ */ diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h new file mode 100644 index 0000000..07fdfb6 --- /dev/null +++ b/include/target/target_core_base.h @@ -0,0 +1,937 @@ +#ifndef TARGET_CORE_BASE_H +#define TARGET_CORE_BASE_H + +#include +#include +#include +#include +#include +#include +#include +#include "target_core_mib.h" + +#define TARGET_CORE_MOD_VERSION "v4.0.0-rc6" +#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGABRT)) + +/* Used by transport_generic_allocate_iovecs() */ +#define TRANSPORT_IOV_DATA_BUFFER 5 +/* Maximum Number of LUNs per Target Portal Group */ +#define TRANSPORT_MAX_LUNS_PER_TPG 256 +/* + * By default we use 32-byte CDBs in TCM Core and subsystem plugin code. + * + * Note that both include/scsi/scsi_cmnd.h:MAX_COMMAND_SIZE and + * include/linux/blkdev.h:BLOCK_MAX_CDB as of v2.6.36-rc4 still use + * 16-byte CDBs by default and require an extra allocation for + * 32-byte CDBs to becasue of legacy issues. + * + * Within TCM Core there are no such legacy limitiations, so we go ahead + * use 32-byte CDBs by default and use include/scsi/scsi.h:scsi_command_size() + * within all TCM Core and subsystem plugin code. + */ +#define TCM_MAX_COMMAND_SIZE 32 +/* + * From include/scsi/scsi_cmnd.h:SCSI_SENSE_BUFFERSIZE, currently + * defined 96, but the real limit is 252 (or 260 including the header) + */ +#define TRANSPORT_SENSE_BUFFER SCSI_SENSE_BUFFERSIZE +/* Used by transport_send_check_condition_and_sense() */ +#define SPC_SENSE_KEY_OFFSET 2 +#define SPC_ASC_KEY_OFFSET 12 +#define SPC_ASCQ_KEY_OFFSET 13 +#define TRANSPORT_IQN_LEN 224 +/* Used by target_core_store_alua_lu_gp() and target_core_alua_lu_gp_show_attr_members() */ +#define LU_GROUP_NAME_BUF 256 +/* Used by core_alua_store_tg_pt_gp_info() and target_core_alua_tg_pt_gp_show_attr_members() */ +#define TG_PT_GROUP_NAME_BUF 256 +/* Used to parse VPD into struct t10_vpd */ +#define VPD_TMP_BUF_SIZE 128 +/* Used by transport_generic_cmd_sequencer() */ +#define READ_BLOCK_LEN 6 +#define READ_CAP_LEN 8 +#define READ_POSITION_LEN 20 +#define INQUIRY_LEN 36 +/* Used by transport_get_inquiry_vpd_serial() */ +#define INQUIRY_VPD_SERIAL_LEN 254 +/* Used by transport_get_inquiry_vpd_device_ident() */ +#define INQUIRY_VPD_DEVICE_IDENTIFIER_LEN 254 + +/* struct se_hba->hba_flags */ +enum hba_flags_table { + HBA_FLAGS_INTERNAL_USE = 0x01, + HBA_FLAGS_PSCSI_MODE = 0x02, +}; + +/* struct se_lun->lun_status */ +enum transport_lun_status_table { + TRANSPORT_LUN_STATUS_FREE = 0, + TRANSPORT_LUN_STATUS_ACTIVE = 1, +}; + +/* struct se_portal_group->se_tpg_type */ +enum transport_tpg_type_table { + TRANSPORT_TPG_TYPE_NORMAL = 0, + TRANSPORT_TPG_TYPE_DISCOVERY = 1, +}; + +/* Used for generate timer flags */ +enum timer_flags_table { + TF_RUNNING = 0x01, + TF_STOP = 0x02, +}; + +/* Special transport agnostic struct se_cmd->t_states */ +enum transport_state_table { + TRANSPORT_NO_STATE = 0, + TRANSPORT_NEW_CMD = 1, + TRANSPORT_DEFERRED_CMD = 2, + TRANSPORT_WRITE_PENDING = 3, + TRANSPORT_PROCESS_WRITE = 4, + TRANSPORT_PROCESSING = 5, + TRANSPORT_COMPLETE_OK = 6, + TRANSPORT_COMPLETE_FAILURE = 7, + TRANSPORT_COMPLETE_TIMEOUT = 8, + TRANSPORT_PROCESS_TMR = 9, + TRANSPORT_TMR_COMPLETE = 10, + TRANSPORT_ISTATE_PROCESSING = 11, + TRANSPORT_ISTATE_PROCESSED = 12, + TRANSPORT_KILL = 13, + TRANSPORT_REMOVE = 14, + TRANSPORT_FREE = 15, + TRANSPORT_NEW_CMD_MAP = 16, +}; + +/* Used for struct se_cmd->se_cmd_flags */ +enum se_cmd_flags_table { + SCF_SUPPORTED_SAM_OPCODE = 0x00000001, + SCF_TRANSPORT_TASK_SENSE = 0x00000002, + SCF_EMULATED_TASK_SENSE = 0x00000004, + SCF_SCSI_DATA_SG_IO_CDB = 0x00000008, + SCF_SCSI_CONTROL_SG_IO_CDB = 0x00000010, + SCF_SCSI_CONTROL_NONSG_IO_CDB = 0x00000020, + SCF_SCSI_NON_DATA_CDB = 0x00000040, + SCF_SCSI_CDB_EXCEPTION = 0x00000080, + SCF_SCSI_RESERVATION_CONFLICT = 0x00000100, + SCF_CMD_PASSTHROUGH_NOALLOC = 0x00000200, + SCF_SE_CMD_FAILED = 0x00000400, + SCF_SE_LUN_CMD = 0x00000800, + SCF_SE_ALLOW_EOO = 0x00001000, + SCF_SE_DISABLE_ONLINE_CHECK = 0x00002000, + SCF_SENT_CHECK_CONDITION = 0x00004000, + SCF_OVERFLOW_BIT = 0x00008000, + SCF_UNDERFLOW_BIT = 0x00010000, + SCF_SENT_DELAYED_TAS = 0x00020000, + SCF_ALUA_NON_OPTIMIZED = 0x00040000, + SCF_DELAYED_CMD_FROM_SAM_ATTR = 0x00080000, + SCF_PASSTHROUGH_SG_TO_MEM = 0x00100000, + SCF_PASSTHROUGH_CONTIG_TO_SG = 0x00200000, + SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC = 0x00400000, + SCF_EMULATE_SYNC_CACHE = 0x00800000, + SCF_EMULATE_CDB_ASYNC = 0x01000000, + SCF_EMULATE_SYNC_UNMAP = 0x02000000 +}; + +/* struct se_dev_entry->lun_flags and struct se_lun->lun_access */ +enum transport_lunflags_table { + TRANSPORT_LUNFLAGS_NO_ACCESS = 0x00, + TRANSPORT_LUNFLAGS_INITIATOR_ACCESS = 0x01, + TRANSPORT_LUNFLAGS_READ_ONLY = 0x02, + TRANSPORT_LUNFLAGS_READ_WRITE = 0x04, +}; + +/* struct se_device->dev_status */ +enum transport_device_status_table { + TRANSPORT_DEVICE_ACTIVATED = 0x01, + TRANSPORT_DEVICE_DEACTIVATED = 0x02, + TRANSPORT_DEVICE_QUEUE_FULL = 0x04, + TRANSPORT_DEVICE_SHUTDOWN = 0x08, + TRANSPORT_DEVICE_OFFLINE_ACTIVATED = 0x10, + TRANSPORT_DEVICE_OFFLINE_DEACTIVATED = 0x20, +}; + +/* + * Used by transport_send_check_condition_and_sense() and se_cmd->scsi_sense_reason + * to signal which ASC/ASCQ sense payload should be built. + */ +enum tcm_sense_reason_table { + TCM_NON_EXISTENT_LUN = 0x01, + TCM_UNSUPPORTED_SCSI_OPCODE = 0x02, + TCM_INCORRECT_AMOUNT_OF_DATA = 0x03, + TCM_UNEXPECTED_UNSOLICITED_DATA = 0x04, + TCM_SERVICE_CRC_ERROR = 0x05, + TCM_SNACK_REJECTED = 0x06, + TCM_SECTOR_COUNT_TOO_MANY = 0x07, + TCM_INVALID_CDB_FIELD = 0x08, + TCM_INVALID_PARAMETER_LIST = 0x09, + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE = 0x0a, + TCM_UNKNOWN_MODE_PAGE = 0x0b, + TCM_WRITE_PROTECTED = 0x0c, + TCM_CHECK_CONDITION_ABORT_CMD = 0x0d, + TCM_CHECK_CONDITION_UNIT_ATTENTION = 0x0e, + TCM_CHECK_CONDITION_NOT_READY = 0x0f, +}; + +struct se_obj { + atomic_t obj_access_count; +} ____cacheline_aligned; + +/* + * Used by TCM Core internally to signal if ALUA emulation is enabled or + * disabled, or running in with TCM/pSCSI passthrough mode + */ +typedef enum { + SPC_ALUA_PASSTHROUGH, + SPC2_ALUA_DISABLED, + SPC3_ALUA_EMULATED +} t10_alua_index_t; + +/* + * Used by TCM Core internally to signal if SAM Task Attribute emulation + * is enabled or disabled, or running in with TCM/pSCSI passthrough mode + */ +typedef enum { + SAM_TASK_ATTR_PASSTHROUGH, + SAM_TASK_ATTR_UNTAGGED, + SAM_TASK_ATTR_EMULATED +} t10_task_attr_index_t; + +struct se_cmd; + +struct t10_alua { + t10_alua_index_t alua_type; + /* ALUA Target Port Group ID */ + u16 alua_tg_pt_gps_counter; + u32 alua_tg_pt_gps_count; + spinlock_t tg_pt_gps_lock; + struct se_subsystem_dev *t10_sub_dev; + /* Used for default ALUA Target Port Group */ + struct t10_alua_tg_pt_gp *default_tg_pt_gp; + /* Used for default ALUA Target Port Group ConfigFS group */ + struct config_group alua_tg_pt_gps_group; + int (*alua_state_check)(struct se_cmd *, unsigned char *, u8 *); + struct list_head tg_pt_gps_list; +} ____cacheline_aligned; + +struct t10_alua_lu_gp { + u16 lu_gp_id; + int lu_gp_valid_id; + u32 lu_gp_members; + atomic_t lu_gp_shutdown; + atomic_t lu_gp_ref_cnt; + spinlock_t lu_gp_lock; + struct config_group lu_gp_group; + struct list_head lu_gp_list; + struct list_head lu_gp_mem_list; +} ____cacheline_aligned; + +struct t10_alua_lu_gp_member { + int lu_gp_assoc:1; + atomic_t lu_gp_mem_ref_cnt; + spinlock_t lu_gp_mem_lock; + struct t10_alua_lu_gp *lu_gp; + struct se_device *lu_gp_mem_dev; + struct list_head lu_gp_mem_list; +} ____cacheline_aligned; + +struct t10_alua_tg_pt_gp { + u16 tg_pt_gp_id; + int tg_pt_gp_valid_id; + int tg_pt_gp_alua_access_status; + int tg_pt_gp_alua_access_type; + int tg_pt_gp_nonop_delay_msecs; + int tg_pt_gp_trans_delay_msecs; + int tg_pt_gp_pref; + int tg_pt_gp_write_metadata; + /* Used by struct t10_alua_tg_pt_gp->tg_pt_gp_md_buf_len */ +#define ALUA_MD_BUF_LEN 1024 + u32 tg_pt_gp_md_buf_len; + u32 tg_pt_gp_members; + atomic_t tg_pt_gp_alua_access_state; + atomic_t tg_pt_gp_ref_cnt; + spinlock_t tg_pt_gp_lock; + struct mutex tg_pt_gp_md_mutex; + struct se_subsystem_dev *tg_pt_gp_su_dev; + struct config_group tg_pt_gp_group; + struct list_head tg_pt_gp_list; + struct list_head tg_pt_gp_mem_list; +} ____cacheline_aligned; + +struct t10_alua_tg_pt_gp_member { + int tg_pt_gp_assoc:1; + atomic_t tg_pt_gp_mem_ref_cnt; + spinlock_t tg_pt_gp_mem_lock; + struct t10_alua_tg_pt_gp *tg_pt_gp; + struct se_port *tg_pt; + struct list_head tg_pt_gp_mem_list; +} ____cacheline_aligned; + +struct t10_vpd { + unsigned char device_identifier[INQUIRY_VPD_DEVICE_IDENTIFIER_LEN]; + int protocol_identifier_set; + u32 protocol_identifier; + u32 device_identifier_code_set; + u32 association; + u32 device_identifier_type; + struct list_head vpd_list; +} ____cacheline_aligned; + +struct t10_wwn { + unsigned char vendor[8]; + unsigned char model[16]; + unsigned char revision[4]; + unsigned char unit_serial[INQUIRY_VPD_SERIAL_LEN]; + spinlock_t t10_vpd_lock; + struct se_subsystem_dev *t10_sub_dev; + struct config_group t10_wwn_group; + struct list_head t10_vpd_list; +} ____cacheline_aligned; + + +/* + * Used by TCM Core internally to signal if >= SPC-3 peristent reservations + * emulation is enabled or disabled, or running in with TCM/pSCSI passthrough + * mode + */ +typedef enum { + SPC_PASSTHROUGH, + SPC2_RESERVATIONS, + SPC3_PERSISTENT_RESERVATIONS +} t10_reservations_index_t; + +struct t10_pr_registration { + /* Used for fabrics that contain WWN+ISID */ +#define PR_REG_ISID_LEN 16 + /* PR_REG_ISID_LEN + ',i,0x' */ +#define PR_REG_ISID_ID_LEN (PR_REG_ISID_LEN + 5) + char pr_reg_isid[PR_REG_ISID_LEN]; + /* Used during APTPL metadata reading */ +#define PR_APTPL_MAX_IPORT_LEN 256 + unsigned char pr_iport[PR_APTPL_MAX_IPORT_LEN]; + /* Used during APTPL metadata reading */ +#define PR_APTPL_MAX_TPORT_LEN 256 + unsigned char pr_tport[PR_APTPL_MAX_TPORT_LEN]; + /* For writing out live meta data */ + unsigned char *pr_aptpl_buf; + u16 pr_aptpl_rpti; + u16 pr_reg_tpgt; + /* Reservation effects all target ports */ + int pr_reg_all_tg_pt; + /* Activate Persistence across Target Power Loss */ + int pr_reg_aptpl; + int pr_res_holder; + int pr_res_type; + int pr_res_scope; + /* Used for fabric initiator WWPNs using a ISID */ + int isid_present_at_reg:1; + u32 pr_res_mapped_lun; + u32 pr_aptpl_target_lun; + u32 pr_res_generation; + u64 pr_reg_bin_isid; + u64 pr_res_key; + atomic_t pr_res_holders; + struct se_node_acl *pr_reg_nacl; + struct se_dev_entry *pr_reg_deve; + struct se_lun *pr_reg_tg_pt_lun; + struct list_head pr_reg_list; + struct list_head pr_reg_abort_list; + struct list_head pr_reg_aptpl_list; + struct list_head pr_reg_atp_list; + struct list_head pr_reg_atp_mem_list; +} ____cacheline_aligned; + +/* + * This set of function pointer ops is set based upon SPC3_PERSISTENT_RESERVATIONS, + * SPC2_RESERVATIONS or SPC_PASSTHROUGH in drivers/target/target_core_pr.c: + * core_setup_reservations() + */ +struct t10_reservation_ops { + int (*t10_reservation_check)(struct se_cmd *, u32 *); + int (*t10_seq_non_holder)(struct se_cmd *, unsigned char *, u32); + int (*t10_pr_register)(struct se_cmd *); + int (*t10_pr_clear)(struct se_cmd *); +}; + +struct t10_reservation_template { + /* Reservation effects all target ports */ + int pr_all_tg_pt; + /* Activate Persistence across Target Power Loss enabled + * for SCSI device */ + int pr_aptpl_active; + /* Used by struct t10_reservation_template->pr_aptpl_buf_len */ +#define PR_APTPL_BUF_LEN 8192 + u32 pr_aptpl_buf_len; + u32 pr_generation; + t10_reservations_index_t res_type; + spinlock_t registration_lock; + spinlock_t aptpl_reg_lock; + /* + * This will always be set by one individual I_T Nexus. + * However with all_tg_pt=1, other I_T Nexus from the + * same initiator can access PR reg/res info on a different + * target port. + * + * There is also the 'All Registrants' case, where there is + * a single *pr_res_holder of the reservation, but all + * registrations are considered reservation holders. + */ + struct se_node_acl *pr_res_holder; + struct list_head registration_list; + struct list_head aptpl_reg_list; + struct t10_reservation_ops pr_ops; +} ____cacheline_aligned; + +struct se_queue_req { + int state; + void *cmd; + struct list_head qr_list; +} ____cacheline_aligned; + +struct se_queue_obj { + atomic_t queue_cnt; + spinlock_t cmd_queue_lock; + struct list_head qobj_list; + wait_queue_head_t thread_wq; +} ____cacheline_aligned; + +/* + * Used one per struct se_cmd to hold all extra struct se_task + * metadata. This structure is setup and allocated in + * drivers/target/target_core_transport.c:__transport_alloc_se_cmd() + */ +struct se_transport_task { + unsigned char *t_task_cdb; + unsigned char __t_task_cdb[TCM_MAX_COMMAND_SIZE]; + unsigned long long t_task_lba; + int t_tasks_failed; + int t_tasks_fua; + int t_tasks_bidi:1; + u32 t_task_cdbs; + u32 t_tasks_check; + u32 t_tasks_no; + u32 t_tasks_sectors; + u32 t_tasks_se_num; + u32 t_tasks_se_bidi_num; + u32 t_tasks_sg_chained_no; + atomic_t t_fe_count; + atomic_t t_se_count; + atomic_t t_task_cdbs_left; + atomic_t t_task_cdbs_ex_left; + atomic_t t_task_cdbs_timeout_left; + atomic_t t_task_cdbs_sent; + atomic_t t_transport_aborted; + atomic_t t_transport_active; + atomic_t t_transport_complete; + atomic_t t_transport_queue_active; + atomic_t t_transport_sent; + atomic_t t_transport_stop; + atomic_t t_transport_timeout; + atomic_t transport_dev_active; + atomic_t transport_lun_active; + atomic_t transport_lun_fe_stop; + atomic_t transport_lun_stop; + spinlock_t t_state_lock; + struct completion t_transport_stop_comp; + struct completion transport_lun_fe_stop_comp; + struct completion transport_lun_stop_comp; + struct scatterlist *t_tasks_sg_chained; + struct scatterlist t_tasks_sg_bounce; + void *t_task_buf; + /* + * Used for pre-registered fabric SGL passthrough WRITE and READ + * with the special SCF_PASSTHROUGH_CONTIG_TO_SG case for TCM_Loop + * and other HW target mode fabric modules. + */ + struct scatterlist *t_task_pt_sgl; + struct list_head *t_mem_list; + /* Used for BIDI READ */ + struct list_head *t_mem_bidi_list; + struct list_head t_task_list; +} ____cacheline_aligned; + +struct se_task { + unsigned char task_sense; + struct scatterlist *task_sg; + struct scatterlist *task_sg_bidi; + u8 task_scsi_status; + u8 task_flags; + int task_error_status; + int task_state_flags; + int task_padded_sg:1; + unsigned long long task_lba; + u32 task_no; + u32 task_sectors; + u32 task_size; + u32 task_sg_num; + u32 task_sg_offset; + enum dma_data_direction task_data_direction; + struct se_cmd *task_se_cmd; + struct se_device *se_dev; + struct completion task_stop_comp; + atomic_t task_active; + atomic_t task_execute_queue; + atomic_t task_timeout; + atomic_t task_sent; + atomic_t task_stop; + atomic_t task_state_active; + struct timer_list task_timer; + struct se_device *se_obj_ptr; + struct list_head t_list; + struct list_head t_execute_list; + struct list_head t_state_list; +} ____cacheline_aligned; + +#define TASK_CMD(task) ((struct se_cmd *)task->task_se_cmd) +#define TASK_DEV(task) ((struct se_device *)task->se_dev) + +struct se_cmd { + /* SAM response code being sent to initiator */ + u8 scsi_status; + u8 scsi_asc; + u8 scsi_ascq; + u8 scsi_sense_reason; + u16 scsi_sense_length; + /* Delay for ALUA Active/NonOptimized state access in milliseconds */ + int alua_nonop_delay; + /* See include/linux/dma-mapping.h */ + enum dma_data_direction data_direction; + /* For SAM Task Attribute */ + int sam_task_attr; + /* Transport protocol dependent state, see transport_state_table */ + enum transport_state_table t_state; + /* Transport protocol dependent state for out of order CmdSNs */ + int deferred_t_state; + /* Transport specific error status */ + int transport_error_status; + /* See se_cmd_flags_table */ + u32 se_cmd_flags; + u32 se_ordered_id; + /* Total size in bytes associated with command */ + u32 data_length; + /* SCSI Presented Data Transfer Length */ + u32 cmd_spdtl; + u32 residual_count; + u32 orig_fe_lun; + /* Persistent Reservation key */ + u64 pr_res_key; + atomic_t transport_sent; + /* Used for sense data */ + void *sense_buffer; + struct list_head se_delayed_list; + struct list_head se_ordered_list; + struct list_head se_lun_list; + struct se_device *se_dev; + struct se_dev_entry *se_deve; + struct se_device *se_obj_ptr; + struct se_device *se_orig_obj_ptr; + struct se_lun *se_lun; + /* Only used for internal passthrough and legacy TCM fabric modules */ + struct se_session *se_sess; + struct se_tmr_req *se_tmr_req; + /* t_task is setup to t_task_backstore in transport_init_se_cmd() */ + struct se_transport_task *t_task; + struct se_transport_task t_task_backstore; + struct target_core_fabric_ops *se_tfo; + int (*transport_emulate_cdb)(struct se_cmd *); + void (*transport_split_cdb)(unsigned long long, u32 *, unsigned char *); + void (*transport_wait_for_tasks)(struct se_cmd *, int, int); + void (*transport_complete_callback)(struct se_cmd *); +} ____cacheline_aligned; + +#define T_TASK(cmd) ((struct se_transport_task *)(cmd->t_task)) +#define CMD_TFO(cmd) ((struct target_core_fabric_ops *)cmd->se_tfo) + +struct se_tmr_req { + /* Task Management function to be preformed */ + u8 function; + /* Task Management response to send */ + u8 response; + int call_transport; + /* Reference to ITT that Task Mgmt should be preformed */ + u32 ref_task_tag; + /* 64-bit encoded SAM LUN from $FABRIC_MOD TMR header */ + u64 ref_task_lun; + void *fabric_tmr_ptr; + struct se_cmd *task_cmd; + struct se_cmd *ref_cmd; + struct se_device *tmr_dev; + struct se_lun *tmr_lun; + struct list_head tmr_list; +} ____cacheline_aligned; + +struct se_ua { + u8 ua_asc; + u8 ua_ascq; + struct se_node_acl *ua_nacl; + struct list_head ua_dev_list; + struct list_head ua_nacl_list; +} ____cacheline_aligned; + +struct se_node_acl { + char initiatorname[TRANSPORT_IQN_LEN]; + /* Used to signal demo mode created ACL, disabled by default */ + int dynamic_node_acl:1; + u32 queue_depth; + u32 acl_index; + u64 num_cmds; + u64 read_bytes; + u64 write_bytes; + spinlock_t stats_lock; + /* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */ + atomic_t acl_pr_ref_count; + /* Used for MIB access */ + atomic_t mib_ref_count; + struct se_dev_entry *device_list; + struct se_session *nacl_sess; + struct se_portal_group *se_tpg; + spinlock_t device_list_lock; + spinlock_t nacl_sess_lock; + struct config_group acl_group; + struct config_group acl_attrib_group; + struct config_group acl_auth_group; + struct config_group acl_param_group; + struct config_group *acl_default_groups[4]; + struct list_head acl_list; + struct list_head acl_sess_list; +} ____cacheline_aligned; + +struct se_session { + /* Used for MIB access */ + atomic_t mib_ref_count; + u64 sess_bin_isid; + struct se_node_acl *se_node_acl; + struct se_portal_group *se_tpg; + void *fabric_sess_ptr; + struct list_head sess_list; + struct list_head sess_acl_list; +} ____cacheline_aligned; + +#define SE_SESS(cmd) ((struct se_session *)(cmd)->se_sess) +#define SE_NODE_ACL(sess) ((struct se_node_acl *)(sess)->se_node_acl) + +struct se_device; +struct se_transform_info; +struct scatterlist; + +struct se_lun_acl { + char initiatorname[TRANSPORT_IQN_LEN]; + u32 mapped_lun; + struct se_node_acl *se_lun_nacl; + struct se_lun *se_lun; + struct list_head lacl_list; + struct config_group se_lun_group; +} ____cacheline_aligned; + +struct se_dev_entry { + int def_pr_registered:1; + /* See transport_lunflags_table */ + u32 lun_flags; + u32 deve_cmds; + u32 mapped_lun; + u32 average_bytes; + u32 last_byte_count; + u32 total_cmds; + u32 total_bytes; + u64 pr_res_key; + u64 creation_time; + u32 attach_count; + u64 read_bytes; + u64 write_bytes; + atomic_t ua_count; + /* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */ + atomic_t pr_ref_count; + struct se_lun_acl *se_lun_acl; + spinlock_t ua_lock; + struct se_lun *se_lun; + struct list_head alua_port_list; + struct list_head ua_list; +} ____cacheline_aligned; + +struct se_dev_limits { + /* Max supported HW queue depth */ + u32 hw_queue_depth; + /* Max supported virtual queue depth */ + u32 queue_depth; + /* From include/linux/blkdev.h for the other HW/SW limits. */ + struct queue_limits limits; +} ____cacheline_aligned; + +struct se_dev_attrib { + int emulate_dpo; + int emulate_fua_write; + int emulate_fua_read; + int emulate_write_cache; + int emulate_ua_intlck_ctrl; + int emulate_tas; + int emulate_tpu; + int emulate_tpws; + int emulate_reservations; + int emulate_alua; + int enforce_pr_isids; + u32 hw_block_size; + u32 block_size; + u32 hw_max_sectors; + u32 max_sectors; + u32 optimal_sectors; + u32 hw_queue_depth; + u32 queue_depth; + u32 task_timeout; + u32 max_unmap_lba_count; + u32 max_unmap_block_desc_count; + u32 unmap_granularity; + u32 unmap_granularity_alignment; + struct se_subsystem_dev *da_sub_dev; + struct config_group da_group; +} ____cacheline_aligned; + +struct se_subsystem_dev { +/* Used for struct se_subsystem_dev-->se_dev_alias, must be less than PAGE_SIZE */ +#define SE_DEV_ALIAS_LEN 512 + unsigned char se_dev_alias[SE_DEV_ALIAS_LEN]; +/* Used for struct se_subsystem_dev->se_dev_udev_path[], must be less than PAGE_SIZE */ +#define SE_UDEV_PATH_LEN 512 + unsigned char se_dev_udev_path[SE_UDEV_PATH_LEN]; + u32 su_dev_flags; + struct se_hba *se_dev_hba; + struct se_device *se_dev_ptr; + struct se_dev_attrib se_dev_attrib; + /* T10 Asymmetric Logical Unit Assignment for Target Ports */ + struct t10_alua t10_alua; + /* T10 Inquiry and VPD WWN Information */ + struct t10_wwn t10_wwn; + /* T10 SPC-2 + SPC-3 Reservations */ + struct t10_reservation_template t10_reservation; + spinlock_t se_dev_lock; + void *se_dev_su_ptr; + struct list_head g_se_dev_list; + struct config_group se_dev_group; + /* For T10 Reservations */ + struct config_group se_dev_pr_group; +} ____cacheline_aligned; + +#define T10_ALUA(su_dev) (&(su_dev)->t10_alua) +#define T10_RES(su_dev) (&(su_dev)->t10_reservation) +#define T10_PR_OPS(su_dev) (&(su_dev)->t10_reservation.pr_ops) + +struct se_device { + /* Set to 1 if thread is NOT sleeping on thread_sem */ + u8 thread_active; + u8 dev_status_timer_flags; + /* RELATIVE TARGET PORT IDENTIFER Counter */ + u16 dev_rpti_counter; + /* Used for SAM Task Attribute ordering */ + u32 dev_cur_ordered_id; + u32 dev_flags; + u32 dev_port_count; + /* See transport_device_status_table */ + u32 dev_status; + u32 dev_tcq_window_closed; + /* Physical device queue depth */ + u32 queue_depth; + /* Used for SPC-2 reservations enforce of ISIDs */ + u64 dev_res_bin_isid; + t10_task_attr_index_t dev_task_attr_type; + /* Pointer to transport specific device structure */ + void *dev_ptr; + u32 dev_index; + u64 creation_time; + u32 num_resets; + u64 num_cmds; + u64 read_bytes; + u64 write_bytes; + spinlock_t stats_lock; + /* Active commands on this virtual SE device */ + atomic_t active_cmds; + atomic_t simple_cmds; + atomic_t depth_left; + atomic_t dev_ordered_id; + atomic_t dev_tur_active; + atomic_t execute_tasks; + atomic_t dev_status_thr_count; + atomic_t dev_hoq_count; + atomic_t dev_ordered_sync; + struct se_obj dev_obj; + struct se_obj dev_access_obj; + struct se_obj dev_export_obj; + struct se_queue_obj *dev_queue_obj; + struct se_queue_obj *dev_status_queue_obj; + spinlock_t delayed_cmd_lock; + spinlock_t ordered_cmd_lock; + spinlock_t execute_task_lock; + spinlock_t state_task_lock; + spinlock_t dev_alua_lock; + spinlock_t dev_reservation_lock; + spinlock_t dev_state_lock; + spinlock_t dev_status_lock; + spinlock_t dev_status_thr_lock; + spinlock_t se_port_lock; + spinlock_t se_tmr_lock; + /* Used for legacy SPC-2 reservationsa */ + struct se_node_acl *dev_reserved_node_acl; + /* Used for ALUA Logical Unit Group membership */ + struct t10_alua_lu_gp_member *dev_alua_lu_gp_mem; + /* Used for SPC-3 Persistent Reservations */ + struct t10_pr_registration *dev_pr_res_holder; + struct list_head dev_sep_list; + struct list_head dev_tmr_list; + struct timer_list dev_status_timer; + /* Pointer to descriptor for processing thread */ + struct task_struct *process_thread; + pid_t process_thread_pid; + struct task_struct *dev_mgmt_thread; + struct list_head delayed_cmd_list; + struct list_head ordered_cmd_list; + struct list_head execute_task_list; + struct list_head state_task_list; + /* Pointer to associated SE HBA */ + struct se_hba *se_hba; + struct se_subsystem_dev *se_sub_dev; + /* Pointer to template of function pointers for transport */ + struct se_subsystem_api *transport; + /* Linked list for struct se_hba struct se_device list */ + struct list_head dev_list; + /* Linked list for struct se_global->g_se_dev_list */ + struct list_head g_se_dev_list; +} ____cacheline_aligned; + +#define SE_DEV(cmd) ((struct se_device *)(cmd)->se_lun->lun_se_dev) +#define SU_DEV(dev) ((struct se_subsystem_dev *)(dev)->se_sub_dev) +#define DEV_ATTRIB(dev) (&(dev)->se_sub_dev->se_dev_attrib) +#define DEV_T10_WWN(dev) (&(dev)->se_sub_dev->t10_wwn) + +struct se_hba { + u16 hba_tpgt; + u32 hba_id; + /* See hba_flags_table */ + u32 hba_flags; + /* Virtual iSCSI devices attached. */ + u32 dev_count; + u32 hba_index; + atomic_t dev_mib_access_count; + atomic_t load_balance_queue; + atomic_t left_queue_depth; + /* Maximum queue depth the HBA can handle. */ + atomic_t max_queue_depth; + /* Pointer to transport specific host structure. */ + void *hba_ptr; + /* Linked list for struct se_device */ + struct list_head hba_dev_list; + struct list_head hba_list; + spinlock_t device_lock; + spinlock_t hba_queue_lock; + struct config_group hba_group; + struct mutex hba_access_mutex; + struct se_subsystem_api *transport; +} ____cacheline_aligned; + +#define SE_HBA(d) ((struct se_hba *)(d)->se_hba) + +struct se_lun { + /* See transport_lun_status_table */ + enum transport_lun_status_table lun_status; + u32 lun_access; + u32 lun_flags; + u32 unpacked_lun; + atomic_t lun_acl_count; + spinlock_t lun_acl_lock; + spinlock_t lun_cmd_lock; + spinlock_t lun_sep_lock; + struct completion lun_shutdown_comp; + struct list_head lun_cmd_list; + struct list_head lun_acl_list; + struct se_device *lun_se_dev; + struct config_group lun_group; + struct se_port *lun_sep; +} ____cacheline_aligned; + +#define SE_LUN(c) ((struct se_lun *)(c)->se_lun) + +struct se_port { + /* RELATIVE TARGET PORT IDENTIFER */ + u16 sep_rtpi; + int sep_tg_pt_secondary_stat; + int sep_tg_pt_secondary_write_md; + u32 sep_index; + struct scsi_port_stats sep_stats; + /* Used for ALUA Target Port Groups membership */ + atomic_t sep_tg_pt_gp_active; + atomic_t sep_tg_pt_secondary_offline; + /* Used for PR ALL_TG_PT=1 */ + atomic_t sep_tg_pt_ref_cnt; + spinlock_t sep_alua_lock; + struct mutex sep_tg_pt_md_mutex; + struct t10_alua_tg_pt_gp_member *sep_alua_tg_pt_gp_mem; + struct se_lun *sep_lun; + struct se_portal_group *sep_tpg; + struct list_head sep_alua_list; + struct list_head sep_list; +} ____cacheline_aligned; + +struct se_tpg_np { + struct config_group tpg_np_group; +} ____cacheline_aligned; + +struct se_portal_group { + /* Type of target portal group, see transport_tpg_type_table */ + enum transport_tpg_type_table se_tpg_type; + /* Number of ACLed Initiator Nodes for this TPG */ + u32 num_node_acls; + /* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */ + atomic_t tpg_pr_ref_count; + /* Spinlock for adding/removing ACLed Nodes */ + spinlock_t acl_node_lock; + /* Spinlock for adding/removing sessions */ + spinlock_t session_lock; + spinlock_t tpg_lun_lock; + /* Pointer to $FABRIC_MOD portal group */ + void *se_tpg_fabric_ptr; + struct list_head se_tpg_list; + /* linked list for initiator ACL list */ + struct list_head acl_node_list; + struct se_lun *tpg_lun_list; + struct se_lun tpg_virt_lun0; + /* List of TCM sessions assoicated wth this TPG */ + struct list_head tpg_sess_list; + /* Pointer to $FABRIC_MOD dependent code */ + struct target_core_fabric_ops *se_tpg_tfo; + struct se_wwn *se_tpg_wwn; + struct config_group tpg_group; + struct config_group *tpg_default_groups[6]; + struct config_group tpg_lun_group; + struct config_group tpg_np_group; + struct config_group tpg_acl_group; + struct config_group tpg_attrib_group; + struct config_group tpg_param_group; +} ____cacheline_aligned; + +#define TPG_TFO(se_tpg) ((struct target_core_fabric_ops *)(se_tpg)->se_tpg_tfo) + +struct se_wwn { + struct target_fabric_configfs *wwn_tf; + struct config_group wwn_group; +} ____cacheline_aligned; + +struct se_global { + u16 alua_lu_gps_counter; + int g_sub_api_initialized; + u32 in_shutdown; + u32 alua_lu_gps_count; + u32 g_hba_id_counter; + struct config_group target_core_hbagroup; + struct config_group alua_group; + struct config_group alua_lu_gps_group; + struct list_head g_lu_gps_list; + struct list_head g_se_tpg_list; + struct list_head g_hba_list; + struct list_head g_se_dev_list; + struct se_hba *g_lun0_hba; + struct se_subsystem_dev *g_lun0_su_dev; + struct se_device *g_lun0_dev; + struct t10_alua_lu_gp *default_lu_gp; + spinlock_t g_device_lock; + spinlock_t hba_lock; + spinlock_t se_tpg_lock; + spinlock_t lu_gps_lock; + spinlock_t plugin_class_lock; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_BASE_H */ diff --git a/include/target/target_core_configfs.h b/include/target/target_core_configfs.h new file mode 100644 index 0000000..40e6e74 --- /dev/null +++ b/include/target/target_core_configfs.h @@ -0,0 +1,52 @@ +#define TARGET_CORE_CONFIGFS_VERSION TARGET_CORE_MOD_VERSION + +#define TARGET_CORE_CONFIG_ROOT "/sys/kernel/config" + +#define TARGET_CORE_NAME_MAX_LEN 64 +#define TARGET_FABRIC_NAME_SIZE 32 + +extern struct target_fabric_configfs *target_fabric_configfs_init( + struct module *, const char *); +extern void target_fabric_configfs_free(struct target_fabric_configfs *); +extern int target_fabric_configfs_register(struct target_fabric_configfs *); +extern void target_fabric_configfs_deregister(struct target_fabric_configfs *); + +struct target_fabric_configfs_template { + struct config_item_type tfc_discovery_cit; + struct config_item_type tfc_wwn_cit; + struct config_item_type tfc_tpg_cit; + struct config_item_type tfc_tpg_base_cit; + struct config_item_type tfc_tpg_lun_cit; + struct config_item_type tfc_tpg_port_cit; + struct config_item_type tfc_tpg_np_cit; + struct config_item_type tfc_tpg_np_base_cit; + struct config_item_type tfc_tpg_attrib_cit; + struct config_item_type tfc_tpg_param_cit; + struct config_item_type tfc_tpg_nacl_cit; + struct config_item_type tfc_tpg_nacl_base_cit; + struct config_item_type tfc_tpg_nacl_attrib_cit; + struct config_item_type tfc_tpg_nacl_auth_cit; + struct config_item_type tfc_tpg_nacl_param_cit; + struct config_item_type tfc_tpg_mappedlun_cit; +}; + +struct target_fabric_configfs { + char tf_name[TARGET_FABRIC_NAME_SIZE]; + atomic_t tf_access_cnt; + struct list_head tf_list; + struct config_group tf_group; + struct config_group tf_disc_group; + struct config_group *tf_default_groups[2]; + /* Pointer to fabric's config_item */ + struct config_item *tf_fabric; + /* Passed from fabric modules */ + struct config_item_type *tf_fabric_cit; + /* Pointer to target core subsystem */ + struct configfs_subsystem *tf_subsys; + /* Pointer to fabric's struct module */ + struct module *tf_module; + struct target_core_fabric_ops tf_ops; + struct target_fabric_configfs_template tf_cit_tmpl; +}; + +#define TF_CIT_TMPL(tf) (&(tf)->tf_cit_tmpl) diff --git a/include/target/target_core_device.h b/include/target/target_core_device.h new file mode 100644 index 0000000..52b18a5 --- /dev/null +++ b/include/target/target_core_device.h @@ -0,0 +1,61 @@ +#ifndef TARGET_CORE_DEVICE_H +#define TARGET_CORE_DEVICE_H + +extern int transport_get_lun_for_cmd(struct se_cmd *, unsigned char *, u32); +extern int transport_get_lun_for_tmr(struct se_cmd *, u32); +extern struct se_dev_entry *core_get_se_deve_from_rtpi( + struct se_node_acl *, u16); +extern int core_free_device_list_for_node(struct se_node_acl *, + struct se_portal_group *); +extern void core_dec_lacl_count(struct se_node_acl *, struct se_cmd *); +extern void core_update_device_list_access(u32, u32, struct se_node_acl *); +extern int core_update_device_list_for_node(struct se_lun *, struct se_lun_acl *, u32, + u32, struct se_node_acl *, + struct se_portal_group *, int); +extern void core_clear_lun_from_tpg(struct se_lun *, struct se_portal_group *); +extern int core_dev_export(struct se_device *, struct se_portal_group *, + struct se_lun *); +extern void core_dev_unexport(struct se_device *, struct se_portal_group *, + struct se_lun *); +extern int transport_core_report_lun_response(struct se_cmd *); +extern void se_release_device_for_hba(struct se_device *); +extern void se_release_vpd_for_dev(struct se_device *); +extern void se_clear_dev_ports(struct se_device *); +extern int se_free_virtual_device(struct se_device *, struct se_hba *); +extern int se_dev_check_online(struct se_device *); +extern int se_dev_check_shutdown(struct se_device *); +extern void se_dev_set_default_attribs(struct se_device *, struct se_dev_limits *); +extern int se_dev_set_task_timeout(struct se_device *, u32); +extern int se_dev_set_max_unmap_lba_count(struct se_device *, u32); +extern int se_dev_set_max_unmap_block_desc_count(struct se_device *, u32); +extern int se_dev_set_unmap_granularity(struct se_device *, u32); +extern int se_dev_set_unmap_granularity_alignment(struct se_device *, u32); +extern int se_dev_set_emulate_dpo(struct se_device *, int); +extern int se_dev_set_emulate_fua_write(struct se_device *, int); +extern int se_dev_set_emulate_fua_read(struct se_device *, int); +extern int se_dev_set_emulate_write_cache(struct se_device *, int); +extern int se_dev_set_emulate_ua_intlck_ctrl(struct se_device *, int); +extern int se_dev_set_emulate_tas(struct se_device *, int); +extern int se_dev_set_emulate_tpu(struct se_device *, int); +extern int se_dev_set_emulate_tpws(struct se_device *, int); +extern int se_dev_set_enforce_pr_isids(struct se_device *, int); +extern int se_dev_set_queue_depth(struct se_device *, u32); +extern int se_dev_set_max_sectors(struct se_device *, u32); +extern int se_dev_set_optimal_sectors(struct se_device *, u32); +extern int se_dev_set_block_size(struct se_device *, u32); +extern struct se_lun *core_dev_add_lun(struct se_portal_group *, struct se_hba *, + struct se_device *, u32); +extern int core_dev_del_lun(struct se_portal_group *, u32); +extern struct se_lun *core_get_lun_from_tpg(struct se_portal_group *, u32); +extern struct se_lun_acl *core_dev_init_initiator_node_lun_acl(struct se_portal_group *, + u32, char *, int *); +extern int core_dev_add_initiator_node_lun_acl(struct se_portal_group *, + struct se_lun_acl *, u32, u32); +extern int core_dev_del_initiator_node_lun_acl(struct se_portal_group *, + struct se_lun *, struct se_lun_acl *); +extern void core_dev_free_initiator_node_lun_acl(struct se_portal_group *, + struct se_lun_acl *lacl); +extern int core_dev_setup_virtual_lun0(void); +extern void core_dev_release_virtual_lun0(void); + +#endif /* TARGET_CORE_DEVICE_H */ diff --git a/include/target/target_core_fabric_configfs.h b/include/target/target_core_fabric_configfs.h new file mode 100644 index 0000000..a26fb75 --- /dev/null +++ b/include/target/target_core_fabric_configfs.h @@ -0,0 +1,106 @@ +/* + * Used for tfc_wwn_cit attributes + */ + +#include + +CONFIGFS_EATTR_STRUCT(target_fabric_nacl_attrib, se_node_acl); +#define TF_NACL_ATTRIB_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_nacl_attrib_attribute _fabric##_nacl_attrib_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_nacl_attrib_show_##_name, \ + _fabric##_nacl_attrib_store_##_name); + +CONFIGFS_EATTR_STRUCT(target_fabric_nacl_auth, se_node_acl); +#define TF_NACL_AUTH_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_nacl_auth_attribute _fabric##_nacl_auth_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_nacl_auth_show_##_name, \ + _fabric##_nacl_auth_store_##_name); + +#define TF_NACL_AUTH_ATTR_RO(_fabric, _name) \ +static struct target_fabric_nacl_auth_attribute _fabric##_nacl_auth_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + _fabric##_nacl_auth_show_##_name); + +CONFIGFS_EATTR_STRUCT(target_fabric_nacl_param, se_node_acl); +#define TF_NACL_PARAM_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_nacl_param_attribute _fabric##_nacl_param_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_nacl_param_show_##_name, \ + _fabric##_nacl_param_store_##_name); + +#define TF_NACL_PARAM_ATTR_RO(_fabric, _name) \ +static struct target_fabric_nacl_param_attribute _fabric##_nacl_param_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + _fabric##_nacl_param_show_##_name); + + +CONFIGFS_EATTR_STRUCT(target_fabric_nacl_base, se_node_acl); +#define TF_NACL_BASE_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_nacl_base_attribute _fabric##_nacl_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_nacl_show_##_name, \ + _fabric##_nacl_store_##_name); + +#define TF_NACL_BASE_ATTR_RO(_fabric, _name) \ +static struct target_fabric_nacl_base_attribute _fabric##_nacl_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + _fabric##_nacl_show_##_name); + +CONFIGFS_EATTR_STRUCT(target_fabric_np_base, se_tpg_np); +#define TF_NP_BASE_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_np_base_attribute _fabric##_np_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_np_show_##_name, \ + _fabric##_np_store_##_name); + +CONFIGFS_EATTR_STRUCT(target_fabric_tpg_attrib, se_portal_group); +#define TF_TPG_ATTRIB_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_tpg_attrib_attribute _fabric##_tpg_attrib_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_tpg_attrib_show_##_name, \ + _fabric##_tpg_attrib_store_##_name); + + +CONFIGFS_EATTR_STRUCT(target_fabric_tpg_param, se_portal_group); +#define TF_TPG_PARAM_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_tpg_param_attribute _fabric##_tpg_param_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_tpg_param_show_##_name, \ + _fabric##_tpg_param_store_##_name); + + +CONFIGFS_EATTR_STRUCT(target_fabric_tpg, se_portal_group); +#define TF_TPG_BASE_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_tpg_attribute _fabric##_tpg_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_tpg_show_##_name, \ + _fabric##_tpg_store_##_name); + + +CONFIGFS_EATTR_STRUCT(target_fabric_wwn, target_fabric_configfs); +#define TF_WWN_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_wwn_attribute _fabric##_wwn_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_wwn_show_attr_##_name, \ + _fabric##_wwn_store_attr_##_name); + +#define TF_WWN_ATTR_RO(_fabric, _name) \ +static struct target_fabric_wwn_attribute _fabric##_wwn_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + _fabric##_wwn_show_attr_##_name); + +CONFIGFS_EATTR_STRUCT(target_fabric_discovery, target_fabric_configfs); +#define TF_DISC_ATTR(_fabric, _name, _mode) \ +static struct target_fabric_discovery_attribute _fabric##_disc_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + _fabric##_disc_show_##_name, \ + _fabric##_disc_store_##_name); + +#define TF_DISC_ATTR_RO(_fabric, _name) \ +static struct target_fabric_discovery_attribute _fabric##_disc_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + _fabric##_disc_show_##_name); + +extern int target_fabric_setup_cits(struct target_fabric_configfs *); diff --git a/include/target/target_core_fabric_lib.h b/include/target/target_core_fabric_lib.h new file mode 100644 index 0000000..c2f8d0e --- /dev/null +++ b/include/target/target_core_fabric_lib.h @@ -0,0 +1,28 @@ +#ifndef TARGET_CORE_FABRIC_LIB_H +#define TARGET_CORE_FABRIC_LIB_H + +extern u8 sas_get_fabric_proto_ident(struct se_portal_group *); +extern u32 sas_get_pr_transport_id(struct se_portal_group *, struct se_node_acl *, + struct t10_pr_registration *, int *, unsigned char *); +extern u32 sas_get_pr_transport_id_len(struct se_portal_group *, struct se_node_acl *, + struct t10_pr_registration *, int *); +extern char *sas_parse_pr_out_transport_id(struct se_portal_group *, + const char *, u32 *, char **); + +extern u8 fc_get_fabric_proto_ident(struct se_portal_group *); +extern u32 fc_get_pr_transport_id(struct se_portal_group *, struct se_node_acl *, + struct t10_pr_registration *, int *, unsigned char *); +extern u32 fc_get_pr_transport_id_len(struct se_portal_group *, struct se_node_acl *, + struct t10_pr_registration *, int *); +extern char *fc_parse_pr_out_transport_id(struct se_portal_group *, + const char *, u32 *, char **); + +extern u8 iscsi_get_fabric_proto_ident(struct se_portal_group *); +extern u32 iscsi_get_pr_transport_id(struct se_portal_group *, struct se_node_acl *, + struct t10_pr_registration *, int *, unsigned char *); +extern u32 iscsi_get_pr_transport_id_len(struct se_portal_group *, struct se_node_acl *, + struct t10_pr_registration *, int *); +extern char *iscsi_parse_pr_out_transport_id(struct se_portal_group *, + const char *, u32 *, char **); + +#endif /* TARGET_CORE_FABRIC_LIB_H */ diff --git a/include/target/target_core_fabric_ops.h b/include/target/target_core_fabric_ops.h new file mode 100644 index 0000000..f3ac12b --- /dev/null +++ b/include/target/target_core_fabric_ops.h @@ -0,0 +1,100 @@ +/* Defined in target_core_configfs.h */ +struct target_fabric_configfs; + +struct target_core_fabric_ops { + struct configfs_subsystem *tf_subsys; + /* + * Optional to signal struct se_task->task_sg[] padding entries + * for scatterlist chaining using transport_do_task_sg_link(), + * disabled by default + */ + int task_sg_chaining:1; + char *(*get_fabric_name)(void); + u8 (*get_fabric_proto_ident)(struct se_portal_group *); + char *(*tpg_get_wwn)(struct se_portal_group *); + u16 (*tpg_get_tag)(struct se_portal_group *); + u32 (*tpg_get_default_depth)(struct se_portal_group *); + u32 (*tpg_get_pr_transport_id)(struct se_portal_group *, + struct se_node_acl *, + struct t10_pr_registration *, int *, + unsigned char *); + u32 (*tpg_get_pr_transport_id_len)(struct se_portal_group *, + struct se_node_acl *, + struct t10_pr_registration *, int *); + char *(*tpg_parse_pr_out_transport_id)(struct se_portal_group *, + const char *, u32 *, char **); + int (*tpg_check_demo_mode)(struct se_portal_group *); + int (*tpg_check_demo_mode_cache)(struct se_portal_group *); + int (*tpg_check_demo_mode_write_protect)(struct se_portal_group *); + int (*tpg_check_prod_mode_write_protect)(struct se_portal_group *); + struct se_node_acl *(*tpg_alloc_fabric_acl)( + struct se_portal_group *); + void (*tpg_release_fabric_acl)(struct se_portal_group *, + struct se_node_acl *); + u32 (*tpg_get_inst_index)(struct se_portal_group *); + /* + * Optional function pointer for TCM to perform command map + * from TCM processing thread context, for those struct se_cmd + * initally allocated in interrupt context. + */ + int (*new_cmd_map)(struct se_cmd *); + /* + * Optional function pointer for TCM fabric modules that use + * Linux/NET sockets to allocate struct iovec array to struct se_cmd + */ + int (*alloc_cmd_iovecs)(struct se_cmd *); + /* + * Optional to release struct se_cmd and fabric dependent allocated + * I/O descriptor in transport_cmd_check_stop() + */ + void (*check_stop_free)(struct se_cmd *); + void (*release_cmd_to_pool)(struct se_cmd *); + void (*release_cmd_direct)(struct se_cmd *); + /* + * Called with spin_lock_bh(struct se_portal_group->session_lock held. + */ + int (*shutdown_session)(struct se_session *); + void (*close_session)(struct se_session *); + void (*stop_session)(struct se_session *, int, int); + void (*fall_back_to_erl0)(struct se_session *); + int (*sess_logged_in)(struct se_session *); + u32 (*sess_get_index)(struct se_session *); + /* + * Used only for SCSI fabrics that contain multi-value TransportIDs + * (like iSCSI). All other SCSI fabrics should set this to NULL. + */ + u32 (*sess_get_initiator_sid)(struct se_session *, + unsigned char *, u32); + int (*write_pending)(struct se_cmd *); + int (*write_pending_status)(struct se_cmd *); + void (*set_default_node_attributes)(struct se_node_acl *); + u32 (*get_task_tag)(struct se_cmd *); + int (*get_cmd_state)(struct se_cmd *); + void (*new_cmd_failure)(struct se_cmd *); + int (*queue_data_in)(struct se_cmd *); + int (*queue_status)(struct se_cmd *); + int (*queue_tm_rsp)(struct se_cmd *); + u16 (*set_fabric_sense_len)(struct se_cmd *, u32); + u16 (*get_fabric_sense_len)(void); + int (*is_state_remove)(struct se_cmd *); + u64 (*pack_lun)(unsigned int); + /* + * fabric module calls for target_core_fabric_configfs.c + */ + struct se_wwn *(*fabric_make_wwn)(struct target_fabric_configfs *, + struct config_group *, const char *); + void (*fabric_drop_wwn)(struct se_wwn *); + struct se_portal_group *(*fabric_make_tpg)(struct se_wwn *, + struct config_group *, const char *); + void (*fabric_drop_tpg)(struct se_portal_group *); + int (*fabric_post_link)(struct se_portal_group *, + struct se_lun *); + void (*fabric_pre_unlink)(struct se_portal_group *, + struct se_lun *); + struct se_tpg_np *(*fabric_make_np)(struct se_portal_group *, + struct config_group *, const char *); + void (*fabric_drop_np)(struct se_tpg_np *); + struct se_node_acl *(*fabric_make_nodeacl)(struct se_portal_group *, + struct config_group *, const char *); + void (*fabric_drop_nodeacl)(struct se_node_acl *); +}; diff --git a/include/target/target_core_tmr.h b/include/target/target_core_tmr.h new file mode 100644 index 0000000..6c8248b --- /dev/null +++ b/include/target/target_core_tmr.h @@ -0,0 +1,43 @@ +#ifndef TARGET_CORE_TMR_H +#define TARGET_CORE_TMR_H + +/* task management function values */ +#ifdef ABORT_TASK +#undef ABORT_TASK +#endif /* ABORT_TASK */ +#define ABORT_TASK 1 +#ifdef ABORT_TASK_SET +#undef ABORT_TASK_SET +#endif /* ABORT_TASK_SET */ +#define ABORT_TASK_SET 2 +#ifdef CLEAR_ACA +#undef CLEAR_ACA +#endif /* CLEAR_ACA */ +#define CLEAR_ACA 3 +#ifdef CLEAR_TASK_SET +#undef CLEAR_TASK_SET +#endif /* CLEAR_TASK_SET */ +#define CLEAR_TASK_SET 4 +#define LUN_RESET 5 +#define TARGET_WARM_RESET 6 +#define TARGET_COLD_RESET 7 +#define TASK_REASSIGN 8 + +/* task management response values */ +#define TMR_FUNCTION_COMPLETE 0 +#define TMR_TASK_DOES_NOT_EXIST 1 +#define TMR_LUN_DOES_NOT_EXIST 2 +#define TMR_TASK_STILL_ALLEGIANT 3 +#define TMR_TASK_FAILOVER_NOT_SUPPORTED 4 +#define TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED 5 +#define TMR_FUNCTION_AUTHORIZATION_FAILED 6 +#define TMR_FUNCTION_REJECTED 255 + +extern struct kmem_cache *se_tmr_req_cache; + +extern struct se_tmr_req *core_tmr_alloc_req(struct se_cmd *, void *, u8); +extern void core_tmr_release_req(struct se_tmr_req *); +extern int core_tmr_lun_reset(struct se_device *, struct se_tmr_req *, + struct list_head *, struct se_cmd *); + +#endif /* TARGET_CORE_TMR_H */ diff --git a/include/target/target_core_tpg.h b/include/target/target_core_tpg.h new file mode 100644 index 0000000..77e1872 --- /dev/null +++ b/include/target/target_core_tpg.h @@ -0,0 +1,35 @@ +#ifndef TARGET_CORE_TPG_H +#define TARGET_CORE_TPG_H + +extern struct se_node_acl *__core_tpg_get_initiator_node_acl(struct se_portal_group *tpg, + const char *); +extern struct se_node_acl *core_tpg_get_initiator_node_acl(struct se_portal_group *tpg, + unsigned char *); +extern void core_tpg_add_node_to_devs(struct se_node_acl *, + struct se_portal_group *); +extern struct se_node_acl *core_tpg_check_initiator_node_acl( + struct se_portal_group *, + unsigned char *); +extern void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *); +extern void core_tpg_wait_for_mib_ref(struct se_node_acl *); +extern void core_tpg_clear_object_luns(struct se_portal_group *); +extern struct se_node_acl *core_tpg_add_initiator_node_acl( + struct se_portal_group *, + struct se_node_acl *, + const char *, u32); +extern int core_tpg_del_initiator_node_acl(struct se_portal_group *, + struct se_node_acl *, int); +extern int core_tpg_set_initiator_node_queue_depth(struct se_portal_group *, + unsigned char *, u32, int); +extern int core_tpg_register(struct target_core_fabric_ops *, + struct se_wwn *, + struct se_portal_group *, void *, + int); +extern int core_tpg_deregister(struct se_portal_group *); +extern struct se_lun *core_tpg_pre_addlun(struct se_portal_group *, u32); +extern int core_tpg_post_addlun(struct se_portal_group *, struct se_lun *, u32, + void *); +extern struct se_lun *core_tpg_pre_dellun(struct se_portal_group *, u32, int *); +extern int core_tpg_post_dellun(struct se_portal_group *, struct se_lun *); + +#endif /* TARGET_CORE_TPG_H */ diff --git a/include/target/target_core_transport.h b/include/target/target_core_transport.h new file mode 100644 index 0000000..66f44e5 --- /dev/null +++ b/include/target/target_core_transport.h @@ -0,0 +1,351 @@ +#ifndef TARGET_CORE_TRANSPORT_H +#define TARGET_CORE_TRANSPORT_H + +#define TARGET_CORE_VERSION TARGET_CORE_MOD_VERSION + +/* Attempts before moving from SHORT to LONG */ +#define PYX_TRANSPORT_WINDOW_CLOSED_THRESHOLD 3 +#define PYX_TRANSPORT_WINDOW_CLOSED_WAIT_SHORT 3 /* In milliseconds */ +#define PYX_TRANSPORT_WINDOW_CLOSED_WAIT_LONG 10 /* In milliseconds */ + +#define PYX_TRANSPORT_STATUS_INTERVAL 5 /* In seconds */ + +#define PYX_TRANSPORT_SENT_TO_TRANSPORT 0 +#define PYX_TRANSPORT_WRITE_PENDING 1 + +#define PYX_TRANSPORT_UNKNOWN_SAM_OPCODE -1 +#define PYX_TRANSPORT_HBA_QUEUE_FULL -2 +#define PYX_TRANSPORT_REQ_TOO_MANY_SECTORS -3 +#define PYX_TRANSPORT_OUT_OF_MEMORY_RESOURCES -4 +#define PYX_TRANSPORT_INVALID_CDB_FIELD -5 +#define PYX_TRANSPORT_INVALID_PARAMETER_LIST -6 +#define PYX_TRANSPORT_LU_COMM_FAILURE -7 +#define PYX_TRANSPORT_UNKNOWN_MODE_PAGE -8 +#define PYX_TRANSPORT_WRITE_PROTECTED -9 +#define PYX_TRANSPORT_TASK_TIMEOUT -10 +#define PYX_TRANSPORT_RESERVATION_CONFLICT -11 +#define PYX_TRANSPORT_ILLEGAL_REQUEST -12 +#define PYX_TRANSPORT_USE_SENSE_REASON -13 + +#ifndef SAM_STAT_RESERVATION_CONFLICT +#define SAM_STAT_RESERVATION_CONFLICT 0x18 +#endif + +#define TRANSPORT_PLUGIN_FREE 0 +#define TRANSPORT_PLUGIN_REGISTERED 1 + +#define TRANSPORT_PLUGIN_PHBA_PDEV 1 +#define TRANSPORT_PLUGIN_VHBA_PDEV 2 +#define TRANSPORT_PLUGIN_VHBA_VDEV 3 + +/* For SE OBJ Plugins, in seconds */ +#define TRANSPORT_TIMEOUT_TUR 10 +#define TRANSPORT_TIMEOUT_TYPE_DISK 60 +#define TRANSPORT_TIMEOUT_TYPE_ROM 120 +#define TRANSPORT_TIMEOUT_TYPE_TAPE 600 +#define TRANSPORT_TIMEOUT_TYPE_OTHER 300 + +/* For se_task->task_state_flags */ +#define TSF_EXCEPTION_CLEARED 0x01 + +/* + * struct se_subsystem_dev->su_dev_flags +*/ +#define SDF_FIRMWARE_VPD_UNIT_SERIAL 0x00000001 +#define SDF_EMULATED_VPD_UNIT_SERIAL 0x00000002 +#define SDF_USING_UDEV_PATH 0x00000004 +#define SDF_USING_ALIAS 0x00000008 + +/* + * struct se_device->dev_flags + */ +#define DF_READ_ONLY 0x00000001 +#define DF_SPC2_RESERVATIONS 0x00000002 +#define DF_SPC2_RESERVATIONS_WITH_ISID 0x00000004 + +/* struct se_dev_attrib sanity values */ +/* 10 Minutes */ +#define DA_TASK_TIMEOUT_MAX 600 +/* Default max_unmap_lba_count */ +#define DA_MAX_UNMAP_LBA_COUNT 0 +/* Default max_unmap_block_desc_count */ +#define DA_MAX_UNMAP_BLOCK_DESC_COUNT 0 +/* Default unmap_granularity */ +#define DA_UNMAP_GRANULARITY_DEFAULT 0 +/* Default unmap_granularity_alignment */ +#define DA_UNMAP_GRANULARITY_ALIGNMENT_DEFAULT 0 +/* Emulation for Direct Page Out */ +#define DA_EMULATE_DPO 0 +/* Emulation for Forced Unit Access WRITEs */ +#define DA_EMULATE_FUA_WRITE 1 +/* Emulation for Forced Unit Access READs */ +#define DA_EMULATE_FUA_READ 0 +/* Emulation for WriteCache and SYNCHRONIZE_CACHE */ +#define DA_EMULATE_WRITE_CACHE 0 +/* Emulation for UNIT ATTENTION Interlock Control */ +#define DA_EMULATE_UA_INTLLCK_CTRL 0 +/* Emulation for TASK_ABORTED status (TAS) by default */ +#define DA_EMULATE_TAS 1 +/* Emulation for Thin Provisioning UNMAP using block/blk-lib.c:blkdev_issue_discard() */ +#define DA_EMULATE_TPU 0 +/* + * Emulation for Thin Provisioning WRITE_SAME w/ UNMAP=1 bit using + * block/blk-lib.c:blkdev_issue_discard() + */ +#define DA_EMULATE_TPWS 0 +/* No Emulation for PSCSI by default */ +#define DA_EMULATE_RESERVATIONS 0 +/* No Emulation for PSCSI by default */ +#define DA_EMULATE_ALUA 0 +/* Enforce SCSI Initiator Port TransportID with 'ISID' for PR */ +#define DA_ENFORCE_PR_ISIDS 1 +#define DA_STATUS_MAX_SECTORS_MIN 16 +#define DA_STATUS_MAX_SECTORS_MAX 8192 + +#define SE_MODE_PAGE_BUF 512 + +#define MOD_MAX_SECTORS(ms, bs) (ms % (PAGE_SIZE / bs)) + +struct se_mem; +struct se_subsystem_api; + +extern int init_se_global(void); +extern void release_se_global(void); +extern void transport_init_queue_obj(struct se_queue_obj *); +extern int transport_subsystem_check_init(void); +extern int transport_subsystem_register(struct se_subsystem_api *); +extern void transport_subsystem_release(struct se_subsystem_api *); +extern void transport_load_plugins(void); +extern struct se_session *transport_init_session(void); +extern void __transport_register_session(struct se_portal_group *, + struct se_node_acl *, + struct se_session *, void *); +extern void transport_register_session(struct se_portal_group *, + struct se_node_acl *, + struct se_session *, void *); +extern void transport_free_session(struct se_session *); +extern void transport_deregister_session_configfs(struct se_session *); +extern void transport_deregister_session(struct se_session *); +extern void transport_cmd_finish_abort(struct se_cmd *, int); +extern void transport_cmd_finish_abort_tmr(struct se_cmd *); +extern void transport_complete_sync_cache(struct se_cmd *, int); +extern void transport_complete_task(struct se_task *, int); +extern void transport_add_task_to_execute_queue(struct se_task *, + struct se_task *, + struct se_device *); +unsigned char *transport_dump_cmd_direction(struct se_cmd *); +extern void transport_dump_dev_state(struct se_device *, char *, int *); +extern void transport_dump_dev_info(struct se_device *, struct se_lun *, + unsigned long long, char *, int *); +extern void transport_dump_vpd_proto_id(struct t10_vpd *, + unsigned char *, int); +extern void transport_set_vpd_proto_id(struct t10_vpd *, unsigned char *); +extern int transport_dump_vpd_assoc(struct t10_vpd *, + unsigned char *, int); +extern int transport_set_vpd_assoc(struct t10_vpd *, unsigned char *); +extern int transport_dump_vpd_ident_type(struct t10_vpd *, + unsigned char *, int); +extern int transport_set_vpd_ident_type(struct t10_vpd *, unsigned char *); +extern int transport_dump_vpd_ident(struct t10_vpd *, + unsigned char *, int); +extern int transport_set_vpd_ident(struct t10_vpd *, unsigned char *); +extern struct se_device *transport_add_device_to_core_hba(struct se_hba *, + struct se_subsystem_api *, + struct se_subsystem_dev *, u32, + void *, struct se_dev_limits *, + const char *, const char *); +extern void transport_device_setup_cmd(struct se_cmd *); +extern void transport_init_se_cmd(struct se_cmd *, + struct target_core_fabric_ops *, + struct se_session *, u32, int, int, + unsigned char *); +extern void transport_free_se_cmd(struct se_cmd *); +extern int transport_generic_allocate_tasks(struct se_cmd *, unsigned char *); +extern int transport_generic_handle_cdb(struct se_cmd *); +extern int transport_generic_handle_cdb_map(struct se_cmd *); +extern int transport_generic_handle_data(struct se_cmd *); +extern void transport_new_cmd_failure(struct se_cmd *); +extern int transport_generic_handle_tmr(struct se_cmd *); +extern void __transport_stop_task_timer(struct se_task *, unsigned long *); +extern unsigned char transport_asciihex_to_binaryhex(unsigned char val[2]); +extern int transport_generic_map_mem_to_cmd(struct se_cmd *cmd, struct scatterlist *, u32, + struct scatterlist *, u32); +extern int transport_clear_lun_from_sessions(struct se_lun *); +extern int transport_check_aborted_status(struct se_cmd *, int); +extern int transport_send_check_condition_and_sense(struct se_cmd *, u8, int); +extern void transport_send_task_abort(struct se_cmd *); +extern void transport_release_cmd_to_pool(struct se_cmd *); +extern void transport_generic_free_cmd(struct se_cmd *, int, int, int); +extern void transport_generic_wait_for_cmds(struct se_cmd *, int); +extern u32 transport_calc_sg_num(struct se_task *, struct se_mem *, u32); +extern int transport_map_mem_to_sg(struct se_task *, struct list_head *, + void *, struct se_mem *, + struct se_mem **, u32 *, u32 *); +extern void transport_do_task_sg_chain(struct se_cmd *); +extern void transport_generic_process_write(struct se_cmd *); +extern int transport_generic_do_tmr(struct se_cmd *); +/* From target_core_alua.c */ +extern int core_alua_check_nonop_delay(struct se_cmd *); + +/* + * Each se_transport_task_t can have N number of possible struct se_task's + * for the storage transport(s) to possibly execute. + * Used primarily for splitting up CDBs that exceed the physical storage + * HBA's maximum sector count per task. + */ +struct se_mem { + struct page *se_page; + u32 se_len; + u32 se_off; + struct list_head se_list; +} ____cacheline_aligned; + +/* + * Each type of disk transport supported MUST have a template defined + * within its .h file. + */ +struct se_subsystem_api { + /* + * The Name. :-) + */ + char name[16]; + /* + * Transport Type. + */ + u8 transport_type; + /* + * struct module for struct se_hba references + */ + struct module *owner; + /* + * Used for global se_subsystem_api list_head + */ + struct list_head sub_api_list; + /* + * For SCF_SCSI_NON_DATA_CDB + */ + int (*cdb_none)(struct se_task *); + /* + * For SCF_SCSI_CONTROL_NONSG_IO_CDB + */ + int (*map_task_non_SG)(struct se_task *); + /* + * For SCF_SCSI_DATA_SG_IO_CDB and SCF_SCSI_CONTROL_SG_IO_CDB + */ + int (*map_task_SG)(struct se_task *); + /* + * attach_hba(): + */ + int (*attach_hba)(struct se_hba *, u32); + /* + * detach_hba(): + */ + void (*detach_hba)(struct se_hba *); + /* + * pmode_hba(): Used for TCM/pSCSI subsystem plugin HBA -> + * Linux/SCSI struct Scsi_Host passthrough + */ + int (*pmode_enable_hba)(struct se_hba *, unsigned long); + /* + * allocate_virtdevice(): + */ + void *(*allocate_virtdevice)(struct se_hba *, const char *); + /* + * create_virtdevice(): Only for Virtual HBAs + */ + struct se_device *(*create_virtdevice)(struct se_hba *, + struct se_subsystem_dev *, void *); + /* + * free_device(): + */ + void (*free_device)(void *); + + /* + * dpo_emulated(): + */ + int (*dpo_emulated)(struct se_device *); + /* + * fua_write_emulated(): + */ + int (*fua_write_emulated)(struct se_device *); + /* + * fua_read_emulated(): + */ + int (*fua_read_emulated)(struct se_device *); + /* + * write_cache_emulated(): + */ + int (*write_cache_emulated)(struct se_device *); + /* + * transport_complete(): + * + * Use transport_generic_complete() for majority of DAS transport + * drivers. Provided out of convenience. + */ + int (*transport_complete)(struct se_task *task); + struct se_task *(*alloc_task)(struct se_cmd *); + /* + * do_task(): + */ + int (*do_task)(struct se_task *); + /* + * Used by virtual subsystem plugins IBLOCK and FILEIO to emulate + * UNMAP and WRITE_SAME_* w/ UNMAP=1 <-> Linux/Block Discard + */ + int (*do_discard)(struct se_device *, sector_t, u32); + /* + * Used by virtual subsystem plugins IBLOCK and FILEIO to emulate + * SYNCHRONIZE_CACHE_* <-> Linux/Block blkdev_issue_flush() + */ + void (*do_sync_cache)(struct se_task *); + /* + * free_task(): + */ + void (*free_task)(struct se_task *); + /* + * check_configfs_dev_params(): + */ + ssize_t (*check_configfs_dev_params)(struct se_hba *, struct se_subsystem_dev *); + /* + * set_configfs_dev_params(): + */ + ssize_t (*set_configfs_dev_params)(struct se_hba *, struct se_subsystem_dev *, + const char *, ssize_t); + /* + * show_configfs_dev_params(): + */ + ssize_t (*show_configfs_dev_params)(struct se_hba *, struct se_subsystem_dev *, + char *); + /* + * get_cdb(): + */ + unsigned char *(*get_cdb)(struct se_task *); + /* + * get_device_rev(): + */ + u32 (*get_device_rev)(struct se_device *); + /* + * get_device_type(): + */ + u32 (*get_device_type)(struct se_device *); + /* + * Get the sector_t from a subsystem backstore.. + */ + sector_t (*get_blocks)(struct se_device *); + /* + * do_se_mem_map(): + */ + int (*do_se_mem_map)(struct se_task *, struct list_head *, void *, + struct se_mem *, struct se_mem **, u32 *, u32 *); + /* + * get_sense_buffer(): + */ + unsigned char *(*get_sense_buffer)(struct se_task *); +} ____cacheline_aligned; + +#define TRANSPORT(dev) ((dev)->transport) +#define HBA_TRANSPORT(hba) ((hba)->transport) + +extern struct se_global *se_global; + +#endif /* TARGET_CORE_TRANSPORT_H */ -- cgit v1.1 From c94fbe1d9e1e9b1a1f82eb0b53b1cf53bcf9712b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Jan 2011 11:25:58 -0500 Subject: tracing: Only process module tracepoints once The commit: 9f987b3141f086de27832514aad9f50a53f754 tracing: Include module.h in define_trace.h only solved half the problem. If the trace/events/module.h header is included at the time of define_trace.h (or in ftrace.h within it), the module.h TRACE_SYSTEM will override the current TRACE_SYSTEM macro. Since define_trace.h is included when CREATE_TRACE_POINTS is set, and the first thing it does is to #undef CREATE_TRACE_POINTS, by placing the module.h TRACE_SYSTEM inside a #ifdef CREATE_TRACE_POINTS we can prevent it from overriding the TRACE_SYSTEM that is being processed, and still process the module.h tracepoints when the module code defines CREATE_TRACE_POINTS and includes the trace/events/module.h header. As with commit 9f987b3141, this is only an issue if module.h is not included before the trace/events/.h file is included, which (luckily) has not happened yet. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/events/module.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/trace/events/module.h b/include/trace/events/module.h index c7bb2f0..c6bae36 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -1,5 +1,15 @@ +/* + * Because linux/module.h has tracepoints in the header, and ftrace.h + * eventually includes this file, define_trace.h includes linux/module.h + * But we do not want the module.h to override the TRACE_SYSTEM macro + * variable that define_trace.h is processing, so we only set it + * when module events are being processed, which would happen when + * CREATE_TRACE_POINTS is defined. + */ +#ifdef CREATE_TRACE_POINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM module +#endif #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MODULE_H -- cgit v1.1 From 415e12b2379239973feab91850b0dce985c6058a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jan 2011 00:55:09 +0100 Subject: PCI/ACPI: Request _OSC control once for each root bridge (v3) Move the evaluation of acpi_pci_osc_control_set() (to request control of PCI Express native features) into acpi_pci_root_add() to avoid calling it many times for the same root complex with the same arguments. Additionally, check if all of the requisite _OSC support bits are set before calling acpi_pci_osc_control_set() for a given root complex. References: https://bugzilla.kernel.org/show_bug.cgi?id=20232 Reported-by: Ozan Caglayan Tested-by: Ozan Caglayan Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- include/acpi/apei.h | 6 ++++++ include/linux/pci-acpi.h | 6 ++++++ include/linux/pci.h | 11 +++++++++++ 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/acpi/apei.h b/include/acpi/apei.h index b336502..c4dbb13 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -19,6 +19,12 @@ extern int hest_disable; extern int erst_disable; +#ifdef CONFIG_ACPI_APEI +void __init acpi_hest_init(void); +#else +static inline void acpi_hest_init(void) { return; } +#endif + typedef int (*apei_hest_func_t)(struct acpi_hest_header *hest_hdr, void *data); int apei_hest_parse(apei_hest_func_t func, void *data); diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index c8b6473..479d9bb 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -40,4 +40,10 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { return NULL; } #endif +#ifdef CONFIG_ACPI_APEI +extern bool aer_acpi_firmware_first(void); +#else +static inline bool aer_acpi_firmware_first(void) { return false; } +#endif + #endif /* _PCI_ACPI_H_ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 63cbadc..12dd86a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -994,6 +994,9 @@ extern void pci_restore_msi_state(struct pci_dev *dev); extern int pci_msi_enabled(void); #endif +extern bool pcie_ports_disabled; +extern bool pcie_ports_auto; + #ifndef CONFIG_PCIEASPM static inline int pcie_aspm_enabled(void) { @@ -1003,6 +1006,14 @@ static inline int pcie_aspm_enabled(void) extern int pcie_aspm_enabled(void); #endif +#ifdef CONFIG_PCIEAER +void pci_no_aer(void); +bool pci_aer_available(void); +#else +static inline void pci_no_aer(void) { } +static inline bool pci_aer_available(void) { return false; } +#endif + #ifndef CONFIG_PCIE_ECRC static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { -- cgit v1.1 From b6e335aeeb114dccb07eaa09e8b62ff9510cf745 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Dec 2010 13:21:23 +0100 Subject: PCI/PM: Use pm_wakeup_event() directly for reporting wakeup events After recent changes related to wakeup events pm_wakeup_event() automatically checks if the given device is configured to signal wakeup, so pci_wakeup_event() may be a static inline function calling pm_wakeup_event() directly. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 12dd86a..9e67dcd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -820,7 +820,6 @@ int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); -void pci_wakeup_event(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, -- cgit v1.1 From 49731baa41df404c2c3f44555869ab387363af43 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Jan 2011 18:43:57 +0100 Subject: block: restore multiple bd_link_disk_holder() support Commit e09b457b (block: simplify holder symlink handling) incorrectly assumed that there is only one link at maximum. dm may use multiple links and expects block layer to track reference count for each link, which is different from and unrelated to the exclusive device holder identified by @holder when the device is opened. Remove the single holder assumption and automatic removal of the link and revive the per-link reference count tracking. The code essentially behaves the same as before commit e09b457b sans the unnecessary kobject reference count dancing. While at it, note that this facility should not be used by anyone else than the current ones. Sysfs symlinks shouldn't be abused like this and the whole thing doesn't belong in the block layer at all. Signed-off-by: Tejun Heo Reported-by: Milan Broz Cc: Jun'ichi Nomura Cc: Neil Brown Cc: linux-raid@vger.kernel.org Cc: Kay Sievers Signed-off-by: Jens Axboe --- include/linux/fs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3984f23..fb21903 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -666,7 +666,7 @@ struct block_device { int bd_holders; bool bd_write_holder; #ifdef CONFIG_SYSFS - struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ + struct list_head bd_holder_disks; #endif struct block_device * bd_contains; unsigned bd_block_size; @@ -2058,12 +2058,18 @@ extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +extern void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } +static inline void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ +} #endif #endif -- cgit v1.1 From 3e8b3b90fecedcf20d895c4e6ad01a379fe252bf Mon Sep 17 00:00:00 2001 From: Hanno Boeck Date: Fri, 14 Jan 2011 19:14:47 +0100 Subject: ALSA: constify functions in ac97 Signed-off-by: Hanno Boeck Signed-off-by: Takashi Iwai --- include/sound/ac97_codec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/ac97_codec.h b/include/sound/ac97_codec.h index 4940045..b602f47 100644 --- a/include/sound/ac97_codec.h +++ b/include/sound/ac97_codec.h @@ -477,7 +477,7 @@ struct snd_ac97_template { struct snd_ac97 { /* -- lowlevel (hardware) driver specific -- */ - struct snd_ac97_build_ops * build_ops; + const struct snd_ac97_build_ops *build_ops; void *private_data; void (*private_free) (struct snd_ac97 *ac97); /* --- */ -- cgit v1.1 From ab0724ffee24409a7f81afb539b2ac29090bff3e Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Sat, 15 Jan 2011 00:07:22 +0100 Subject: PCI / ACPI: Fix build issue in pci_root.c for !CONFIG_PCIEPORTBUS The compilation of drivers/acpi/pci_root.c fails if CONFIG_PCIEPORTBUS is unset. Fix the problem. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- include/linux/pci.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e67dcd..559d028 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -993,8 +993,13 @@ extern void pci_restore_msi_state(struct pci_dev *dev); extern int pci_msi_enabled(void); #endif +#ifdef CONFIG_PCIEPORTBUS extern bool pcie_ports_disabled; extern bool pcie_ports_auto; +#else +#define pcie_ports_disabled true +#define pcie_ports_auto false +#endif #ifndef CONFIG_PCIEASPM static inline int pcie_aspm_enabled(void) -- cgit v1.1 From 0029227f1bc30b6c809ae751f9e7af6cef900997 Mon Sep 17 00:00:00 2001 From: Andiry Xu Date: Mon, 27 Dec 2010 17:39:02 +0800 Subject: xHCI: synchronize irq in xhci_suspend() Synchronize the interrupts instead of free them in xhci_suspend(). This will prevent a double free when the host is suspended and then the card removed. Set the flag hcd->msix_enabled when using MSI-X, and check the flag in suspend_common(). MSI-X synchronization will be handled by xhci_suspend(), and MSI/INTx will be synchronized in suspend_common(). This patch should be queued for the 2.6.37 stable tree. Reported-by: Matthew Garrett Signed-off-by: Andiry Xu Signed-off-by: Sarah Sharp Cc: stable@kernel.org --- include/linux/usb/hcd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index dd6ee49..a854fe8 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -112,6 +112,7 @@ struct usb_hcd { /* Flags that get set only during HCD registration or removal. */ unsigned rh_registered:1;/* is root hub registered? */ unsigned rh_pollable:1; /* may we poll the root hub? */ + unsigned msix_enabled:1; /* driver has MSI-X enabled? */ /* The next flag is a stopgap, to be removed when all the HCDs * support the new root-hub polling mechanism. */ -- cgit v1.1 From 3632ef8909118db9584e1bed9538dc180adb32f8 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Sat, 15 Jan 2011 09:27:00 +1000 Subject: Revert "drm: Update fbdev fb_fix_screeninfo" This reverts commit dfe63bb0ad9810db13aab0058caba97866e0a681. This commit was causing nouveau not to work properly, for -rc1 I'd prefer it worked and we can look if this is useful for 2.6.39. Cc: James Simmons Signed-off-by: Dave Airlie Signed-off-by: Linus Torvalds --- include/drm/drm_fb_helper.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index aac27bd..f22e7fe 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -121,6 +121,9 @@ int drm_fb_helper_setcolreg(unsigned regno, void drm_fb_helper_restore(void); void drm_fb_helper_fill_var(struct fb_info *info, struct drm_fb_helper *fb_helper, uint32_t fb_width, uint32_t fb_height); +void drm_fb_helper_fill_fix(struct fb_info *info, uint32_t pitch, + uint32_t depth); + int drm_fb_helper_setcmap(struct fb_cmap *cmap, struct fb_info *info); bool drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper); -- cgit v1.1 From 98d530fe246b65fbd3cdeeeca319a80c46cb4793 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Sat, 1 Jan 2011 23:00:23 +0000 Subject: Fix dmaengine_submit() return type desc->tx_submit's return type is dma_cookie_t, not int. Therefore, dmaengine_submit() should match this return type as it's just wrapping this detail. Signed-off-by: Russell King Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9d8688b..830935b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -532,7 +532,7 @@ static inline int dmaengine_resume(struct dma_chan *chan) return dmaengine_device_control(chan, DMA_RESUME, 0); } -static inline int dmaengine_submit(struct dma_async_tx_descriptor *desc) +static inline dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) { return desc->tx_submit(desc); } -- cgit v1.1 From 96a608a4bfd8468c21881b3f92024923886eb015 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 14 Jan 2011 17:51:11 -0800 Subject: ARM: PL08x: fix a warning drivers/dma/amba-pl08x.c: In function 'pl08x_start_txd': drivers/dma/amba-pl08x.c:205: warning: dereferencing 'void *' pointer We never dereference llis_va aside from assigning it to a struct pl08x_lli pointer or calculating the address of array element 0. Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 933b4ed..5b87b6a 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -118,7 +118,7 @@ struct pl08x_txd { dma_addr_t dst_addr; size_t len; dma_addr_t llis_bus; - void *llis_va; + struct pl08x_lli *llis_va; bool active; /* Default cctl value for LLIs */ u32 cctl; -- cgit v1.1 From 9875cf806403fae66b2410a3c2cc820d97731e04 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:21 +0000 Subject: Add a dentry op to handle automounting rather than abusing follow_link() Add a dentry op (d_automount) to handle automounting directories rather than abusing the follow_link() inode operation. The operation is keyed off a new dentry flag (DCACHE_NEED_AUTOMOUNT). This also makes it easier to add an AT_ flag to suppress terminal segment automount during pathwalk and removes the need for the kludge code in the pathwalk algorithm to handle directories with follow_link() semantics. The ->d_automount() dentry operation: struct vfsmount *(*d_automount)(struct path *mountpoint); takes a pointer to the directory to be mounted upon, which is expected to provide sufficient data to determine what should be mounted. If successful, it should return the vfsmount struct it creates (which it should also have added to the namespace using do_add_mount() or similar). If there's a collision with another automount attempt, NULL should be returned. If the directory specified by the parameter should be used directly rather than being mounted upon, -EISDIR should be returned. In any other case, an error code should be returned. The ->d_automount() operation is called with no locks held and may sleep. At this point the pathwalk algorithm will be in ref-walk mode. Within fs/namei.c itself, a new pathwalk subroutine (follow_automount()) is added to handle mountpoints. It will return -EREMOTE if the automount flag was set, but no d_automount() op was supplied, -ELOOP if we've encountered too many symlinks or mountpoints, -EISDIR if the walk point should be used without mounting and 0 if successful. The path will be updated to point to the mounted filesystem if a successful automount took place. __follow_mount() is replaced by follow_managed() which is more generic (especially with the patch that adds ->d_manage()). This handles transits from directories during pathwalk, including automounting and skipping over mountpoints (and holding processes with the next patch). __follow_mount_rcu() will jump out of RCU-walk mode if it encounters an automount point with nothing mounted on it. follow_dotdot*() does not handle automounts as you don't want to trigger them whilst following "..". I've also extracted the mount/don't-mount logic from autofs4 and included it here. It makes the mount go ahead anyway if someone calls open() or creat(), tries to traverse the directory, tries to chdir/chroot/etc. into the directory, or sticks a '/' on the end of the pathname. If they do a stat(), however, they'll only trigger the automount if they didn't also say O_NOFOLLOW. I've also added an inode flag (S_AUTOMOUNT) so that filesystems can mark their inodes as automount points. This flag is automatically propagated to the dentry as DCACHE_NEED_AUTOMOUNT by __d_instantiate(). This saves NFS and could save AFS a private flag bit apiece, but is not strictly necessary. It would be preferable to do the propagation in d_set_d_op(), but that doesn't normally have access to the inode. [AV: fixed breakage in case if __follow_mount_rcu() fails and nameidata_drop_rcu() succeeds in RCU case of do_lookup(); we need to fall through to non-RCU case after that, rather than just returning with ungrabbed *path] Signed-off-by: David Howells Was-Acked-by: Ian Kent Signed-off-by: Al Viro --- include/linux/dcache.h | 7 ++++++- include/linux/fs.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 59fcd24..ee6c26d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -167,6 +167,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); } ____cacheline_aligned; /* @@ -205,13 +206,17 @@ struct dentry_operations { #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 -#define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ #define DCACHE_OP_HASH 0x1000 #define DCACHE_OP_COMPARE 0x2000 #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 +#define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ +#define DCACHE_MANAGED_DENTRY \ + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT) + extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) diff --git a/include/linux/fs.h b/include/linux/fs.h index 3984f23..4c00ed5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -242,6 +242,7 @@ struct inodes_stat_t { #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE 512 /* Inode is fs-internal */ #define S_IMA 1024 /* Inode has an associated IMA struct */ +#define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -277,6 +278,7 @@ struct inodes_stat_t { #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) +#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ -- cgit v1.1 From cc53ce53c86924bfe98a12ea20b7465038a08792 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:26 +0000 Subject: Add a dentry op to allow processes to be held during pathwalk transit Add a dentry op (d_manage) to permit a filesystem to hold a process and make it sleep when it tries to transit away from one of that filesystem's directories during a pathwalk. The operation is keyed off a new dentry flag (DCACHE_MANAGE_TRANSIT). The filesystem is allowed to be selective about which processes it holds and which it permits to continue on or prohibits from transiting from each flagged directory. This will allow autofs to hold up client processes whilst letting its userspace daemon through to maintain the directory or the stuff behind it or mounted upon it. The ->d_manage() dentry operation: int (*d_manage)(struct path *path, bool mounting_here); takes a pointer to the directory about to be transited away from and a flag indicating whether the transit is undertaken by do_add_mount() or do_move_mount() skipping through a pile of filesystems mounted on a mountpoint. It should return 0 if successful and to let the process continue on its way; -EISDIR to prohibit the caller from skipping to overmounted filesystems or automounting, and to use this directory; or some other error code to return to the user. ->d_manage() is called with namespace_sem writelocked if mounting_here is true and no other locks held, so it may sleep. However, if mounting_here is true, it may not initiate or wait for a mount or unmount upon the parameter directory, even if the act is actually performed by userspace. Within fs/namei.c, follow_managed() is extended to check with d_manage() first on each managed directory, before transiting away from it or attempting to automount upon it. follow_down() is renamed follow_down_one() and should only be used where the filesystem deliberately intends to avoid management steps (e.g. autofs). A new follow_down() is added that incorporates the loop done by all other callers of follow_down() (do_add/move_mount(), autofs and NFSD; whilst AFS, NFS and CIFS do use it, their use is removed by converting them to use d_automount()). The new follow_down() calls d_manage() as appropriate. It also takes an extra parameter to indicate if it is being called from mount code (with namespace_sem writelocked) which it passes to d_manage(). follow_down() ignores automount points so that it can be used to mount on them. __follow_mount_rcu() is made to abort rcu-walk mode if it hits a directory with DCACHE_MANAGE_TRANSIT set on the basis that we're probably going to have to sleep. It would be possible to enter d_manage() in rcu-walk mode too, and have that determine whether to abort or not itself. That would allow the autofs daemon to continue on in rcu-walk mode. Note that DCACHE_MANAGE_TRANSIT on a directory should be cleared when it isn't required as every tranist from that directory will cause d_manage() to be invoked. It can always be set again when necessary. ========================== WHAT THIS MEANS FOR AUTOFS ========================== Autofs currently uses the lookup() inode op and the d_revalidate() dentry op to trigger the automounting of indirect mounts, and both of these can be called with i_mutex held. autofs knows that the i_mutex will be held by the caller in lookup(), and so can drop it before invoking the daemon - but this isn't so for d_revalidate(), since the lock is only held on _some_ of the code paths that call it. This means that autofs can't risk dropping i_mutex from its d_revalidate() function before it calls the daemon. The bug could manifest itself as, for example, a process that's trying to validate an automount dentry that gets made to wait because that dentry is expired and needs cleaning up: mkdir S ffffffff8014e05a 0 32580 24956 Call Trace: [] :autofs4:autofs4_wait+0x674/0x897 [] avc_has_perm+0x46/0x58 [] autoremove_wake_function+0x0/0x2e [] :autofs4:autofs4_expire_wait+0x41/0x6b [] :autofs4:autofs4_revalidate+0x91/0x149 [] __lookup_hash+0xa0/0x12f [] lookup_create+0x46/0x80 [] sys_mkdirat+0x56/0xe4 versus the automount daemon which wants to remove that dentry, but can't because the normal process is holding the i_mutex lock: automount D ffffffff8014e05a 0 32581 1 32561 Call Trace: [] __mutex_lock_slowpath+0x60/0x9b [] do_path_lookup+0x2ca/0x2f1 [] .text.lock.mutex+0xf/0x14 [] do_rmdir+0x77/0xde [] tracesys+0x71/0xe0 [] tracesys+0xd5/0xe0 which means that the system is deadlocked. This patch allows autofs to hold up normal processes whilst the daemon goes ahead and does things to the dentry tree behind the automouter point without risking a deadlock as almost no locks are held in d_manage() and none in d_automount(). Signed-off-by: David Howells Was-Acked-by: Ian Kent Signed-off-by: Al Viro --- include/linux/dcache.h | 11 +++++++++-- include/linux/namei.h | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ee6c26d..1a87760 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -168,6 +168,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(struct dentry *, bool); } ____cacheline_aligned; /* @@ -214,8 +215,9 @@ struct dentry_operations { #define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT 0x40000 /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ - (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT) + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) extern seqlock_t rename_lock; @@ -404,7 +406,12 @@ static inline void dont_mount(struct dentry *dentry) extern void dput(struct dentry *); -static inline int d_mountpoint(struct dentry *dentry) +static inline bool d_managed(struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_MANAGED_DENTRY; +} + +static inline bool d_mountpoint(struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 18d06ad..8ef2c78 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,7 +79,8 @@ extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -extern int follow_down(struct path *); +extern int follow_down_one(struct path *); +extern int follow_down(struct path *, bool); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); -- cgit v1.1 From 6f45b65672c8017d5e210e338bb5858a938ef445 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:31 +0000 Subject: Add an AT_NO_AUTOMOUNT flag to suppress terminal automount Add an AT_NO_AUTOMOUNT flag to suppress terminal automounting of automount point directories. This can be used by fstatat() users to permit the gathering of attributes on an automount point and also prevent mass-automounting of a directory of automount points by ls. Signed-off-by: David Howells Acked-by: Ian Kent Signed-off-by: Al Viro --- include/linux/fcntl.h | 1 + include/linux/namei.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index afc00af..a562fa5 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -45,6 +45,7 @@ #define AT_REMOVEDIR 0x200 /* Remove directory instead of unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ #ifdef __KERNEL__ diff --git a/include/linux/namei.h b/include/linux/namei.h index 8ef2c78..f276d4f 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -45,6 +45,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - ending slashes ok even for nonexistent files * - internal "there are more path components" flag * - dentry cache is untrusted; force a real lookup + * - suppress terminal automount */ #define LOOKUP_FOLLOW 0x0001 #define LOOKUP_DIRECTORY 0x0002 @@ -53,6 +54,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 #define LOOKUP_RCU 0x0040 +#define LOOKUP_NO_AUTOMOUNT 0x0080 /* * Intent data */ -- cgit v1.1 From 36d43a43761b004ad1879ac21471d8fc5f3157ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:45:42 +0000 Subject: NFS: Use d_automount() rather than abusing follow_link() Make NFS use the new d_automount() dentry operation rather than abusing follow_link() on directories. Signed-off-by: David Howells Acked-by: Trond Myklebust Acked-by: Ian Kent Signed-off-by: Al Viro --- include/linux/nfs_fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 0779bb8..6023efa 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -215,7 +215,6 @@ struct nfs_inode { #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ -#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ -- cgit v1.1 From 1972580bb4edea3ed6fe273b2ca72f44f10f8c86 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 14 Jan 2011 18:46:40 +0000 Subject: autofs4: Bump version Increase the autofs module sub-version so we can tell what kernel implementation is being used from user space debug logging. Signed-off-by: Ian Kent Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/auto_fs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 8b49ac4..e02982f 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -24,7 +24,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 1 +#define AUTOFS_PROTO_SUBVERSION 2 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 -- cgit v1.1 From ab90911ff90cdab59b31c045c3f0ae480d14f29d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 18:46:51 +0000 Subject: Allow d_manage() to be used in RCU-walk mode Allow d_manage() to be called from pathwalk when it is in RCU-walk mode as well as when it is in Ref-walk mode. This permits __follow_mount_rcu() to call d_manage() directly. d_manage() needs a parameter to indicate that it is in RCU-walk mode as it isn't allowed to sleep if in that mode (but should return -ECHILD instead). autofs4_d_manage() can then be set to retain RCU-walk mode if the daemon accesses it and otherwise request dropping back to ref-walk mode. Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1a87760..f958c19 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -168,7 +168,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(struct dentry *, bool, bool); } ____cacheline_aligned; /* -- cgit v1.1 From ea5b778a8b98c85a87d66bf844904f9c3802b869 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2011 19:10:03 +0000 Subject: Unexport do_add_mount() and add in follow_automount(), not ->d_automount() Unexport do_add_mount() and make ->d_automount() return the vfsmount to be added rather than calling do_add_mount() itself. follow_automount() will then do the addition. This slightly complicates things as ->d_automount() normally wants to add the new vfsmount to an expiration list and start an expiration timer. The problem with that is that the vfsmount will be deleted if it has a refcount of 1 and the timer will not repeat if the expiration list is empty. To this end, we require the vfsmount to be returned from d_automount() with a refcount of (at least) 2. One of these refs will be dropped unconditionally. In addition, follow_automount() must get a 3rd ref around the call to do_add_mount() lest it eat a ref and return an error, leaving the mount we have open to being expired as we would otherwise have only 1 ref on it. d_automount() should also add the the vfsmount to the expiration list (by calling mnt_set_expiry()) and start the expiration timer before returning, if this mechanism is to be used. The vfsmount will be unlinked from the expiration list by follow_automount() if do_add_mount() fails. This patch also fixes the call to do_add_mount() for AFS to propagate the mount flags from the parent vfsmount. Signed-off-by: David Howells Signed-off-by: Al Viro --- include/linux/mount.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mount.h b/include/linux/mount.h index 1869ea2..af4765e 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -110,12 +110,7 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); -struct nameidata; - -struct path; -extern int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist); - +extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(char *name); -- cgit v1.1 From 672c54466d24994eb9633f993862c89539504a42 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 13 Jan 2011 15:36:09 -0700 Subject: dt/flattree: Return virtual address from early_init_dt_alloc_memory_arch() The physical address is never used by the device tree code when allocating memory for unflattening. Change the architecture's alloc hook to return the virutal address instead. Signed-off-by: Grant Likely --- include/linux/of_fdt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 0ef22a1..c84d900 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -97,7 +97,7 @@ extern void early_init_dt_check_for_initrd(unsigned long node); extern int early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data); extern void early_init_dt_add_memory_arch(u64 base, u64 size); -extern u64 early_init_dt_alloc_memory_arch(u64 size, u64 align); +extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align); extern u64 dt_mem_next_cell(int s, __be32 **cellp); /* -- cgit v1.1 From f03c65993b98eeb909a4012ce7833c5857d74755 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Jan 2011 22:30:21 -0500 Subject: sanitize vfsmount refcounting changes Instead of splitting refcount between (per-cpu) mnt_count and (SMP-only) mnt_longrefs, make all references contribute to mnt_count again and keep track of how many are longterm ones. Accounting rules for longterm count: * 1 for each fs_struct.root.mnt * 1 for each fs_struct.pwd.mnt * 1 for having non-NULL ->mnt_ns * decrement to 0 happens only under vfsmount lock exclusive That allows nice common case for mntput() - since we can't drop the final reference until after mnt_longterm has reached 0 due to the rules above, mntput() can grab vfsmount lock shared and check mnt_longterm. If it turns out to be non-zero (which is the common case), we know that this is not the final mntput() and can just blindly decrement percpu mnt_count. Otherwise we grab vfsmount lock exclusive and do usual decrement-and-check of percpu mnt_count. For fs_struct.c we have mnt_make_longterm() and mnt_make_shortterm(); namespace.c uses the latter in places where we don't already hold vfsmount lock exclusive and opencodes a few remaining spots where we need to manipulate mnt_longterm. Note that we mostly revert the code outside of fs/namespace.c back to what we used to have; in particular, normal code doesn't need to care about two kinds of references, etc. And we get to keep the optimization Nick's variant had bought us... Signed-off-by: Al Viro --- include/linux/mount.h | 4 +--- include/linux/path.h | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mount.h b/include/linux/mount.h index af4765e..604f122 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -60,7 +60,7 @@ struct vfsmount { struct super_block *mnt_sb; /* pointer to superblock */ #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; - atomic_t mnt_longrefs; + atomic_t mnt_longterm; /* how many of the refs are longterm */ #else int mnt_count; int mnt_writers; @@ -96,8 +96,6 @@ extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); -extern void mntput_long(struct vfsmount *mnt); -extern struct vfsmount *mntget_long(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); extern int __mnt_is_readonly(struct vfsmount *mnt); diff --git a/include/linux/path.h b/include/linux/path.h index a581e8c..edc98de 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -10,9 +10,7 @@ struct path { }; extern void path_get(struct path *); -extern void path_get_long(struct path *); extern void path_put(struct path *); -extern void path_put_long(struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { -- cgit v1.1 From fc8fe1e992ae0326a88edbe4d6793e840bbdd4ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 16 Jan 2011 20:42:43 +0100 Subject: PCI / ACPI: Fix build of the AER driver for CONFIG_ACPI unset After commit 415e12b23792 ("PCI/ACPI: Request _OSC control once for each root bridge (v3)") include/linux/pci-acpi.h is included by drivers/pci/pcie/aer/aerdrv.c and if CONFIG_ACPI is unset, the bogus and unnecessary alternative definition of acpi_find_root_bridge_handle() causes a build error to occur. Remove the offending piece of garbage. Reported-and-tested-by: Stephen Rothwell Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- include/linux/pci-acpi.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 479d9bb..4462350 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -35,9 +35,6 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) return acpi_get_pci_rootbridge_handle(pci_domain_nr(pbus), pbus->number); } -#else -static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) -{ return NULL; } #endif #ifdef CONFIG_ACPI_APEI -- cgit v1.1 From b3697c0255d9d73eaaa4deb4512e3f0ff97b3b71 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 16 Jan 2011 13:10:39 -0800 Subject: fix non-x86 build failure in pmdp_get_and_clear pmdp_get_and_clear/pmdp_clear_flush/pmdp_splitting_flush were trapped as BUG() and they were defined only to diminish the risk of build issues on not-x86 archs and to be consistent with the generic pte methods previously defined in include/asm-generic/pgtable.h. But they are causing more trouble than they were supposed to solve, so it's simpler not to define them when THP is off. This is also correcting the export of pmdp_splitting_flush which is currently unused (x86 isn't using the generic implementation in mm/pgtable-generic.c and no other arch needs that [yet]). Signed-off-by: Andrea Arcangeli Sam Ravnborg Cc: Stephen Rothwell Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: "Luck, Tony" Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f1eddf7..31b6188 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -87,14 +87,6 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, pmd_clear(mm, address, pmdp); return pmd; }) -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, - pmd_t *pmdp) -{ - BUG(); - return __pmd(0); -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -163,9 +155,9 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp); +extern pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME -- cgit v1.1 From 94ae85220a07d357d4937086c490854f63344de4 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Sun, 16 Jan 2011 20:18:05 +0000 Subject: ARM: PL08x: cleanup comments Cleanup the formatting of comments, remove some which don't make sense anymore. Signed-off-by: Russell King [fix conflict with 96a608a4] Signed-off-by: Dan Williams --- include/linux/amba/pl08x.h | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 5b87b6a..3111385 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -12,7 +12,6 @@ * * Please credit ARM.com * Documentation: ARM DDI 0196D - * */ #ifndef AMBA_PL08X_H @@ -55,8 +54,8 @@ enum { * @circular_buffer: whether the buffer passed in is circular and * shall simply be looped round round (like a record baby round * round round round) - * @single: the device connected to this channel will request single - * DMA transfers, not bursts. (Bursts are default.) + * @single: the device connected to this channel will request single DMA + * transfers, not bursts. (Bursts are default.) * @periph_buses: the device connected to this channel is accessible via * these buses (use PL08X_AHB1 | PL08X_AHB2). */ @@ -78,8 +77,7 @@ struct pl08x_channel_data { * @addr: current address * @maxwidth: the maximum width of a transfer on this bus * @buswidth: the width of this bus in bytes: 1, 2 or 4 - * @fill_bytes: bytes required to fill to the next bus memory - * boundary + * @fill_bytes: bytes required to fill to the next bus memory boundary */ struct pl08x_bus_data { dma_addr_t addr; @@ -92,10 +90,10 @@ struct pl08x_bus_data { * struct pl08x_phy_chan - holder for the physical channels * @id: physical index to this channel * @lock: a lock to use when altering an instance of this struct - * @signal: the physical signal (aka channel) serving this - * physical channel right now - * @serving: the virtual channel currently being served by this - * physical channel + * @signal: the physical signal (aka channel) serving this physical channel + * right now + * @serving: the virtual channel currently being served by this physical + * channel */ struct pl08x_phy_chan { unsigned int id; @@ -119,7 +117,6 @@ struct pl08x_txd { size_t len; dma_addr_t llis_bus; struct pl08x_lli *llis_va; - bool active; /* Default cctl value for LLIs */ u32 cctl; /* @@ -130,8 +127,8 @@ struct pl08x_txd { }; /** - * struct pl08x_dma_chan_state - holds the PL08x specific virtual - * channel states + * struct pl08x_dma_chan_state - holds the PL08x specific virtual channel + * states * @PL08X_CHAN_IDLE: the channel is idle * @PL08X_CHAN_RUNNING: the channel has allocated a physical transport * channel and is running a transfer on it @@ -152,7 +149,7 @@ enum pl08x_dma_chan_state { * @chan: wrappped abstract channel * @phychan: the physical channel utilized by this channel, if there is one * @phychan_hold: if non-zero, hold on to the physical channel even if we - * have no pending entries + * have no pending entries * @tasklet: tasklet scheduled by the IRQ to handle actual work etc * @name: name of channel * @cd: channel platform data @@ -166,8 +163,8 @@ enum pl08x_dma_chan_state { * @host: a pointer to the host (internal use) * @state: whether the channel is idle, paused, running etc * @slave: whether this channel is a device (slave) or for memcpy - * @waiting: a TX descriptor on this channel which is waiting for - * a physical channel to become available + * @waiting: a TX descriptor on this channel which is waiting for a physical + * channel to become available */ struct pl08x_dma_chan { struct dma_chan chan; @@ -189,16 +186,16 @@ struct pl08x_dma_chan { }; /** - * struct pl08x_platform_data - the platform configuration for the - * PL08x PrimeCells. + * struct pl08x_platform_data - the platform configuration for the PL08x + * PrimeCells. * @slave_channels: the channels defined for the different devices on the * platform, all inclusive, including multiplexed channels. The available - * physical channels will be multiplexed around these signals as they - * are requested, just enumerate all possible channels. - * @get_signal: request a physical signal to be used for a DMA - * transfer immediately: if there is some multiplexing or similar blocking - * the use of the channel the transfer can be denied by returning - * less than zero, else it returns the allocated signal number + * physical channels will be multiplexed around these signals as they are + * requested, just enumerate all possible channels. + * @get_signal: request a physical signal to be used for a DMA transfer + * immediately: if there is some multiplexing or similar blocking the use + * of the channel the transfer can be denied by returning less than zero, + * else it returns the allocated signal number * @put_signal: indicate to the platform that this physical signal is not * running any DMA transfer and multiplexing can be recycled * @lli_buses: buses which LLIs can be fetched from: PL08X_AHB1 | PL08X_AHB2 -- cgit v1.1 From f06267104dd9112f11586830d22501d0e26245ea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 15:24:36 +0000 Subject: RDMA: Update workqueue usage * ib_wq is added, which is used as the common workqueue for infiniband instead of the system workqueue. All system workqueue usages including flush_scheduled_work() callers are converted to use and flush ib_wq. * cancel_delayed_work() + flush_scheduled_work() converted to cancel_delayed_work_sync(). * qib_wq is removed and ib_wq is used instead. This is to prepare for deprecation of flush_scheduled_work(). Signed-off-by: Tejun Heo Signed-off-by: Roland Dreier --- include/rdma/ib_verbs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e04c488..55cd0a0 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -47,10 +47,13 @@ #include #include #include +#include #include #include +extern struct workqueue_struct *ib_wq; + union ib_gid { u8 raw[16]; struct { -- cgit v1.1 From 2fe17c1075836b66678ed2a305fd09b6773883aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jan 2011 13:07:43 +0100 Subject: fallocate should be a file operation Currently all filesystems except XFS implement fallocate asynchronously, while XFS forced a commit. Both of these are suboptimal - in case of O_SYNC I/O we really want our allocation on disk, especially for the !KEEP_SIZE case where we actually grow the file with user-visible zeroes. On the other hand always commiting the transaction is a bad idea for fast-path uses of fallocate like for example in recent Samba versions. Given that block allocation is a data plane operation anyway change it from an inode operation to a file operation so that we have the file structure available that lets us check for O_SYNC. This also includes moving the code around for a few of the filesystems, and remove the already unnedded S_ISDIR checks given that we only wire up fallocate for regular files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 177b4dde..09b5bd6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1552,6 +1552,8 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + loff_t len); }; #define IPERM_FLAG_RCU 0x0001 @@ -1582,8 +1584,6 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*truncate_range)(struct inode *, loff_t, loff_t); - long (*fallocate)(struct inode *inode, int mode, loff_t offset, - loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); } ____cacheline_aligned; -- cgit v1.1 From c2b3e74b78b24cb367289a75a2bd30e569e56e0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Dec 2010 19:38:08 -0500 Subject: fs: Remove unlikely() from fput_light() In fput_light(), there's an unlikely(fput_needed), which running on my normal desktop doing firefox, xchat, evolution and part of my distcc farm, and running the annotate branch profiler shows that the unlikely is not very unlikely. correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 48 100 fput_light file.h 26 115828710 897415279 88 fput_light file.h 26 865271179 5286128445 85 fput_light file.h 26 19568539 8923664 31 fput_light file.h 26 12353677 3562279 22 fput_light file.h 26 267691 67062 20 fput_light file.h 26 15014853 348172 2 fput_light file.h 26 209258 205 0 fput_light file.h 26 1364164 0 0 fput_light file.h 26 Which gives 1032903812 times it was correct and 6203351846 times it was incorrect, or 85% incorrect. Cc: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Al Viro --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index b1e1297..e85baeb 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -23,7 +23,7 @@ extern struct file *alloc_file(struct path *, fmode_t mode, static inline void fput_light(struct file *file, int fput_needed) { - if (unlikely(fput_needed)) + if (fput_needed) fput(file); } -- cgit v1.1 From ecf5632dd189ab4c366cef853d6e5fe7adfe52e5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 16 Jan 2011 23:28:17 +0900 Subject: fs: fix address space warnings in ioctl_fiemap() The fi_extents_start field of struct fiemap_extent_info is a user pointer but was not marked as __user. This makes sparse emit following warnings: CHECK fs/ioctl.c fs/ioctl.c:114:26: warning: incorrect type in argument 1 (different address spaces) fs/ioctl.c:114:26: expected void [noderef] *dst fs/ioctl.c:114:26: got struct fiemap_extent *[assigned] dest fs/ioctl.c:202:14: warning: incorrect type in argument 1 (different address spaces) fs/ioctl.c:202:14: expected void const volatile [noderef] * fs/ioctl.c:202:14: got struct fiemap_extent *[assigned] fi_extents_start fs/ioctl.c:212:27: warning: incorrect type in argument 1 (different address spaces) fs/ioctl.c:212:27: expected void [noderef] *dst fs/ioctl.c:212:27: got char * Also add 'ufiemap' variable to eliminate unnecessary casts. Signed-off-by: Namhyung Kim Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 09b5bd6..32b38cd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1483,8 +1483,8 @@ struct fiemap_extent_info { unsigned int fi_flags; /* Flags as passed from user */ unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent - * array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ }; int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, u64 phys, u64 len, u32 flags); -- cgit v1.1 From 2a8652f4e0d11ee27b1d2870c600fd1300661a6e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 3 Nov 2010 11:11:15 +0100 Subject: ecryptfs: moved ECRYPTFS_SUPER_MAGIC definition to linux/magic.h The definition of ECRYPTFS_SUPER_MAGIC has been moved to the include file 'linux/magic.h' to become available to other kernel subsystems. Signed-off-by: Roberto Sassu Signed-off-by: Tyler Hicks --- include/linux/magic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/magic.h b/include/linux/magic.h index ff690d0..62730ea 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -16,6 +16,7 @@ #define TMPFS_MAGIC 0x01021994 #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ #define SQUASHFS_MAGIC 0x73717368 +#define ECRYPTFS_SUPER_MAGIC 0xf15f #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 #define EXT3_SUPER_MAGIC 0xEF53 -- cgit v1.1 From d2763b4f44e16f44cc4156c9591e74df9dcd88be Mon Sep 17 00:00:00 2001 From: Naveen Kumar Gaddipati Date: Mon, 17 Jan 2011 20:40:58 -0800 Subject: Input: bu21013_ts - remove duplicate resolution parameters Remove duplicate display resolution parameters from platform data as one pair is quite enough. Signed-off-by: Naveen Kumar Gaddipati Signed-off-by: Dmitry Torokhov --- include/linux/input/bu21013.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/input/bu21013.h b/include/linux/input/bu21013.h index e470d38..05e0328 100644 --- a/include/linux/input/bu21013.h +++ b/include/linux/input/bu21013.h @@ -12,8 +12,6 @@ * @cs_en: pointer to the cs enable function * @cs_dis: pointer to the cs disable function * @irq_read_val: pointer to read the pen irq value function - * @x_max_res: xmax resolution - * @y_max_res: ymax resolution * @touch_x_max: touch x max * @touch_y_max: touch y max * @cs_pin: chip select pin @@ -29,8 +27,6 @@ struct bu21013_platform_device { int (*cs_en)(int reset_pin); int (*cs_dis)(int reset_pin); int (*irq_read_val)(void); - int x_max_res; - int y_max_res; int touch_x_max; int touch_y_max; unsigned int cs_pin; -- cgit v1.1 From b4e104eaeb8cd4329a23e0e4ebf166681b1d182d Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 17 Jan 2011 11:05:40 +0800 Subject: ACPICA: Update all ACPICA copyrights and signons to 2011 Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acexcep.h | 2 +- include/acpi/acnames.h | 2 +- include/acpi/acoutput.h | 2 +- include/acpi/acpi.h | 2 +- include/acpi/acpiosxf.h | 2 +- include/acpi/acpixf.h | 2 +- include/acpi/acrestyp.h | 2 +- include/acpi/actbl.h | 2 +- include/acpi/actbl1.h | 2 +- include/acpi/actbl2.h | 2 +- include/acpi/actypes.h | 2 +- include/acpi/platform/acenv.h | 2 +- include/acpi/platform/acgcc.h | 2 +- include/acpi/platform/aclinux.h | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 17714be..5b6c391 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h index 9cf736e..fc1575f 100644 --- a/include/acpi/acnames.h +++ b/include/acpi/acnames.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index bc4a6de..ef1cef7 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h index a091cab..de39915 100644 --- a/include/acpi/acpi.h +++ b/include/acpi/acpi.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index 65b3f58..a3252a5 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -8,7 +8,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 241b8a0..2f38e29 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -6,7 +6,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h index e552635..0a66cc4 100644 --- a/include/acpi/acrestyp.h +++ b/include/acpi/acrestyp.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index ad20016..7e42bfe 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index c637b75..981349d 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index d4136b2..0fc15df 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 939a431..64f838b 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h index a3e334a..5af3ed5 100644 --- a/include/acpi/platform/acenv.h +++ b/include/acpi/platform/acenv.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index 5dcb953..e228893 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 572189e..5d2a5e9 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -5,7 +5,7 @@ *****************************************************************************/ /* - * Copyright (C) 2000 - 2010, Intel Corp. + * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without -- cgit v1.1 From 8d5f0a647395c1323787df675d2805cad54fc89f Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 17 Jan 2011 10:31:08 +0800 Subject: ACPICA: Update version to 20110112 Version 20110112. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 2f38e29..e46ec95 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -47,7 +47,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20101209 +#define ACPI_CA_VERSION 0x20110112 #include "actypes.h" #include "actbl.h" -- cgit v1.1 From 9af39713feb53da96ba23fa94a73ffd0de50a815 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 18 Dec 2010 09:20:59 -0300 Subject: [media] saa7146: Convert from .ioctl to .unlocked_ioctl Convert saa7146 to use core-assisted locking. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/saa7146.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/saa7146.h b/include/media/saa7146.h index ac7ce00..7982714 100644 --- a/include/media/saa7146.h +++ b/include/media/saa7146.h @@ -115,7 +115,7 @@ struct saa7146_dev /* different device locks */ spinlock_t slock; - struct mutex lock; + struct mutex v4l2_lock; unsigned char __iomem *mem; /* pointer to mapped IO memory */ u32 revision; /* chip revision; needed for bug-workarounds*/ -- cgit v1.1 From 3c7c9370fb645f4713e0fbbe69425d8db9b47a13 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 8 Jan 2011 07:08:02 -0300 Subject: [media] v4l2-subdev: remove core.s_config and v4l2_i2c_new_subdev_cfg() The core.s_config op was meant for legacy drivers that needed to work with old pre-2.6.26 kernels. This is no longer relevant. Unfortunately, this op was incorrectly called from several drivers. Replace those occurences with proper i2c_board_info structs and call v4l2_i2c_new_subdev_board. After these changes v4l2_i2c_new_subdev_cfg() was no longer used, so remove that function as well. Signed-off-by: Hans Verkuil Acked-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/media/mt9v011.h | 17 +++++++++++++++++ include/media/v4l2-common.h | 13 +------------ include/media/v4l2-subdev.h | 6 +----- 3 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 include/media/mt9v011.h (limited to 'include') diff --git a/include/media/mt9v011.h b/include/media/mt9v011.h new file mode 100644 index 0000000..ea29fc7 --- /dev/null +++ b/include/media/mt9v011.h @@ -0,0 +1,17 @@ +/* mt9v011 sensor + * + * Copyright (C) 2011 Hans Verkuil + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MT9V011_H__ +#define __MT9V011_H__ + +struct mt9v011_platform_data { + unsigned xtal; /* Hz */ +}; + +#endif diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 2d65b35..a659319 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -138,21 +138,10 @@ struct v4l2_subdev_ops; /* Load an i2c module and return an initialized v4l2_subdev struct. The client_type argument is the name of the chip that's on the adapter. */ -struct v4l2_subdev *v4l2_i2c_new_subdev_cfg(struct v4l2_device *v4l2_dev, +struct v4l2_subdev *v4l2_i2c_new_subdev(struct v4l2_device *v4l2_dev, struct i2c_adapter *adapter, const char *client_type, - int irq, void *platform_data, u8 addr, const unsigned short *probe_addrs); -/* Load an i2c module and return an initialized v4l2_subdev struct. - The client_type argument is the name of the chip that's on the adapter. */ -static inline struct v4l2_subdev *v4l2_i2c_new_subdev(struct v4l2_device *v4l2_dev, - struct i2c_adapter *adapter, const char *client_type, - u8 addr, const unsigned short *probe_addrs) -{ - return v4l2_i2c_new_subdev_cfg(v4l2_dev, adapter, client_type, 0, NULL, - addr, probe_addrs); -} - struct i2c_board_info; struct v4l2_subdev *v4l2_i2c_new_subdev_board(struct v4l2_device *v4l2_dev, diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index b0316a7..42fbe46 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -106,10 +106,7 @@ struct v4l2_subdev_io_pin_config { u8 strength; /* Pin drive strength */ }; -/* s_config: if set, then it is always called by the v4l2_i2c_new_subdev* - functions after the v4l2_subdev was registered. It is used to pass - platform data to the subdev which can be used during initialization. - +/* s_io_pin_config: configure one or more chip I/O pins for chips that multiplex different internal signal pads out to IO pins. This function takes a pointer to an array of 'n' pin configuration entries, one for @@ -141,7 +138,6 @@ struct v4l2_subdev_io_pin_config { struct v4l2_subdev_core_ops { int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip); int (*log_status)(struct v4l2_subdev *sd); - int (*s_config)(struct v4l2_subdev *sd, int irq, void *platform_data); int (*s_io_pin_config)(struct v4l2_subdev *sd, size_t n, struct v4l2_subdev_io_pin_config *pincfg); int (*init)(struct v4l2_subdev *sd, u32 val); -- cgit v1.1 From 45f6f84af3ae9db19f39bc5d0976d626b0ef626e Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 8 Jan 2011 07:15:53 -0300 Subject: [media] v4l2-subdev: add (un)register internal ops Some subdevs need to call into the board code after they are registered and have a valid struct v4l2_device pointer. The s_config op was abused for this, but now that it is removed we need a cleaner way of solving this. So this patch adds a struct with internal ops that the v4l2 core can call. Currently only two ops exist: register and unregister. Subdevs can implement these to call the board code and pass it the v4l2_device pointer, which the board code can then use to get access to the struct that embeds the v4l2_device. It is expected that in the future open and close ops will also be added. Signed-off-by: Hans Verkuil Acked-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-subdev.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 42fbe46..daf1e57 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -411,6 +411,21 @@ struct v4l2_subdev_ops { const struct v4l2_subdev_sensor_ops *sensor; }; +/* + * Internal ops. Never call this from drivers, only the v4l2 framework can call + * these ops. + * + * registered: called when this subdev is registered. When called the v4l2_dev + * field is set to the correct v4l2_device. + * + * unregistered: called when this subdev is unregistered. When called the + * v4l2_dev field is still set to the correct v4l2_device. + */ +struct v4l2_subdev_internal_ops { + int (*registered)(struct v4l2_subdev *sd); + void (*unregistered)(struct v4l2_subdev *sd); +}; + #define V4L2_SUBDEV_NAME_SIZE 32 /* Set this flag if this subdev is a i2c device. */ @@ -427,6 +442,8 @@ struct v4l2_subdev { u32 flags; struct v4l2_device *v4l2_dev; const struct v4l2_subdev_ops *ops; + /* Never call these internal ops from within a driver! */ + const struct v4l2_subdev_internal_ops *internal_ops; /* The control handler of this subdev. May be NULL. */ struct v4l2_ctrl_handler *ctrl_handler; /* name must be unique */ -- cgit v1.1 From 2a863793beaa0fc9ee7aeb87efe85544a6b129c0 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jan 2011 14:45:03 -0300 Subject: [media] v4l2-ctrls: v4l2_ctrl_handler_setup must set is_new to 1 Renamed has_new to is_new. Drivers can use the is_new field to determine if a new value was specified for a control. The v4l2_ctrl_handler_setup() must always set this to 1 since the setup has to force a full update of all controls. Signed-off-by: Hans Verkuil Acked-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-ctrls.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index d69ab4a..fcc9a0c 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -53,8 +53,10 @@ struct v4l2_ctrl_ops { * @handler: The handler that owns the control. * @cluster: Point to start of cluster array. * @ncontrols: Number of controls in cluster array. - * @has_new: Internal flag: set when there is a valid new value. * @done: Internal flag: set for each processed control. + * @is_new: Set when the user specified a new value for this control. It + * is also set when called from v4l2_ctrl_handler_setup. Drivers + * should never set this flag. * @is_private: If set, then this control is private to its handler and it * will not be added to any other handlers. Drivers can set * this flag. @@ -97,9 +99,9 @@ struct v4l2_ctrl { struct v4l2_ctrl_handler *handler; struct v4l2_ctrl **cluster; unsigned ncontrols; - unsigned int has_new:1; unsigned int done:1; + unsigned int is_new:1; unsigned int is_private:1; unsigned int is_volatile:1; -- cgit v1.1 From 5aad724280b9f8ffff3a55311ef0ba35ebb4099a Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Thu, 6 Jan 2011 16:59:36 -0300 Subject: [media] rc: fix up and genericize some time unit conversions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ene_ir driver was using a private define of MS_TO_NS, which is meant to be microseconds to nanoseconds. The mceusb driver copied it, intending to use is a milliseconds to microseconds. Lets move the defines to a common location, expand and standardize them a touch, so that we now have: MS_TO_NS - milliseconds to nanoseconds MS_TO_US - milliseconds to microseconds US_TO_NS - microseconds to nanoseconds Reported-by: David Härdeman CC: Maxim Levitsky Signed-off-by: Jarod Wilson Signed-off-by: Mauro Carvalho Chehab --- include/media/rc-core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/media/rc-core.h b/include/media/rc-core.h index a23c1fc..2963263 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -183,6 +183,9 @@ static inline void init_ir_raw_event(struct ir_raw_event *ev) } #define IR_MAX_DURATION 0xFFFFFFFF /* a bit more than 4 seconds */ +#define US_TO_NS(usec) ((usec) * 1000) +#define MS_TO_US(msec) ((msec) * 1000) +#define MS_TO_NS(msec) ((msec) * 1000 * 1000) void ir_raw_event_handle(struct rc_dev *dev); int ir_raw_event_store(struct rc_dev *dev, struct ir_raw_event *ev); -- cgit v1.1 From 01c40c048b0f3f377e6d27b35fd99f04efcc21dd Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 19 Nov 2010 11:20:06 -0300 Subject: [media] v4l: Include linux/videodev2.h in media/v4l2-ctrls.h The later makes extensive use of structures defined in the former. Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-ctrls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index fcc9a0c..97d0638 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -23,6 +23,7 @@ #include #include +#include /* forward references */ struct v4l2_ctrl_handler; -- cgit v1.1 From 765c2a964b49bd06b61a52991519281c85d82b67 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 19 Jan 2011 12:06:52 +0530 Subject: Bluetooth: Fix race condition with conn->sec_level The conn->sec_level value is supposed to represent the current level of security that the connection has. However, by assigning to it before requesting authentication it will have the wrong value during the authentication procedure. To fix this a pending_sec_level variable is added which is used to track the desired security level while making sure that sec_level always represents the current level of security. Signed-off-by: Johan Hedberg Signed-off-by: Gustavo F. Padovan --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a29feb0..d2cf884 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -184,6 +184,7 @@ struct hci_conn { __u32 link_mode; __u8 auth_type; __u8 sec_level; + __u8 pending_sec_level; __u8 power_save; __u16 disc_timeout; unsigned long pend; -- cgit v1.1 From 4580ccc04ddd8c17a470573a7fdb8def2e036dfa Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 18 Jan 2011 22:39:00 +0000 Subject: sctp: user perfect name for Delayed SACK Timer option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The option name of Delayed SACK Timer should be SCTP_DELAYED_SACK, not SCTP_DELAYED_ACK. Left SCTP_DELAYED_ACK be concomitant with SCTP_DELAYED_SACK, for making compatibility with existing applications. Reference: 8.1.19. Get or Set Delayed SACK Timer (SCTP_DELAYED_SACK) (http://tools.ietf.org/html/draft-ietf-tsvwg-sctpsocket-25) Signed-off-by: Shan Wei Acked-by: Wei Yongjun Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/user.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index 2b2769c..92eedc0 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -78,6 +78,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_GET_PEER_ADDR_INFO 15 #define SCTP_DELAYED_ACK_TIME 16 #define SCTP_DELAYED_ACK SCTP_DELAYED_ACK_TIME +#define SCTP_DELAYED_SACK SCTP_DELAYED_ACK_TIME #define SCTP_CONTEXT 17 #define SCTP_FRAGMENT_INTERLEAVE 18 #define SCTP_PARTIAL_DELIVERY_POINT 19 /* Set/Get partial delivery point */ -- cgit v1.1 From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:06:35 +0100 Subject: lockdep: Move early boot local IRQ enable/disable status to init/main.c During early boot, local IRQ is disabled until IRQ subsystem is properly initialized. During this time, no one should enable local IRQ and some operations which usually are not allowed with IRQ disabled, e.g. operations which might sleep or require communications with other processors, are allowed. lockdep tracked this with early_boot_irqs_off/on() callbacks. As other subsystems need this information too, move it to init/main.c and make it generally available. While at it, toggle the boolean to early_boot_irqs_disabled instead of enabled so that it can be initialized with %false and %true indicates the exceptional condition. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110635.GB6036@htj.dyndns.org> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 ++ include/linux/lockdep.h | 8 -------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5a9d905..d07d805 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -243,6 +243,8 @@ extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags; +extern bool early_boot_irqs_disabled; + /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26..f638fd7 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -436,16 +436,8 @@ do { \ #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_TRACE_IRQFLAGS -extern void early_boot_irqs_off(void); -extern void early_boot_irqs_on(void); extern void print_irqtrace_events(struct task_struct *curr); #else -static inline void early_boot_irqs_off(void) -{ -} -static inline void early_boot_irqs_on(void) -{ -} static inline void print_irqtrace_events(struct task_struct *curr) { } -- cgit v1.1 From ca3e021417eed30ec2b64ce88eb0acf64aa9bc29 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 20 Jan 2011 14:44:24 -0800 Subject: memcg: fix USED bit handling at uncharge in THP Now, under THP: at charge: - PageCgroupUsed bit is set to all page_cgroup on a hugepage. ....set to 512 pages. at uncharge - PageCgroupUsed bit is unset on the head page. So, some pages will remain with "Used" bit. This patch fixes that Used bit is set only to the head page. Used bits for tail pages will be set at splitting if necessary. This patch adds this lock order: compound_lock() -> page_cgroup_move_lock(). [akpm@linux-foundation.org: fix warning] Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6a576f9..f512e18 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -146,6 +146,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); +#endif + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -335,6 +339,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem) return 0; } +static inline void mem_cgroup_split_huge_fixup(struct page *head, + struct page *tail) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.1 From 2d6d9fd3a54a28c6f67f26eb6c74803307a1b11e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 19 Jan 2011 22:27:14 +0100 Subject: ACPI: Introduce acpi_os_ioremap() Commit ca9b600be38c ("ACPI / PM: Make suspend_nvs_save() use acpi_os_map_memory()") attempted to prevent the code in osl.c and nvs.c from using different ioremap() variants by making the latter use acpi_os_map_memory() for mapping the NVS pages. However, that also requires acpi_os_unmap_memory() to be used for unmapping them, which causes synchronize_rcu() to be executed many times in a row unnecessarily and introduces substantial delays during resume on some systems. Instead of using acpi_os_map_memory() for mapping the NVS pages in nvs.c introduce acpi_os_ioremap() calling ioremap_cache() and make the code in both osl.c and nvs.c use it. Reported-by: Jeff Chua Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- include/linux/acpi.h | 3 --- include/linux/acpi_io.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 include/linux/acpi_io.h (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index eb176bb..a2e910e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -306,9 +306,6 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req); extern void acpi_early_init(void); -int acpi_os_map_generic_address(struct acpi_generic_address *addr); -void acpi_os_unmap_generic_address(struct acpi_generic_address *addr); - #else /* !CONFIG_ACPI */ #define acpi_disabled 1 diff --git a/include/linux/acpi_io.h b/include/linux/acpi_io.h new file mode 100644 index 0000000..7180013 --- /dev/null +++ b/include/linux/acpi_io.h @@ -0,0 +1,16 @@ +#ifndef _ACPI_IO_H_ +#define _ACPI_IO_H_ + +#include +#include + +static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, + acpi_size size) +{ + return ioremap_cache(phys, size); +} + +int acpi_os_map_generic_address(struct acpi_generic_address *addr); +void acpi_os_unmap_generic_address(struct acpi_generic_address *addr); + +#endif -- cgit v1.1 From 9190b3b3208d052d98cb601fcc192f3f71a5658b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jan 2011 23:31:33 -0800 Subject: net_sched: accurate bytes/packets stats/rates In commit 44b8288308ac9d (net_sched: pfifo_head_drop problem), we fixed a problem with pfifo_head drops that incorrectly decreased sch->bstats.bytes and sch->bstats.packets Several qdiscs (CHOKe, SFQ, pfifo_head, ...) are able to drop a previously enqueued packet, and bstats cannot be changed, so bstats/rates are not accurate (over estimated) This patch changes the qdisc_bstats updates to be done at dequeue() time instead of enqueue() time. bstats counters no longer account for dropped frames, and rates are more correct, since enqueue() bursts dont have effect on dequeue() rate. Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/sch_generic.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e9eee99..160a407 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -445,7 +445,6 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, { __skb_queue_tail(list, skb); sch->qstats.backlog += qdisc_pkt_len(skb); - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } @@ -460,8 +459,10 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, { struct sk_buff *skb = __skb_dequeue(list); - if (likely(skb != NULL)) + if (likely(skb != NULL)) { sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + } return skb; } @@ -474,10 +475,11 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct sk_buff_head *list) { - struct sk_buff *skb = __qdisc_dequeue_head(sch, list); + struct sk_buff *skb = __skb_dequeue(list); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); + sch->qstats.backlog -= len; kfree_skb(skb); return len; } -- cgit v1.1 From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2011 19:41:35 +0100 Subject: genirq: Remove __do_IRQ All architectures are finally converted. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Richard Henderson Cc: Mike Frysinger Cc: David Howells Cc: Tony Luck Cc: Greg Ungerer Cc: Michal Simek Acked-by: David Howells Cc: Kyle McMartin Acked-by: Benjamin Herrenschmidt Cc: Chen Liqin Cc: "David S. Miller" Cc: Chris Metcalf Cc: Jeff Dike --- include/linux/irqdesc.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 6a64c6f..c1a95b7 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -101,13 +101,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) #define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) /* - * Monolithic do_IRQ implementation. - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ -extern unsigned int __do_IRQ(unsigned int irq); -#endif - -/* * Architectures call this to let the generic IRQ layer * handle an interrupt. If the descriptor is attached to an * irqchip-style controller then we call the ->handle_irq() handler, @@ -115,14 +108,7 @@ extern unsigned int __do_IRQ(unsigned int irq); */ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) { -#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); -#else - if (likely(desc->handle_irq)) - desc->handle_irq(irq, desc); - else - __do_IRQ(irq); -#endif } static inline void generic_handle_irq(unsigned int irq) -- cgit v1.1 From 1daeddd5962acad1bea55e524fc0fadf32654a21 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 13 Jan 2011 09:30:49 -0800 Subject: rtc: Cleanup removed UIE emulation declaration rtc_dev_update_irq_enable_emul was removed in commit 042620a018afcfba1d678062b62e463b9e43a68d (UIE emulation is now handled via hrtimer), but the declaration was missed. This patch cleans it up. Signed-off-by: John Stultz LKML-Reference: <1294939849-20608-1-git-send-email-john.stultz@linaro.org> Signed-off-by: Thomas Gleixner --- include/linux/rtc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 3c995b4..ea76069 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -235,8 +235,6 @@ extern int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq); extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled); -extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, - unsigned int enabled); void rtc_aie_update_irq(void *private); void rtc_uie_update_irq(void *private); -- cgit v1.1 From aa0be0f4659f91f31e45adc422b1788cb36ffddc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 20 Jan 2011 15:26:12 -0800 Subject: RTC: Propagate error handling via rtc_timer_enqueue properly In cases where RTC hardware does not support alarms, the virtualized RTC interfaces did not have a way to propagate the error up to userland. This patch extends rtc_timer_enqueue so it catches errors from the hardware and returns them upwards to the virtualized interfaces. To simplify error handling, it also internalizes the management of the timer->enabled bit into rtc_timer_enqueue and rtc_timer_remove. Also makes rtc_timer_enqueue and rtc_timer_remove static. Reported-by: David Daney Reported-by: Andreas Schwab Reported-by: Geert Uytterhoeven Diagnosed-by: David Daney Tested-by: David Daney Signed-off-by: John Stultz LKML-Reference: <1295565973-14358-1-git-send-email-john.stultz@linaro.org> Signed-off-by: Thomas Gleixner --- include/linux/rtc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index ea76069..a0b639f 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -244,8 +244,6 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); -void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); -void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, ktime_t expires, ktime_t period); -- cgit v1.1 From 3dece370ecc7c6152b3fdd21c6de28179f6e643d Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Fri, 21 Jan 2011 08:49:56 +0100 Subject: mm: System without MMU do not need pte_mkwrite The patch "thp: export maybe_mkwrite" (commit 14fd403f2146) breaks systems without MMU. Error log: CC arch/microblaze/mm/init.o In file included from include/linux/mman.h:14, from arch/microblaze/mm/consistent.c:24: include/linux/mm.h: In function 'maybe_mkwrite': include/linux/mm.h:482: error: implicit declaration of function 'pte_mkwrite' include/linux/mm.h:482: error: incompatible types in assignment Signed-off-by: Michal Simek CC: Andrea Arcangeli Reviewed-by: Rik van Riel CC: Andrew Morton CC: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 956a355..f6385fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -470,6 +470,7 @@ static inline void set_compound_order(struct page *page, unsigned long order) page[1].lru.prev = (void *)order; } +#ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want @@ -482,6 +483,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) pte = pte_mkwrite(pte); return pte; } +#endif /* * Multiple processes may "see" the same page. E.g. for untouched -- cgit v1.1 From d14fc1a74e846d7851f24fc9519fe87dc12a1231 Mon Sep 17 00:00:00 2001 From: Libor Pechacek Date: Fri, 14 Jan 2011 14:30:21 +0100 Subject: USB: serial: handle Data Carrier Detect changes Alan's commit 335f8514f200e63d689113d29cb7253a5c282967 introduced .carrier_raised function in several drivers. That also means tty_port_block_til_ready can now suspend the process trying to open the serial port when Carrier Detect is low and put it into tty_port.open_wait queue. We need to wake up the process when Carrier Detect goes high and trigger TTY hangup when CD goes low. Some of the devices do not report modem status line changes, or at least we don't understand the status message, so for those we remove .carrier_raised again. Signed-off-by: Libor Pechacek Cc: stable Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/serial.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 16d682f..c904913 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -347,6 +347,9 @@ extern int usb_serial_generic_prepare_write_buffer(struct usb_serial_port *port, extern int usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch); extern int usb_serial_handle_break(struct usb_serial_port *port); +extern void usb_serial_handle_dcd_change(struct usb_serial_port *usb_port, + struct tty_struct *tty, + unsigned int status); extern int usb_serial_bus_register(struct usb_serial_driver *device); -- cgit v1.1 From e94965ed5beb23c6fabf7ed31f625e66d7ff28de Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 15 Dec 2010 14:00:19 -0800 Subject: module: show version information for built-in modules in sysfs Currently only drivers that are built as modules have their versions shown in /sys/module//version, but this information might also be useful for built-in drivers as well. This especially important for drivers that do not define any parameters - such drivers, if built-in, are completely invisible from userspace. This patch changes MODULE_VERSION() macro so that in case when we are compiling built-in module, version information is stored in a separate section. Kernel then uses this data to create 'version' sysfs attribute in the same fashion it creates attributes for module parameters. Signed-off-by: Dmitry Torokhov Signed-off-by: Rusty Russell --- include/asm-generic/vmlinux.lds.h | 7 +++++++ include/linux/module.h | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6864933..6ebb810 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -364,6 +364,13 @@ VMLINUX_SYMBOL(__start___param) = .; \ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ + } \ + \ + /* Built-in module versions. */ \ + __modver : AT(ADDR(__modver) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start___modver) = .; \ + *(__modver) \ + VMLINUX_SYMBOL(__stop___modver) = .; \ . = ALIGN((align)); \ VMLINUX_SYMBOL(__end_rodata) = .; \ } \ diff --git a/include/linux/module.h b/include/linux/module.h index 8b17fd8..50efcd3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -58,6 +58,12 @@ struct module_attribute { void (*free)(struct module *); }; +struct module_version_attribute { + struct module_attribute mattr; + const char *module_name; + const char *version; +}; + struct module_kobject { struct kobject kobj; @@ -161,7 +167,28 @@ extern struct module __this_module; Using this automatically adds a checksum of the .c files and the local headers in "srcversion". */ + +#ifdef MODULE #define MODULE_VERSION(_version) MODULE_INFO(version, _version) +#else +#define MODULE_VERSION(_version) \ + extern ssize_t __modver_version_show(struct module_attribute *, \ + struct module *, char *); \ + static struct module_version_attribute __modver_version_attr \ + __used \ + __attribute__ ((__section__ ("__modver"),aligned(sizeof(void *)))) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ + }, \ + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + } +#endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware -- cgit v1.1 From 3b90a5b292321b2acac3921f77046ae195aef53f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:32:51 -0600 Subject: module: fix linker error for MODULE_VERSION when !MODULE and CONFIG_SYSFS=n lib/built-in.o:(__modver+0x8): undefined reference to `__modver_version_show' lib/built-in.o:(__modver+0x2c): undefined reference to `__modver_version_show' Simplest to just not emit anything: if they've disabled SYSFS they probably want the smallest kernel possible. Reported-by: Randy Dunlap Signed-off-by: Rusty Russell --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 50efcd3..e7c6385 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -168,7 +168,7 @@ extern struct module __this_module; local headers in "srcversion". */ -#ifdef MODULE +#if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ -- cgit v1.1 From b75be4204e7871869b2c268c00783703197aaa7d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 5 Jan 2011 13:27:04 +0100 Subject: param: add null statement to compiled-in module params Add an unused struct declaration statement requiring a terminating semicolon to the compile-in case to provoke an error if __MODULE_INFO() is used without the terminating semicolon. Previously MODULE_ALIAS("foo") (no semicolon) compiled fine if MODULE was not selected. Cc: Dan Carpenter Signed-off-by: Linus Walleij Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 112adf8..07b4195 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -16,15 +16,17 @@ /* Chosen so that structs with an unsigned long line up. */ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) -#ifdef MODULE #define ___module_cat(a,b) __mod_ ## a ## b #define __module_cat(a,b) ___module_cat(a,b) +#ifdef MODULE #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ __used __attribute__((section(".modinfo"), unused, aligned(1))) \ = __stringify(tag) "=" info #else /* !MODULE */ -#define __MODULE_INFO(tag, name, info) +/* This struct is here for syntactic coherency, it is not used */ +#define __MODULE_INFO(tag, name, info) \ + struct __module_cat(name,__LINE__) {} #endif #define __MODULE_PARM_TYPE(name, _type) \ __MODULE_INFO(parmtype, name##type, #name ":" _type) -- cgit v1.1 From 7ef88ad561457c0346355dfd1f53e503ddfde719 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:45:10 -0600 Subject: BUILD_BUG_ON: make it handle more cases BUILD_BUG_ON used to use the optimizer to do code elimination or fail at link time; it was changed to first the size of a negative array (a nicer compile time error), then (in 8c87df457cb58fe75b9b893007917cf8095660a0) to a bitfield. This forced us to change some non-constant cases to MAYBE_BUILD_BUG_ON(); as Jan points out in that commit, it didn't work as intended anyway. bitfields: needs a literal constant at parse time, and can't be put under "if (__builtin_constant_p(x))" for example. negative array: can handle anything, but if the compiler can't tell it's a constant, silently has no effect. link time: breaks link if the compiler can't determine the value, but the linker output is not usually as informative as a compiler error. If we use the negative-array-size method *and* the link time trick, we get the ability to use BUILD_BUG_ON() under __builtin_constant_p() branches, and maximal ability for the compiler to detect errors at build time. We also document it thoroughly. Signed-off-by: Rusty Russell Cc: Jan Beulich Acked-by: Hollis Blanchard --- include/linux/kernel.h | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d07d805..864712f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -575,12 +575,6 @@ struct sysinfo { char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */ }; -/* Force a compilation error if condition is true */ -#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) - -/* Force a compilation error if condition is constant and true */ -#define MAYBE_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)])) - /* Force a compilation error if a constant expression is not a power of 2 */ #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) @@ -592,6 +586,33 @@ struct sysinfo { #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @cond: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + * + * The implementation uses gcc's reluctance to create a negative array, but + * gcc (as of 4.4) only emits that error for obvious cases (eg. not arguments + * to inline functions). So as a fallback we use the optimizer; if it can't + * prove the condition is false, it will cause a link error on the undefined + * "__build_bug_on_failed". This error message can be harder to track down + * though, hence the two different methods. + */ +#ifndef __OPTIMIZE__ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +extern int __build_bug_on_failed; +#define BUILD_BUG_ON(condition) \ + do { \ + ((void)sizeof(char[1 - 2*!!(condition)])); \ + if (condition) __build_bug_on_failed = 1; \ + } while(0) +#endif +#define MAYBE_BUILD_BUG_ON(condition) BUILD_BUG_ON(condition) + /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) -- cgit v1.1 From 1765e3a4933ea0870fabd755feffc5473c4363ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 24 Jan 2011 14:45:10 -0600 Subject: Remove MAYBE_BUILD_BUG_ON Now BUILD_BUG_ON() can handle optimizable constants, we don't need MAYBE_BUILD_BUG_ON any more. Signed-off-by: Rusty Russell --- include/linux/gfp.h | 2 +- include/linux/kernel.h | 1 - include/linux/kmemcheck.h | 2 +- include/linux/virtio_config.h | 5 ++++- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a3b148a..0b84c61 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -249,7 +249,7 @@ static inline enum zone_type gfp_zone(gfp_t flags) ((1 << ZONES_SHIFT) - 1); if (__builtin_constant_p(bit)) - MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); else { #ifdef CONFIG_DEBUG_VM BUG_ON((GFP_ZONE_BAD >> bit) & 1); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 864712f..e2f4d6a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -611,7 +611,6 @@ extern int __build_bug_on_failed; if (condition) __build_bug_on_failed = 1; \ } while(0) #endif -#define MAYBE_BUILD_BUG_ON(condition) BUILD_BUG_ON(condition) /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 08d7dc4..39f8453 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -76,7 +76,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); \ _n = (long) &((ptr)->name##_end) \ - (long) &((ptr)->name##_begin); \ - MAYBE_BUILD_BUG_ON(_n < 0); \ + BUILD_BUG_ON(_n < 0); \ \ kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ } while (0) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 0093dd7..800617b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -109,7 +109,10 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, unsigned int fbit) { /* Did you forget to fix assumptions on max features? */ - MAYBE_BUILD_BUG_ON(fbit >= 32); + if (__builtin_constant_p(fbit)) + BUILD_BUG_ON(fbit >= 32); + else + BUG_ON(fbit >= 32); if (fbit < VIRTIO_TRANSPORT_F_START) virtio_check_driver_offered_feature(vdev, fbit); -- cgit v1.1 From 8c6a98b22b750c9eb52653ba643faa17db8d3881 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 24 Jan 2011 09:31:38 -0800 Subject: Input: sysrq - ensure sysrq_enabled and __sysrq_enabled are consistent Currently sysrq_enabled and __sysrq_enabled are initialised separately and inconsistently, leading to sysrq being actually enabled by reported as not enabled in sysfs. The first change to the sysfs configurable synchronises these two: static int __read_mostly sysrq_enabled = 1; static int __sysrq_enabled; Add a common define to carry the default for these preventing them becoming out of sync again. Default this to 1 to mirror previous behaviour. Signed-off-by: Andy Whitcroft Cc: stable@kernel.org Signed-off-by: Dmitry Torokhov --- include/linux/sysrq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 387fa7d..7faf933 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -17,6 +17,9 @@ #include #include +/* Enable/disable SYSRQ support by default (0==no, 1==yes). */ +#define SYSRQ_DEFAULT_ENABLE 1 + /* Possible values of bitmask for enabling sysrq functions */ /* 0x0001 is reserved for enable everything */ #define SYSRQ_ENABLE_LOG 0x0002 -- cgit v1.1 From ac61c46f4f7665ab4548e90430c37b2529e16cff Mon Sep 17 00:00:00 2001 From: David Dillow Date: Sun, 16 Jan 2011 15:12:39 -0500 Subject: [SCSI] fix incorrect value of SCSI_MAX_SG_CHAIN_SEGMENTS due to include file ordering If the compiled object doesn't include linux/scatterlist.h before scsi/scsi.h, it will get an incorrect definition of SCSI_MAX_SG_CHAIN_SEGMENTS. Signed-off-by: David Dillow Cc: stable@kernel.org Signed-off-by: James Bottomley --- include/scsi/scsi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 648d233..b76d400 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -9,6 +9,7 @@ #define _SCSI_SCSI_H #include +#include struct scsi_cmnd; -- cgit v1.1 From 58bbf018a70c562437eeae121a5d021ba7fe56a5 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 24 Jan 2011 17:14:26 -0500 Subject: drm/radeon/kms: add new radeon_info ioctl query for clock crystal freq Needed for timer queries in the 3D driver. Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- include/drm/radeon_drm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h index e95a86b..e5c607a 100644 --- a/include/drm/radeon_drm.h +++ b/include/drm/radeon_drm.h @@ -907,6 +907,7 @@ struct drm_radeon_cs { #define RADEON_INFO_TILING_CONFIG 0x06 #define RADEON_INFO_WANT_HYPERZ 0x07 #define RADEON_INFO_WANT_CMASK 0x08 /* get access to CMASK on r300 */ +#define RADEON_INFO_CLOCK_CRYSTAL_FREQ 0x09 /* clock crystal frequency */ struct drm_radeon_info { uint32_t request; -- cgit v1.1 From eb03355660b44cf6b1ed2f895085b9de8f74efbc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 24 Jan 2011 15:11:08 +0000 Subject: drm: Add an interface to reset the device Iterate over the attached CRTCs, encoders and connectors and call the supplied reset vfunc in order to reset any cached state back to unknown. Useful after an invalidation event such as a GPU reset or resuming. Tested-by: Takashi Iwai Signed-off-by: Chris Wilson --- include/drm/drm_crtc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index acd7fad..801be59 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -275,6 +275,7 @@ struct drm_pending_vblank_event; /** * drm_crtc_funcs - control CRTCs for a given device + * @reset: reset CRTC after state has been invalidate (e.g. resume) * @dpms: control display power levels * @save: save CRTC state * @resore: restore CRTC state @@ -302,6 +303,8 @@ struct drm_crtc_funcs { void (*save)(struct drm_crtc *crtc); /* suspend? */ /* Restore CRTC state */ void (*restore)(struct drm_crtc *crtc); /* resume? */ + /* Reset CRTC state */ + void (*reset)(struct drm_crtc *crtc); /* cursor controls */ int (*cursor_set)(struct drm_crtc *crtc, struct drm_file *file_priv, @@ -379,6 +382,7 @@ struct drm_crtc { * @dpms: set power state (see drm_crtc_funcs above) * @save: save connector state * @restore: restore connector state + * @reset: reset connector after state has been invalidate (e.g. resume) * @mode_valid: is this mode valid on the given connector? * @mode_fixup: try to fixup proposed mode for this connector * @mode_set: set this mode @@ -396,6 +400,7 @@ struct drm_connector_funcs { void (*dpms)(struct drm_connector *connector, int mode); void (*save)(struct drm_connector *connector); void (*restore)(struct drm_connector *connector); + void (*reset)(struct drm_connector *connector); /* Check to see if anything is attached to the connector. * @force is set to false whilst polling, true when checking the @@ -413,6 +418,7 @@ struct drm_connector_funcs { }; struct drm_encoder_funcs { + void (*reset)(struct drm_encoder *encoder); void (*destroy)(struct drm_encoder *encoder); }; @@ -656,6 +662,7 @@ extern struct drm_display_mode *drm_mode_duplicate(struct drm_device *dev, struct drm_display_mode *mode); extern void drm_mode_debug_printmodeline(struct drm_display_mode *mode); extern void drm_mode_config_init(struct drm_device *dev); +extern void drm_mode_config_reset(struct drm_device *dev); extern void drm_mode_config_cleanup(struct drm_device *dev); extern void drm_mode_set_name(struct drm_display_mode *mode); extern bool drm_mode_equal(struct drm_display_mode *mode1, struct drm_display_mode *mode2); -- cgit v1.1 From 731f3f482ad3b2c58a1af2d0a9a634a82803706a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Jan 2011 03:05:28 +0000 Subject: NFS: nfsacl_{encode,decode} should return signed integer Clean up. The nfsacl_encode() and nfsacl_decode() functions return negative errno values, and each call site verifies that the returned value is not negative. Change the synopsis of both of these functions to reflect this usage. Document the synopsis and return values. Reported-by: Trond Myklebust Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/nfsacl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h index f321b57..fabcb1e 100644 --- a/include/linux/nfsacl.h +++ b/include/linux/nfsacl.h @@ -51,10 +51,10 @@ nfsacl_size(struct posix_acl *acl_access, struct posix_acl *acl_default) return w; } -extern unsigned int +extern int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, struct posix_acl *acl, int encode_entries, int typeflag); -extern unsigned int +extern int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, struct posix_acl **pacl); -- cgit v1.1 From f61f6da0d53842e849bab7f69e1431bd3de1136d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Jan 2011 03:05:38 +0000 Subject: NFS: Prevent memory allocation failure in nfsacl_encode() nfsacl_encode() allocates memory in certain cases. This of course is not guaranteed to work. Since commit 9f06c719 "SUNRPC: New xdr_streams XDR encoder API", the kernel's XDR encoders can't return a result indicating possibly a failure, so a memory allocation failure in nfsacl_encode() has become fatal (ie, the XDR code Oopses) in some cases. However, the allocated memory is a tiny fixed amount, on the order of 40-50 bytes. We can easily use a stack-allocated buffer for this, with only a wee bit of nose-holding. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/posix_acl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index d68283a..54211c1 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -71,6 +71,7 @@ posix_acl_release(struct posix_acl *acl) /* posix_acl.c */ +extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); extern struct posix_acl *posix_acl_clone(const struct posix_acl *, gfp_t); extern int posix_acl_valid(const struct posix_acl *); -- cgit v1.1 From 778be232a207e79088ba70d832ac25dfea6fbf1a Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 25 Jan 2011 15:38:01 +0000 Subject: NFS do not find client in NFSv4 pg_authenticate The information required to find the nfs_client cooresponding to the incoming back channel request is contained in the NFS layer. Perform minimal checking in the RPC layer pg_authenticate method, and push more detailed checking into the NFS layer where the nfs_client can be found. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- include/linux/sunrpc/bc_xprt.h | 13 ------------- include/linux/sunrpc/svc_xprt.h | 1 - 2 files changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index c50b458..0828842 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -47,14 +47,6 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) return 1; return 0; } -static inline struct nfs4_sessionid *bc_xprt_sid(struct svc_rqst *rqstp) -{ - if (svc_is_backchannel(rqstp)) - return (struct nfs4_sessionid *) - rqstp->rq_server->sv_bc_xprt->xpt_bc_sid; - return NULL; -} - #else /* CONFIG_NFS_V4_1 */ static inline int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) @@ -67,11 +59,6 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) return 0; } -static inline struct nfs4_sessionid *bc_xprt_sid(struct svc_rqst *rqstp) -{ - return NULL; -} - static inline void xprt_free_bc_request(struct rpc_rqst *req) { } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 059877b..7ad9751 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -77,7 +77,6 @@ struct svc_xprt { size_t xpt_remotelen; /* length of address */ struct rpc_wait_queue xpt_bc_pending; /* backchannel wait queue */ struct list_head xpt_users; /* callbacks on free */ - void *xpt_bc_sid; /* back channel session ID */ struct net *xpt_net; struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ -- cgit v1.1 From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Tue, 25 Jan 2011 15:07:35 -0800 Subject: console: rename acquire/release_console_sem() to console_lock/unlock() The -rt patches change the console_semaphore to console_mutex. As a result, a quite large chunk of the patches changes all acquire/release_console_sem() to acquire/release_console_mutex() This commit makes things use more neutral function names which dont make implications about the underlying lock. The only real change is the return value of console_trylock which is inverted from try_acquire_console_sem() This patch also paves the way to switching console_sem from a semaphore to a mutex. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert] Signed-off-by: Torben Hohn Cc: Thomas Gleixner Cc: Greg KH Cc: Ingo Molnar Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 9774fe6..7453cfd 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -139,9 +139,9 @@ extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_n extern void register_console(struct console *); extern int unregister_console(struct console *); extern struct console *console_drivers; -extern void acquire_console_sem(void); -extern int try_acquire_console_sem(void); -extern void release_console_sem(void); +extern void console_lock(void); +extern int console_trylock(void); +extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern struct tty_driver *console_device(int *); -- cgit v1.1 From 52fe7c9cc1637110ba4e0e6fe5d07cc0786d62de Mon Sep 17 00:00:00 2001 From: "sjur.brandeland@stericsson.com" Date: Sat, 29 Jan 2011 13:10:37 +0000 Subject: caif: bugfix - add caif headers for userspace usage. Add caif_socket.h and if_caif.h to the kernel header files exported for use by userspace. Signed-off-by: Sjur Braendeland Signed-off-by: David S. Miller --- include/linux/Kbuild | 1 + include/linux/caif/Kbuild | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 include/linux/caif/Kbuild (limited to 'include') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 2296d8b..b0ada6f 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -1,5 +1,6 @@ header-y += byteorder/ header-y += can/ +header-y += caif/ header-y += dvb/ header-y += hdlc/ header-y += isdn/ diff --git a/include/linux/caif/Kbuild b/include/linux/caif/Kbuild new file mode 100644 index 0000000..a9cf250 --- /dev/null +++ b/include/linux/caif/Kbuild @@ -0,0 +1,2 @@ +header-y += caif_socket.h +header-y += if_caif.h -- cgit v1.1 From 709b46e8d90badda1898caea50483c12af178e96 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 29 Jan 2011 16:15:56 +0000 Subject: net: Add compat ioctl support for the ipv4 multicast ioctl SIOCGETSGCNT SIOCGETSGCNT is not a unique ioctl value as it it maps tio SIOCPROTOPRIVATE +1, which unfortunately means the existing infrastructure for compat networking ioctls is insufficient. A trivial compact ioctl implementation would conflict with: SIOCAX25ADDUID SIOCAIPXPRISLT SIOCGETSGCNT_IN6 SIOCGETSGCNT SIOCRSSCAUSE SIOCX25SSUBSCRIP SIOCX25SDTEFACILITIES To make this work I have updated the compat_ioctl decode path to mirror the the normal ioctl decode path. I have added an ipv4 inet_compat_ioctl function so that I can have ipv4 specific compat ioctls. I have added a compat_ioctl function into struct proto so I can break out ioctls by which kind of ip socket I am using. I have added a compat_raw_ioctl function because SIOCGETSGCNT only works on raw sockets. I have added a ipmr_compat_ioctl that mirrors the normal ipmr_ioctl. This was necessary because unfortunately the struct layout for the SIOCGETSGCNT has unsigned longs in it so changes between 32bit and 64bit kernels. This change was sufficient to run a 32bit ip multicast routing daemon on a 64bit kernel. Reported-by: Bill Fenner Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/mroute.h | 1 + include/net/sock.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 0fa7a3a..b21d567 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -150,6 +150,7 @@ static inline int ip_mroute_opt(int opt) extern int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); +extern int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); extern int ip_mr_init(void); #else static inline diff --git a/include/net/sock.h b/include/net/sock.h index d884d26..bc1cf7d8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -753,6 +753,8 @@ struct proto { int level, int optname, char __user *optval, int __user *option); + int (*compat_ioctl)(struct sock *sk, + unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); -- cgit v1.1 From 78c6e170badd22c86a5b50a7eb038a02024b8f03 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 31 Jan 2011 10:48:04 +0000 Subject: drm/i915: Suppress spurious vblank interrupts Hugh Dickins found that characters in xterm were going missing and oft delayed. Being the curious type, he managed to associate this with the new high-precision vblank patches; disabling these he found, restored the orderliness of his characters. The oddness begins when one realised that Hugh was not using vblanks at all on his system (fvwm and some xterms). Instead, all he had to go on were warning of a pipe underrun, curiously enough at around 60Hz. He poked and found that in addition to the underrun warning, the hardware was flagging the start of a new frame, a vblank, which in turn was kicking off the pending vblank processing code. There is little we can do for the underruns on Hugh's machine, a Crestline [965GM], which must have its FIFO watermarks set to 8. However, we do not need to process the vblank if we know that they are disabled... Reported-by: Hugh Dickins Signed-off-by: Chris Wilson --- include/drm/drmP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drmP.h b/include/drm/drmP.h index a4694c6..fe29aad 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -1367,7 +1367,7 @@ extern int drm_vblank_wait(struct drm_device *dev, unsigned int *vbl_seq); extern u32 drm_vblank_count(struct drm_device *dev, int crtc); extern u32 drm_vblank_count_and_time(struct drm_device *dev, int crtc, struct timeval *vblanktime); -extern void drm_handle_vblank(struct drm_device *dev, int crtc); +extern bool drm_handle_vblank(struct drm_device *dev, int crtc); extern int drm_vblank_get(struct drm_device *dev, int crtc); extern void drm_vblank_put(struct drm_device *dev, int crtc); extern void drm_vblank_off(struct drm_device *dev, int crtc); -- cgit v1.1 From ffbbf2da9e578dc7b7ae4f945412c4b74f54b20e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 31 Jan 2011 09:29:14 -0800 Subject: kernel.h: fix kernel-doc warning Fix kernel-doc warning in kernel.h from commit 7ef88ad56145 ("BUILD_BUG_ON: make it handle more cases"): Warning(include/linux/kernel.h:605): No description found for parameter 'condition' Warning(include/linux/kernel.h:605): Excess function parameter 'cond' description in 'BUILD_BUG_ON' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e2f4d6a..2fe6e84 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -588,7 +588,7 @@ struct sysinfo { /** * BUILD_BUG_ON - break compile if a condition is true. - * @cond: the condition which the compiler should know is false. + * @condition: the condition which the compiler should know is false. * * If you have some code which relies on certain constants being equal, or * other compile-time-evaluated condition, you should use BUILD_BUG_ON to -- cgit v1.1 From 3db7e93d3308fb882884b9f024235d6fbf542034 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 1 Feb 2011 16:06:30 +0100 Subject: netfilter: ecache: always set events bits, filter them later For the following rule: iptables -I PREROUTING -t raw -j CT --ctevents assured The event delivered looks like the following: [UPDATE] tcp 6 src=192.168.0.2 dst=192.168.1.2 sport=37041 dport=80 src=192.168.1.2 dst=192.168.1.100 sport=80 dport=37041 [ASSURED] Note that the TCP protocol state is not included. For that reason the CT event filtering is not very useful for conntrackd. To resolve this issue, instead of conditionally setting the CT events bits based on the ctmask, we always set them and perform the filtering in the late stage, just before the delivery. Thus, the event delivered looks like the following: [UPDATE] tcp 6 432000 ESTABLISHED src=192.168.0.2 dst=192.168.1.2 sport=37041 dport=80 src=192.168.1.2 dst=192.168.1.100 sport=80 dport=37041 [ASSURED] Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_ecache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 96ba5f7..349cefe 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -77,9 +77,6 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) if (e == NULL) return; - if (!(e->ctmask & (1 << event))) - return; - set_bit(event, &e->cache); } -- cgit v1.1 From 63a507800c8aca5a1891d598ae13f829346e8e39 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 1 Feb 2011 19:06:46 -0500 Subject: drm/radeon: remove 0x4243 pci id 0x4243 is a PCI bridge, not a GPU. Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=33815 Signed-off-by: Alex Deucher Cc: stable@kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index fe29ae3..5ff1194 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -28,7 +28,6 @@ {0x1002, 0x4156, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV350}, \ {0x1002, 0x4237, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS200|RADEON_IS_IGP}, \ {0x1002, 0x4242, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R200}, \ - {0x1002, 0x4243, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R200}, \ {0x1002, 0x4336, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS100|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ {0x1002, 0x4337, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS200|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ {0x1002, 0x4437, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS200|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ -- cgit v1.1 From 19942822df65ee4a47c2e6d6d70cace1b7f01710 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:43 -0800 Subject: memcg: prevent endless loop when charging huge pages to near-limit group If reclaim after a failed charging was unsuccessful, the limits are checked again, just in case they settled by means of other tasks. This is all fine as long as every charge is of size PAGE_SIZE, because in that case, being below the limit means having at least PAGE_SIZE bytes available. But with transparent huge pages, we may end up in an endless loop where charging and reclaim fail, but we keep going because the limits are not yet exceeded, although not allowing for a huge page. Fix this up by explicitely checking for enough room, not just whether we are within limits. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index fcb9884..a5930cb 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -182,6 +182,26 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt) return ret; } +/** + * res_counter_check_margin - check if the counter allows charging + * @cnt: the resource counter to check + * @bytes: the number of bytes to check the remaining space against + * + * Returns a boolean value on whether the counter can be charged + * @bytes or whether this would exceed the limit. + */ +static inline bool res_counter_check_margin(struct res_counter *cnt, + unsigned long bytes) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->limit - cnt->usage >= bytes; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt) { bool ret; -- cgit v1.1 From 1a44bc8c7cfe69756a116d38aef992d50fc1969d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 Feb 2011 15:52:46 -0800 Subject: vfs: sparse: remove a warning on OPEN_FMODE() AND-ing FMODE_* constant with normal integer results in following sparse warnings. Fix it. fs/open.c:662:21: warning: restricted fmode_t degrades to integer fs/anon_inodes.c:123:34: warning: restricted fmode_t degrades to integer Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 32b38cd..675678a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2555,9 +2555,11 @@ int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); +#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) + #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & FMODE_NONOTIFY))) + (flag & __FMODE_NONOTIFY))) #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ -- cgit v1.1 From 3cd90ea42f2c15f928b70ed66f6d8ed0a8e7aadd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 1 Feb 2011 15:52:46 -0800 Subject: vfs: sparse: add __FMODE_EXEC FMODE_EXEC is a constant type of fmode_t but was used with normal integer constants. This results in following warnings from sparse. Fix it using new macro __FMODE_EXEC. fs/exec.c:116:58: warning: restricted fmode_t degrades to integer fs/exec.c:689:58: warning: restricted fmode_t degrades to integer fs/fcntl.c:777:9: warning: restricted fmode_t degrades to integer Signed-off-by: Namhyung Kim Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 675678a..bd32159 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2555,6 +2555,7 @@ int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); +#define __FMODE_EXEC ((__force int) FMODE_EXEC) #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -- cgit v1.1 From e4a9ea5ee7c8812a7bf0c3fb725ceeaa3d4c2fcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 09:15:30 -0500 Subject: tracing: Replace trace_event struct array with pointer array Currently the trace_event structures are placed in the _ftrace_events section, and at link time, the linker makes one large array of all the trace_event structures. On boot up, this array is read (much like the initcall sections) and the events are processed. The problem is that there is no guarantee that gcc will place complex structures nicely together in an array format. Two structures in the same file may be placed awkwardly, because gcc has no clue that they are suppose to be in an array. A hack was used previous to force the alignment to 4, to pack the structures together. But this caused alignment issues with other architectures (sparc). Instead of packing the structures into an array, the structures' addresses are now put into the _ftrace_event section. As pointers are always the natural alignment, gcc should always pack them tightly together (otherwise initcall, extable, etc would also fail). By having the pointers to the structures in the section, we can still iterate the trace_events without causing unnecessary alignment problems with other architectures, or depending on the current behaviour of gcc that will likely change in the future just to tick us kernel developers off a little more. The _ftrace_event section is also moved into the .init.data section as it is now only needed at boot up. Suggested-by: David Miller Cc: Mathieu Desnoyers Acked-by: David S. Miller Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 7 +++---- include/linux/module.h | 2 +- include/linux/syscalls.h | 10 ++++++---- include/trace/ftrace.h | 24 +++++++++++++----------- 4 files changed, 23 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6ebb810..f53708b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -124,7 +124,8 @@ #endif #ifdef CONFIG_EVENT_TRACING -#define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ +#define FTRACE_EVENTS() . = ALIGN(8); \ + VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; #else @@ -179,9 +180,6 @@ TRACE_PRINTKS() \ \ STRUCT_ALIGN(); \ - FTRACE_EVENTS() \ - \ - STRUCT_ALIGN(); \ TRACE_SYSCALLS() /* @@ -482,6 +480,7 @@ KERNEL_CTORS() \ *(.init.rodata) \ MCOUNT_REC() \ + FTRACE_EVENTS() \ DEV_DISCARD(init.rodata) \ CPU_DISCARD(init.rodata) \ MEM_DISCARD(init.rodata) \ diff --git a/include/linux/module.h b/include/linux/module.h index e7c6385..7695a30 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -389,7 +389,7 @@ struct module unsigned int num_trace_bprintk_fmt; #endif #ifdef CONFIG_EVENT_TRACING - struct ftrace_event_call *trace_events; + struct ftrace_event_call **trace_events; unsigned int num_trace_events; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 18cd068..45508fe 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -128,28 +128,30 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata \ __attribute__((__aligned__(4))) __syscall_meta_##sname; \ static struct ftrace_event_call __used \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_events"))) \ event_enter_##sname = { \ .name = "sys_enter"#sname, \ .class = &event_class_syscall_enter, \ .event.funcs = &enter_syscall_print_funcs, \ .data = (void *)&__syscall_meta_##sname,\ }; \ + static struct ftrace_event_call __used \ + __attribute__((section("_ftrace_events"))) \ + *__event_enter_##sname = &event_enter_##sname; \ __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY) #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct syscall_metadata \ __attribute__((__aligned__(4))) __syscall_meta_##sname; \ static struct ftrace_event_call __used \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_events"))) \ event_exit_##sname = { \ .name = "sys_exit"#sname, \ .class = &event_class_syscall_exit, \ .event.funcs = &exit_syscall_print_funcs, \ .data = (void *)&__syscall_meta_##sname,\ }; \ + static struct ftrace_event_call __used \ + __attribute__((section("_ftrace_events"))) \ + *__event_exit_##sname = &event_exit_##sname; \ __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY) #define SYSCALL_METADATA(sname, nb) \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index e16610c..3e68366 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -446,14 +446,16 @@ static inline notrace int ftrace_get_offsets_##call( \ * .reg = ftrace_event_reg, * }; * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { + * static struct ftrace_event_call event_ = { * .name = "", * .class = event_class_