summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS5
-rw-r--r--arch/arm/mach-dove/common.c4
-rw-r--r--arch/arm/mach-tegra/include/mach/sdhci.h29
-rw-r--r--arch/microblaze/Kconfig.debug4
-rw-r--r--arch/microblaze/Makefile2
-rw-r--r--arch/microblaze/configs/mmu_defconfig1
-rw-r--r--arch/microblaze/include/asm/pvr.h185
-rw-r--r--arch/microblaze/kernel/cpu/cpuinfo.c1
-rw-r--r--arch/microblaze/kernel/entry.S46
-rw-r--r--arch/microblaze/kernel/exceptions.c3
-rw-r--r--arch/microblaze/kernel/hw_exception_handler.S9
-rw-r--r--arch/microblaze/kernel/prom.c4
-rw-r--r--arch/microblaze/kernel/vmlinux.lds.S16
-rw-r--r--arch/microblaze/lib/memmove.c2
-rw-r--r--arch/microblaze/lib/muldi3.S121
-rw-r--r--arch/microblaze/lib/muldi3.c60
-rw-r--r--arch/x86/include/asm/acpi.h11
-rw-r--r--arch/x86/include/asm/amd_nb.h13
-rw-r--r--arch/x86/include/asm/fixmap.h4
-rw-r--r--arch/x86/include/asm/gpio.h5
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/mach_traps.h12
-rw-r--r--arch/x86/include/asm/nmi.h20
-rw-r--r--arch/x86/include/asm/numa_64.h2
-rw-r--r--arch/x86/include/asm/perf_event_p4.h3
-rw-r--r--arch/x86/kernel/amd_nb.c7
-rw-r--r--arch/x86/kernel/aperture_64.c44
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c5
-rw-r--r--arch/x86/kernel/cpu/perf_event.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c28
-rw-r--r--arch/x86/kernel/dumpstack.c6
-rw-r--r--arch/x86/kernel/entry_64.S36
-rw-r--r--arch/x86/kernel/kgdb.c7
-rw-r--r--arch/x86/kernel/reboot.c5
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/kernel/traps.c102
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/mm/amdtopology_64.c86
-rw-r--r--arch/x86/mm/numa_64.c157
-rw-r--r--arch/x86/mm/srat_64.c26
-rw-r--r--arch/x86/oprofile/nmi_int.c3
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c2
-rw-r--r--arch/x86/pci/amd_bus.c33
-rw-r--r--drivers/char/ipmi/ipmi_watchdog.c2
-rw-r--r--drivers/mfd/sh_mobile_sdhi.c6
-rw-r--r--drivers/mmc/card/Kconfig1
-rw-r--r--drivers/mmc/core/Kconfig11
-rw-r--r--drivers/mmc/core/bus.c8
-rw-r--r--drivers/mmc/core/core.c206
-rw-r--r--drivers/mmc/core/core.h9
-rw-r--r--drivers/mmc/core/debugfs.c5
-rw-r--r--drivers/mmc/core/host.c206
-rw-r--r--drivers/mmc/core/host.h21
-rw-r--r--drivers/mmc/core/mmc.c91
-rw-r--r--drivers/mmc/core/mmc_ops.c101
-rw-r--r--drivers/mmc/core/mmc_ops.h1
-rw-r--r--drivers/mmc/core/sd.c16
-rw-r--r--drivers/mmc/core/sdio.c36
-rw-r--r--drivers/mmc/core/sdio_bus.c32
-rw-r--r--drivers/mmc/host/Kconfig37
-rw-r--r--drivers/mmc/host/Makefile3
-rw-r--r--drivers/mmc/host/davinci_mmc.c80
-rw-r--r--drivers/mmc/host/dw_mmc.c1796
-rw-r--r--drivers/mmc/host/dw_mmc.h168
-rw-r--r--drivers/mmc/host/mxcmmc.c53
-rw-r--r--drivers/mmc/host/sdhci-dove.c70
-rw-r--r--drivers/mmc/host/sdhci-pci.c161
-rw-r--r--drivers/mmc/host/sdhci-pltfm.c6
-rw-r--r--drivers/mmc/host/sdhci-pltfm.h2
-rw-r--r--drivers/mmc/host/sdhci-s3c.c66
-rw-r--r--drivers/mmc/host/sdhci-tegra.c257
-rw-r--r--drivers/mmc/host/sdhci.c45
-rw-r--r--drivers/mmc/host/sdhci.h3
-rw-r--r--drivers/mmc/host/tmio_mmc.c561
-rw-r--r--drivers/mmc/host/tmio_mmc.h228
-rw-r--r--drivers/rtc/class.c13
-rw-r--r--drivers/rtc/interface.c574
-rw-r--r--drivers/rtc/rtc-dev.c104
-rw-r--r--drivers/rtc/rtc-lib.c28
-rw-r--r--drivers/watchdog/hpwdt.c2
-rw-r--r--fs/ocfs2/Kconfig2
-rw-r--r--fs/ocfs2/alloc.c77
-rw-r--r--fs/ocfs2/alloc.h4
-rw-r--r--fs/ocfs2/aops.c59
-rw-r--r--fs/ocfs2/cluster/heartbeat.c246
-rw-r--r--fs/ocfs2/cluster/netdebug.c286
-rw-r--r--fs/ocfs2/cluster/tcp.c145
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h33
-rw-r--r--fs/ocfs2/dlm/dlmast.c76
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h86
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c200
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h5
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c10
-rw-r--r--fs/ocfs2/dlm/dlmlock.c3
-rw-r--r--fs/ocfs2/dlm/dlmthread.c132
-rw-r--r--fs/ocfs2/namei.c5
-rw-r--r--fs/ocfs2/ocfs2.h5
-rw-r--r--fs/xfs/linux-2.6/sv.h59
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c425
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h16
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c235
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h22
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c22
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c92
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h59
-rw-r--r--fs/xfs/quota/xfs_dquot.c1
-rw-r--r--fs/xfs/xfs_ag.h2
-rw-r--r--fs/xfs/xfs_alloc.c351
-rw-r--r--fs/xfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/xfs_btree.c9
-rw-r--r--fs/xfs/xfs_buf_item.c32
-rw-r--r--fs/xfs/xfs_buf_item.h11
-rw-r--r--fs/xfs/xfs_extfree_item.c97
-rw-r--r--fs/xfs/xfs_extfree_item.h11
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_iget.c79
-rw-r--r--fs/xfs/xfs_inode.c54
-rw-r--r--fs/xfs/xfs_inode.h15
-rw-r--r--fs/xfs/xfs_inode_item.c90
-rw-r--r--fs/xfs/xfs_iomap.c233
-rw-r--r--fs/xfs/xfs_iomap.h27
-rw-r--r--fs/xfs/xfs_log.c739
-rw-r--r--fs/xfs/xfs_log_cil.c17
-rw-r--r--fs/xfs/xfs_log_priv.h127
-rw-r--r--fs/xfs/xfs_log_recover.c620
-rw-r--r--fs/xfs/xfs_mount.c23
-rw-r--r--fs/xfs/xfs_mount.h14
-rw-r--r--fs/xfs/xfs_trans.c79
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c232
-rw-r--r--fs/xfs/xfs_trans_extfree.c8
-rw-r--r--fs/xfs/xfs_trans_priv.h35
-rw-r--r--fs/xfs/xfs_vnodeops.c61
-rw-r--r--include/linux/dynamic_debug.h18
-rw-r--r--include/linux/mfd/tmio.h5
-rw-r--r--include/linux/mmc/dw_mmc.h217
-rw-r--r--include/linux/mmc/host.h19
-rw-r--r--include/linux/mmc/mmc.h2
-rw-r--r--include/linux/mmc/sdhci.h6
-rw-r--r--include/linux/pci_ids.h8
-rw-r--r--include/linux/rtc.h51
-rw-r--r--include/linux/tracepoint.h4
-rw-r--r--include/trace/define_trace.h10
-rw-r--r--include/trace/events/skb.h4
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/exit.c14
-rw-r--r--kernel/perf_event.c82
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/trace.c6
-rw-r--r--lib/dynamic_debug.c9
-rw-r--r--tools/perf/Makefile2
-rw-r--r--tools/perf/builtin-record.c3
-rw-r--r--tools/perf/builtin-sched.c5
-rw-r--r--tools/perf/builtin-stat.c5
-rw-r--r--tools/perf/builtin-test.c116
-rw-r--r--tools/perf/builtin-top.c2
-rw-r--r--tools/perf/util/evsel.c87
-rw-r--r--tools/perf/util/evsel.h2
-rw-r--r--tools/perf/util/parse-events.c74
-rw-r--r--tools/perf/util/session.c2
165 files changed, 8304 insertions, 3880 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index bb6c1ac..42f991e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1785,7 +1785,8 @@ S: Maintained
F: drivers/usb/atm/cxacru.c
CONFIGFS
-M: Joel Becker <joel.becker@oracle.com>
+M: Joel Becker <jlbec@evilplan.org>
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/configfs.git
S: Supported
F: fs/configfs/
F: include/linux/configfs.h
@@ -4549,7 +4550,7 @@ F: include/linux/oprofile.h
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
M: Mark Fasheh <mfasheh@suse.com>
-M: Joel Becker <joel.becker@oracle.com>
+M: Joel Becker <jlbec@evilplan.org>
L: ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
W: http://oss.oracle.com/projects/ocfs2/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2.git
diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c
index f7a1258..fe627ab 100644
--- a/arch/arm/mach-dove/common.c
+++ b/arch/arm/mach-dove/common.c
@@ -770,7 +770,7 @@ static struct resource dove_sdio0_resources[] = {
};
static struct platform_device dove_sdio0 = {
- .name = "sdhci-mv",
+ .name = "sdhci-dove",
.id = 0,
.dev = {
.dma_mask = &sdio_dmamask,
@@ -798,7 +798,7 @@ static struct resource dove_sdio1_resources[] = {
};
static struct platform_device dove_sdio1 = {
- .name = "sdhci-mv",
+ .name = "sdhci-dove",
.id = 1,
.dev = {
.dma_mask = &sdio_dmamask,
diff --git a/arch/arm/mach-tegra/include/mach/sdhci.h b/arch/arm/mach-tegra/include/mach/sdhci.h
new file mode 100644
index 0000000..3ad086e
--- /dev/null
+++ b/arch/arm/mach-tegra/include/mach/sdhci.h
@@ -0,0 +1,29 @@
+/*
+ * include/asm-arm/arch-tegra/include/mach/sdhci.h
+ *
+ * Copyright (C) 2009 Palm, Inc.
+ * Author: Yvonne Yip <y@palm.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef __ASM_ARM_ARCH_TEGRA_SDHCI_H
+#define __ASM_ARM_ARCH_TEGRA_SDHCI_H
+
+#include <linux/mmc/host.h>
+
+struct tegra_sdhci_platform_data {
+ int cd_gpio;
+ int wp_gpio;
+ int power_gpio;
+ int is_8bit;
+};
+
+#endif
diff --git a/arch/microblaze/Kconfig.debug b/arch/microblaze/Kconfig.debug
index e66e25c..012e377 100644
--- a/arch/microblaze/Kconfig.debug
+++ b/arch/microblaze/Kconfig.debug
@@ -23,8 +23,4 @@ config HEART_BEAT
This option turns on/off heart beat kernel functionality.
First GPIO node is taken.
-config DEBUG_BOOTMEM
- depends on DEBUG_KERNEL
- bool "Debug BOOTMEM initialization"
-
endmenu
diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile
index 15f1f1d..6f432e6 100644
--- a/arch/microblaze/Makefile
+++ b/arch/microblaze/Makefile
@@ -17,7 +17,7 @@ export CPU_VER CPU_MAJOR CPU_MINOR CPU_REV
# The various CONFIG_XILINX cpu features options are integers 0/1/2...
# rather than bools y/n
-# Work out HW multipler support. This is icky.
+# Work out HW multipler support. This is tricky.
# 1. Spartan2 has no HW multiplers.
# 2. MicroBlaze v3.x always uses them, except in Spartan 2
# 3. All other FPGa/CPU ver combos, we can trust the CONFIG_ settings
diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig
index 8b422b1..ab8fbe7 100644
--- a/arch/microblaze/configs/mmu_defconfig
+++ b/arch/microblaze/configs/mmu_defconfig
@@ -66,5 +66,4 @@ CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_INFO=y
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
CONFIG_EARLY_PRINTK=y
-CONFIG_DEBUG_BOOTMEM=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/microblaze/include/asm/pvr.h b/arch/microblaze/include/asm/pvr.h
index 37db96a..a10bec6 100644
--- a/arch/microblaze/include/asm/pvr.h
+++ b/arch/microblaze/include/asm/pvr.h
@@ -1,9 +1,9 @@
/*
* Support for the MicroBlaze PVR (Processor Version Register)
*
- * Copyright (C) 2009 Michal Simek <monstr@monstr.eu>
+ * Copyright (C) 2009 - 2011 Michal Simek <monstr@monstr.eu>
* Copyright (C) 2007 John Williams <john.williams@petalogix.com>
- * Copyright (C) 2007 - 2009 PetaLogix
+ * Copyright (C) 2007 - 2011 PetaLogix
*
* This file is subject to the terms and conditions of the GNU General
* Public License. See the file COPYING in the main directory of this
@@ -46,11 +46,11 @@ struct pvr_s {
#define PVR2_I_LMB_MASK 0x10000000
#define PVR2_INTERRUPT_IS_EDGE_MASK 0x08000000
#define PVR2_EDGE_IS_POSITIVE_MASK 0x04000000
-#define PVR2_D_PLB_MASK 0x02000000 /* new */
-#define PVR2_I_PLB_MASK 0x01000000 /* new */
-#define PVR2_INTERCONNECT 0x00800000 /* new */
-#define PVR2_USE_EXTEND_FSL 0x00080000 /* new */
-#define PVR2_USE_FSL_EXC 0x00040000 /* new */
+#define PVR2_D_PLB_MASK 0x02000000 /* new */
+#define PVR2_I_PLB_MASK 0x01000000 /* new */
+#define PVR2_INTERCONNECT 0x00800000 /* new */
+#define PVR2_USE_EXTEND_FSL 0x00080000 /* new */
+#define PVR2_USE_FSL_EXC 0x00040000 /* new */
#define PVR2_USE_MSR_INSTR 0x00020000
#define PVR2_USE_PCMP_INSTR 0x00010000
#define PVR2_AREA_OPTIMISED 0x00008000
@@ -59,7 +59,7 @@ struct pvr_s {
#define PVR2_USE_HW_MUL_MASK 0x00001000
#define PVR2_USE_FPU_MASK 0x00000800
#define PVR2_USE_MUL64_MASK 0x00000400
-#define PVR2_USE_FPU2_MASK 0x00000200 /* new */
+#define PVR2_USE_FPU2_MASK 0x00000200 /* new */
#define PVR2_USE_IPLBEXC 0x00000100
#define PVR2_USE_DPLBEXC 0x00000080
#define PVR2_OPCODE_0x0_ILL_MASK 0x00000040
@@ -122,96 +122,103 @@ struct pvr_s {
/* PVR access macros */
-#define PVR_IS_FULL(pvr) (pvr.pvr[0] & PVR0_PVR_FULL_MASK)
-#define PVR_USE_BARREL(pvr) (pvr.pvr[0] & PVR0_USE_BARREL_MASK)
-#define PVR_USE_DIV(pvr) (pvr.pvr[0] & PVR0_USE_DIV_MASK)
-#define PVR_USE_HW_MUL(pvr) (pvr.pvr[0] & PVR0_USE_HW_MUL_MASK)
-#define PVR_USE_FPU(pvr) (pvr.pvr[0] & PVR0_USE_FPU_MASK)
-#define PVR_USE_FPU2(pvr) (pvr.pvr[2] & PVR2_USE_FPU2_MASK)
-#define PVR_USE_ICACHE(pvr) (pvr.pvr[0] & PVR0_USE_ICACHE_MASK)
-#define PVR_USE_DCACHE(pvr) (pvr.pvr[0] & PVR0_USE_DCACHE_MASK)
-#define PVR_VERSION(pvr) ((pvr.pvr[0] & PVR0_VERSION_MASK) >> 8)
-#define PVR_USER1(pvr) (pvr.pvr[0] & PVR0_USER1_MASK)
-#define PVR_USER2(pvr) (pvr.pvr[1] & PVR1_USER2_MASK)
-
-#define PVR_D_OPB(pvr) (pvr.pvr[2] & PVR2_D_OPB_MASK)
-#define PVR_D_LMB(pvr) (pvr.pvr[2] & PVR2_D_LMB_MASK)
-#define PVR_I_OPB(pvr) (pvr.pvr[2] & PVR2_I_OPB_MASK)
-#define PVR_I_LMB(pvr) (pvr.pvr[2] & PVR2_I_LMB_MASK)
-#define PVR_INTERRUPT_IS_EDGE(pvr) \
- (pvr.pvr[2] & PVR2_INTERRUPT_IS_EDGE_MASK)
-#define PVR_EDGE_IS_POSITIVE(pvr) \
- (pvr.pvr[2] & PVR2_EDGE_IS_POSITIVE_MASK)
-#define PVR_USE_MSR_INSTR(pvr) (pvr.pvr[2] & PVR2_USE_MSR_INSTR)
-#define PVR_USE_PCMP_INSTR(pvr) (pvr.pvr[2] & PVR2_USE_PCMP_INSTR)
-#define PVR_AREA_OPTIMISED(pvr) (pvr.pvr[2] & PVR2_AREA_OPTIMISED)
-#define PVR_USE_MUL64(pvr) (pvr.pvr[2] & PVR2_USE_MUL64_MASK)
-#define PVR_OPCODE_0x0_ILLEGAL(pvr) \
- (pvr.pvr[2] & PVR2_OPCODE_0x0_ILL_MASK)
-#define PVR_UNALIGNED_EXCEPTION(pvr) \
- (pvr.pvr[2] & PVR2_UNALIGNED_EXC_MASK)
-#define PVR_ILL_OPCODE_EXCEPTION(pvr) \
- (pvr.pvr[2] & PVR2_ILL_OPCODE_EXC_MASK)
-#define PVR_IOPB_BUS_EXCEPTION(pvr) \
- (pvr.pvr[2] & PVR2_IOPB_BUS_EXC_MASK)
-#define PVR_DOPB_BUS_EXCEPTION(pvr) \
- (pvr.pvr[2] & PVR2_DOPB_BUS_EXC_MASK)
-#define PVR_DIV_ZERO_EXCEPTION(pvr) \
- (pvr.pvr[2] & PVR2_DIV_ZERO_EXC_MASK)
-#define PVR_FPU_EXCEPTION(pvr) (pvr.pvr[2] & PVR2_FPU_EXC_MASK)
-#define PVR_FSL_EXCEPTION(pvr) (pvr.pvr[2] & PVR2_USE_EXTEND_FSL)
-
-#define PVR_DEBUG_ENABLED(pvr) (pvr.pvr[3] & PVR3_DEBUG_ENABLED_MASK)
-#define PVR_NUMBER_OF_PC_BRK(pvr) \
- ((pvr.pvr[3] & PVR3_NUMBER_OF_PC_BRK_MASK) >> 25)
-#define PVR_NUMBER_OF_RD_ADDR_BRK(pvr) \
- ((pvr.pvr[3] & PVR3_NUMBER_OF_RD_ADDR_BRK_MASK) >> 19)
-#define PVR_NUMBER_OF_WR_ADDR_BRK(pvr) \
- ((pvr.pvr[3] & PVR3_NUMBER_OF_WR_ADDR_BRK_MASK) >> 13)
-#define PVR_FSL_LINKS(pvr) ((pvr.pvr[3] & PVR3_FSL_LINKS_MASK) >> 7)
-
-#define PVR_ICACHE_ADDR_TAG_BITS(pvr) \
- ((pvr.pvr[4] & PVR4_ICACHE_ADDR_TAG_BITS_MASK) >> 26)
-#define PVR_ICACHE_USE_FSL(pvr) (pvr.pvr[4] & PVR4_ICACHE_USE_FSL_MASK)
-#define PVR_ICACHE_ALLOW_WR(pvr) (pvr.pvr[4] & PVR4_ICACHE_ALLOW_WR_MASK)
-#define PVR_ICACHE_LINE_LEN(pvr) \
- (1 << ((pvr.pvr[4] & PVR4_ICACHE_LINE_LEN_MASK) >> 21))
-#define PVR_ICACHE_BYTE_SIZE(pvr) \
- (1 << ((pvr.pvr[4] & PVR4_ICACHE_BYTE_SIZE_MASK) >> 16))
-
-#define PVR_DCACHE_ADDR_TAG_BITS(pvr) \
- ((pvr.pvr[5] & PVR5_DCACHE_ADDR_TAG_BITS_MASK) >> 26)
-#define PVR_DCACHE_USE_FSL(pvr) (pvr.pvr[5] & PVR5_DCACHE_USE_FSL_MASK)
-#define PVR_DCACHE_ALLOW_WR(pvr) (pvr.pvr[5] & PVR5_DCACHE_ALLOW_WR_MASK)
+#define PVR_IS_FULL(_pvr) (_pvr.pvr[0] & PVR0_PVR_FULL_MASK)
+#define PVR_USE_BARREL(_pvr) (_pvr.pvr[0] & PVR0_USE_BARREL_MASK)
+#define PVR_USE_DIV(_pvr) (_pvr.pvr[0] & PVR0_USE_DIV_MASK)
+#define PVR_USE_HW_MUL(_pvr) (_pvr.pvr[0] & PVR0_USE_HW_MUL_MASK)
+#define PVR_USE_FPU(_pvr) (_pvr.pvr[0] & PVR0_USE_FPU_MASK)
+#define PVR_USE_FPU2(_pvr) (_pvr.pvr[2] & PVR2_USE_FPU2_MASK)
+#define PVR_USE_ICACHE(_pvr) (_pvr.pvr[0] & PVR0_USE_ICACHE_MASK)
+#define PVR_USE_DCACHE(_pvr) (_pvr.pvr[0] & PVR0_USE_DCACHE_MASK)
+#define PVR_VERSION(_pvr) ((_pvr.pvr[0] & PVR0_VERSION_MASK) >> 8)
+#define PVR_USER1(_pvr) (_pvr.pvr[0] & PVR0_USER1_MASK)
+#define PVR_USER2(_pvr) (_pvr.pvr[1] & PVR1_USER2_MASK)
+
+#define PVR_D_OPB(_pvr) (_pvr.pvr[2] & PVR2_D_OPB_MASK)
+#define PVR_D_LMB(_pvr) (_pvr.pvr[2] & PVR2_D_LMB_MASK)
+#define PVR_I_OPB(_pvr) (_pvr.pvr[2] & PVR2_I_OPB_MASK)
+#define PVR_I_LMB(_pvr) (_pvr.pvr[2] & PVR2_I_LMB_MASK)
+#define PVR_INTERRUPT_IS_EDGE(_pvr) \
+ (_pvr.pvr[2] & PVR2_INTERRUPT_IS_EDGE_MASK)
+#define PVR_EDGE_IS_POSITIVE(_pvr) \
+ (_pvr.pvr[2] & PVR2_EDGE_IS_POSITIVE_MASK)
+#define PVR_USE_MSR_INSTR(_pvr) (_pvr.pvr[2] & PVR2_USE_MSR_INSTR)
+#define PVR_USE_PCMP_INSTR(_pvr) (_pvr.pvr[2] & PVR2_USE_PCMP_INSTR)
+#define PVR_AREA_OPTIMISED(_pvr) (_pvr.pvr[2] & PVR2_AREA_OPTIMISED)
+#define PVR_USE_MUL64(_pvr) (_pvr.pvr[2] & PVR2_USE_MUL64_MASK)
+#define PVR_OPCODE_0x0_ILLEGAL(_pvr) \
+ (_pvr.pvr[2] & PVR2_OPCODE_0x0_ILL_MASK)
+#define PVR_UNALIGNED_EXCEPTION(_pvr) \
+ (_pvr.pvr[2] & PVR2_UNALIGNED_EXC_MASK)
+#define PVR_ILL_OPCODE_EXCEPTION(_pvr) \
+ (_pvr.pvr[2] & PVR2_ILL_OPCODE_EXC_MASK)
+#define PVR_IOPB_BUS_EXCEPTION(_pvr) \
+ (_pvr.pvr[2] & PVR2_IOPB_BUS_EXC_MASK)
+#define PVR_DOPB_BUS_EXCEPTION(_pvr) \
+ (_pvr.pvr[2] & PVR2_DOPB_BUS_EXC_MASK)
+#define PVR_DIV_ZERO_EXCEPTION(_pvr) \
+ (_pvr.pvr[2] & PVR2_DIV_ZERO_EXC_MASK)
+#define PVR_FPU_EXCEPTION(_pvr) (_pvr.pvr[2] & PVR2_FPU_EXC_MASK)
+#define PVR_FSL_EXCEPTION(_pvr) (_pvr.pvr[2] & PVR2_USE_EXTEND_FSL)
+
+#define PVR_DEBUG_ENABLED(_pvr) (_pvr.pvr[3] & PVR3_DEBUG_ENABLED_MASK)
+#define PVR_NUMBER_OF_PC_BRK(_pvr) \
+ ((_pvr.pvr[3] & PVR3_NUMBER_OF_PC_BRK_MASK) >> 25)
+#define PVR_NUMBER_OF_RD_ADDR_BRK(_pvr) \
+ ((_pvr.pvr[3] & PVR3_NUMBER_OF_RD_ADDR_BRK_MASK) >> 19)
+#define PVR_NUMBER_OF_WR_ADDR_BRK(_pvr) \
+ ((_pvr.pvr[3] & PVR3_NUMBER_OF_WR_ADDR_BRK_MASK) >> 13)
+#define PVR_FSL_LINKS(_pvr) ((_pvr.pvr[3] & PVR3_FSL_LINKS_MASK) >> 7)
+
+#define PVR_ICACHE_ADDR_TAG_BITS(_pvr) \
+ ((_pvr.pvr[4] & PVR4_ICACHE_ADDR_TAG_BITS_MASK) >> 26)
+#define PVR_ICACHE_USE_FSL(_pvr) \
+ (_pvr.pvr[4] & PVR4_ICACHE_USE_FSL_MASK)
+#define PVR_ICACHE_ALLOW_WR(_pvr) \
+ (_pvr.pvr[4] & PVR4_ICACHE_ALLOW_WR_MASK)
+#define PVR_ICACHE_LINE_LEN(_pvr) \
+ (1 << ((_pvr.pvr[4] & PVR4_ICACHE_LINE_LEN_MASK) >> 21))
+#define PVR_ICACHE_BYTE_SIZE(_pvr) \
+ (1 << ((_pvr.pvr[4] & PVR4_ICACHE_BYTE_SIZE_MASK) >> 16))
+
+#define PVR_DCACHE_ADDR_TAG_BITS(_pvr) \
+ ((_pvr.pvr[5] & PVR5_DCACHE_ADDR_TAG_BITS_MASK) >> 26)
+#define PVR_DCACHE_USE_FSL(_pvr) (_pvr.pvr[5] & PVR5_DCACHE_USE_FSL_MASK)
+#define PVR_DCACHE_ALLOW_WR(_pvr) \
+ (_pvr.pvr[5] & PVR5_DCACHE_ALLOW_WR_MASK)
/* FIXME two shifts on one line needs any comment */
-#define PVR_DCACHE_LINE_LEN(pvr) \
- (1 << ((pvr.pvr[5] & PVR5_DCACHE_LINE_LEN_MASK) >> 21))
-#define PVR_DCACHE_BYTE_SIZE(pvr) \
- (1 << ((pvr.pvr[5] & PVR5_DCACHE_BYTE_SIZE_MASK) >> 16))
+#define PVR_DCACHE_LINE_LEN(_pvr) \
+ (1 << ((_pvr.pvr[5] & PVR5_DCACHE_LINE_LEN_MASK) >> 21))
+#define PVR_DCACHE_BYTE_SIZE(_pvr) \
+ (1 << ((_pvr.pvr[5] & PVR5_DCACHE_BYTE_SIZE_MASK) >> 16))
-#define PVR_DCACHE_USE_WRITEBACK(pvr) \
- ((pvr.pvr[5] & PVR5_DCACHE_USE_WRITEBACK) >> 14)
+#define PVR_DCACHE_USE_WRITEBACK(_pvr) \
+ ((_pvr.pvr[5] & PVR5_DCACHE_USE_WRITEBACK) >> 14)
-#define PVR_ICACHE_BASEADDR(pvr) (pvr.pvr[6] & PVR6_ICACHE_BASEADDR_MASK)
-#define PVR_ICACHE_HIGHADDR(pvr) (pvr.pvr[7] & PVR7_ICACHE_HIGHADDR_MASK)
+#define PVR_ICACHE_BASEADDR(_pvr) \
+ (_pvr.pvr[6] & PVR6_ICACHE_BASEADDR_MASK)
+#define PVR_ICACHE_HIGHADDR(_pvr) \
+ (_pvr.pvr[7] & PVR7_ICACHE_HIGHADDR_MASK)
+#define PVR_DCACHE_BASEADDR(_pvr) \
+ (_pvr.pvr[8] & PVR8_DCACHE_BASEADDR_MASK)
+#define PVR_DCACHE_HIGHADDR(_pvr) \
+ (_pvr.pvr[9] & PVR9_DCACHE_HIGHADDR_MASK)
-#define PVR_DCACHE_BASEADDR(pvr) (pvr.pvr[8] & PVR8_DCACHE_BASEADDR_MASK)
-#define PVR_DCACHE_HIGHADDR(pvr) (pvr.pvr[9] & PVR9_DCACHE_HIGHADDR_MASK)
+#define PVR_TARGET_FAMILY(_pvr) \
+ ((_pvr.pvr[10] & PVR10_TARGET_FAMILY_MASK) >> 24)
-#define PVR_TARGET_FAMILY(pvr) ((pvr.pvr[10] & PVR10_TARGET_FAMILY_MASK) >> 24)
-
-#define PVR_MSR_RESET_VALUE(pvr) \
- (pvr.pvr[11] & PVR11_MSR_RESET_VALUE_MASK)
+#define PVR_MSR_RESET_VALUE(_pvr) \
+ (_pvr.pvr[11] & PVR11_MSR_RESET_VALUE_MASK)
/* mmu */
-#define PVR_USE_MMU(pvr) ((pvr.pvr[11] & PVR11_USE_MMU) >> 30)
-#define PVR_MMU_ITLB_SIZE(pvr) (pvr.pvr[11] & PVR11_MMU_ITLB_SIZE)
-#define PVR_MMU_DTLB_SIZE(pvr) (pvr.pvr[11] & PVR11_MMU_DTLB_SIZE)
-#define PVR_MMU_TLB_ACCESS(pvr) (pvr.pvr[11] & PVR11_MMU_TLB_ACCESS)
-#define PVR_MMU_ZONES(pvr) (pvr.pvr[11] & PVR11_MMU_ZONES)
+#define PVR_USE_MMU(_pvr) ((_pvr.pvr[11] & PVR11_USE_MMU) >> 30)
+#define PVR_MMU_ITLB_SIZE(_pvr) (_pvr.pvr[11] & PVR11_MMU_ITLB_SIZE)
+#define PVR_MMU_DTLB_SIZE(_pvr) (_pvr.pvr[11] & PVR11_MMU_DTLB_SIZE)
+#define PVR_MMU_TLB_ACCESS(_pvr) (_pvr.pvr[11] & PVR11_MMU_TLB_ACCESS)
+#define PVR_MMU_ZONES(_pvr) (_pvr.pvr[11] & PVR11_MMU_ZONES)
/* endian */
-#define PVR_ENDIAN(pvr) (pvr.pvr[0] & PVR0_ENDI)
+#define PVR_ENDIAN(_pvr) (_pvr.pvr[0] & PVR0_ENDI)
int cpu_has_pvr(void);
void get_pvr(struct pvr_s *pvr);
diff --git a/arch/microblaze/kernel/cpu/cpuinfo.c b/arch/microblaze/kernel/cpu/cpuinfo.c
index 87c79fa..2c309fc 100644
--- a/arch/microblaze/kernel/cpu/cpuinfo.c
+++ b/arch/microblaze/kernel/cpu/cpuinfo.c
@@ -32,6 +32,7 @@ const struct cpu_ver_key cpu_ver_lookup[] = {
{"7.30.a", 0x10},
{"7.30.b", 0x11},
{"8.00.a", 0x12},
+ {"8.00.b", 0x13},
{NULL, 0},
};
diff --git a/arch/microblaze/kernel/entry.S b/arch/microblaze/kernel/entry.S
index 819238b..41c30cd 100644
--- a/arch/microblaze/kernel/entry.S
+++ b/arch/microblaze/kernel/entry.S
@@ -287,25 +287,44 @@
* are masked. This is nice, means we don't have to CLI before state save
*/
C_ENTRY(_user_exception):
- addi r14, r14, 4 /* return address is 4 byte after call */
swi r1, r0, TOPHYS(PER_CPU(ENTRY_SP)) /* save stack */
+ addi r14, r14, 4 /* return address is 4 byte after call */
+
+ mfs r1, rmsr
+ nop
+ andi r1, r1, MSR_UMS
+ bnei r1, 1f
+
+/* Kernel-mode state save - kernel execve */
+ lwi r1, r0, TOPHYS(PER_CPU(ENTRY_SP)); /* Reload kernel stack-ptr*/
+ tophys(r1,r1);
+
+ addik r1, r1, -STATE_SAVE_SIZE; /* Make room on the stack. */
+ SAVE_REGS
+ swi r1, r1, PTO + PT_MODE; /* pt_regs -> kernel mode */
+ brid 2f;
+ nop; /* Fill delay slot */
+
+/* User-mode state save. */
+1:
lwi r1, r0, TOPHYS(PER_CPU(CURRENT_SAVE)); /* get saved current */
tophys(r1,r1);
lwi r1, r1, TS_THREAD_INFO; /* get stack from task_struct */
- /* MS these three instructions can be added to one */
- /* addik r1, r1, THREAD_SIZE; */
- /* tophys(r1,r1); */
- /* addik r1, r1, -STATE_SAVE_SIZE; */
- addik r1, r1, THREAD_SIZE + CONFIG_KERNEL_BASE_ADDR - CONFIG_KERNEL_START - STATE_SAVE_SIZE;
+/* calculate kernel stack pointer from task struct 8k */
+ addik r1, r1, THREAD_SIZE;
+ tophys(r1,r1);
+
+ addik r1, r1, -STATE_SAVE_SIZE; /* Make room on the stack. */
SAVE_REGS
swi r0, r1, PTO + PT_R3
swi r0, r1, PTO + PT_R4
+ swi r0, r1, PTO + PT_MODE; /* Was in user-mode. */
lwi r11, r0, TOPHYS(PER_CPU(ENTRY_SP));
swi r11, r1, PTO+PT_R1; /* Store user SP. */
clear_ums;
- lwi CURRENT_TASK, r0, TOPHYS(PER_CPU(CURRENT_SAVE));
+2: lwi CURRENT_TASK, r0, TOPHYS(PER_CPU(CURRENT_SAVE));
/* Save away the syscall number. */
swi r12, r1, PTO+PT_R0;
tovirt(r1,r1)
@@ -375,6 +394,9 @@ C_ENTRY(ret_from_trap):
swi r3, r1, PTO + PT_R3
swi r4, r1, PTO + PT_R4
+ lwi r11, r1, PTO + PT_MODE;
+/* See if returning to kernel mode, if so, skip resched &c. */
+ bnei r11, 2f;
/* We're returning to user mode, so check for various conditions that
* trigger rescheduling. */
/* FIXME: Restructure all these flag checks. */
@@ -417,6 +439,16 @@ C_ENTRY(ret_from_trap):
RESTORE_REGS;
addik r1, r1, STATE_SAVE_SIZE /* Clean up stack space. */
lwi r1, r1, PT_R1 - PT_SIZE;/* Restore user stack pointer. */
+ bri 6f;
+
+/* Return to kernel state. */
+2: set_bip; /* Ints masked for state restore */
+ VM_OFF;
+ tophys(r1,r1);
+ RESTORE_REGS;
+ addik r1, r1, STATE_SAVE_SIZE /* Clean up stack space. */
+ tovirt(r1,r1);
+6:
TRAP_return: /* Make global symbol for debugging */
rtbd r14, 0; /* Instructions to return from an IRQ */
nop;
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index 478f294..a7fa6ae7 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -25,6 +25,7 @@
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <asm/current.h>
+#include <asm/cacheflush.h>
#define MICROBLAZE_ILL_OPCODE_EXCEPTION 0x02
#define MICROBLAZE_IBUS_EXCEPTION 0x03
@@ -52,6 +53,8 @@ void die(const char *str, struct pt_regs *fp, long err)
void sw_exception(struct pt_regs *regs)
{
_exception(SIGTRAP, regs, TRAP_BRKPT, regs->r16);
+ flush_dcache_range(regs->r16, regs->r16 + 0x4);
+ flush_icache_range(regs->r16, regs->r16 + 0x4);
}
void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
diff --git a/arch/microblaze/kernel/hw_exception_handler.S b/arch/microblaze/kernel/hw_exception_handler.S
index 7811954..25f6e07 100644
--- a/arch/microblaze/kernel/hw_exception_handler.S
+++ b/arch/microblaze/kernel/hw_exception_handler.S
@@ -945,11 +945,20 @@ store3: sbi r3, r4, 2;
store4: sbi r3, r4, 3; /* Delay slot */
ex_shw_vm:
/* Store the lower half-word, byte-by-byte into destination address */
+#ifdef __MICROBLAZEEL__
+ lbui r3, r5, 0;
+store5: sbi r3, r4, 0;
+ lbui r3, r5, 1;
+ brid ret_from_exc;
+store6: sbi r3, r4, 1; /* Delay slot */
+#else
lbui r3, r5, 2;
store5: sbi r3, r4, 0;
lbui r3, r5, 3;
brid ret_from_exc;
store6: sbi r3, r4, 1; /* Delay slot */
+#endif
+
ex_sw_end_vm: /* Exception handling of store word, ends. */
/* We have to prevent cases that get/put_user macros get unaligned pointer
diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c
index a105301..c881393 100644
--- a/arch/microblaze/kernel/prom.c
+++ b/arch/microblaze/kernel/prom.c
@@ -61,14 +61,12 @@ static int __init early_init_dt_scan_serial(unsigned long node,
char *p;
int *addr;
- pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
+ pr_debug("search \"serial\", depth: %d, uname: %s\n", depth, uname);
/* find all serial nodes */
if (strncmp(uname, "serial", 6) != 0)
return 0;
- early_init_dt_check_for_initrd(node);
-
/* find compatible node with uartlite */
p = of_get_flat_dt_prop(node, "compatible", &l);
if ((strncmp(p, "xlnx,xps-uartlite", 17) != 0) &&
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index 96a88c3..3451bde 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -123,20 +123,10 @@ SECTIONS {
__init_end_before_initramfs = .;
- .init.ramfs ALIGN(PAGE_SIZE) : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- . = ALIGN(4);
- LONG(0);
-/*
- * FIXME this can break initramfs for MMU.
- * Pad init.ramfs up to page boundary,
- * so that __init_end == __bss_start. This will make image.elf
- * consistent with the image.bin
- */
- /* . = ALIGN(PAGE_SIZE); */
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ INIT_RAM_FS
}
+
__init_end = .;
.bss ALIGN (PAGE_SIZE) : AT(ADDR(.bss) - LOAD_OFFSET) {
diff --git a/arch/microblaze/lib/memmove.c b/arch/microblaze/lib/memmove.c
index 123e361..810fd68 100644
--- a/arch/microblaze/lib/memmove.c
+++ b/arch/microblaze/lib/memmove.c
@@ -182,7 +182,7 @@ void *memmove(void *v_dst, const void *v_src, __kernel_size_t c)
for (; c >= 4; c -= 4) {
value = *--i_src;
*--i_dst = buf_hold | ((value & 0xFF000000)>> 24);
- buf_hold = (value & 0xFFFFFF) << 8;;
+ buf_hold = (value & 0xFFFFFF) << 8;
}
#endif
/* Realign the source */
diff --git a/arch/microblaze/lib/muldi3.S b/arch/microblaze/lib/muldi3.S
deleted file mode 100644
index ceeaa8c..0000000
--- a/arch/microblaze/lib/muldi3.S
+++ /dev/null
@@ -1,121 +0,0 @@
-#include <linux/linkage.h>
-
-/*
- * Multiply operation for 64 bit integers, for devices with hard multiply
- * Input : Operand1[H] in Reg r5
- * Operand1[L] in Reg r6
- * Operand2[H] in Reg r7
- * Operand2[L] in Reg r8
- * Output: Result[H] in Reg r3
- * Result[L] in Reg r4
- *
- * Explaination:
- *
- * Both the input numbers are divided into 16 bit number as follows
- * op1 = A B C D
- * op2 = E F G H
- * result = D * H
- * + (C * H + D * G) << 16
- * + (B * H + C * G + D * F) << 32
- * + (A * H + B * G + C * F + D * E) << 48
- *
- * Only 64 bits of the output are considered
- */
-
- .text
- .globl __muldi3
- .type __muldi3, @function
- .ent __muldi3
-
-__muldi3:
- addi r1, r1, -40
-
-/* Save the input operands on the caller's stack */
- swi r5, r1, 44
- swi r6, r1, 48
- swi r7, r1, 52
- swi r8, r1, 56
-
-/* Store all the callee saved registers */
- sw r20, r1, r0
- swi r21, r1, 4
- swi r22, r1, 8
- swi r23, r1, 12
- swi r24, r1, 16
- swi r25, r1, 20
- swi r26, r1, 24
- swi r27, r1, 28
-
-/* Load all the 16 bit values for A thru H */
- lhui r20, r1, 44 /* A */
- lhui r21, r1, 46 /* B */
- lhui r22, r1, 48 /* C */
- lhui r23, r1, 50 /* D */
- lhui r24, r1, 52 /* E */
- lhui r25, r1, 54 /* F */
- lhui r26, r1, 56 /* G */
- lhui r27, r1, 58 /* H */
-
-/* D * H ==> LSB of the result on stack ==> Store1 */
- mul r9, r23, r27
- swi r9, r1, 36 /* Pos2 and Pos3 */
-
-/* Hi (Store1) + C * H + D * G ==> Store2 ==> Pos1 and Pos2 */
-/* Store the carry generated in position 2 for Pos 3 */
- lhui r11, r1, 36 /* Pos2 */
- mul r9, r22, r27 /* C * H */
- mul r10, r23, r26 /* D * G */
- add r9, r9, r10
- addc r12, r0, r0
- add r9, r9, r11
- addc r12, r12, r0 /* Store the Carry */
- shi r9, r1, 36 /* Store Pos2 */
- swi r9, r1, 32
- lhui r11, r1, 32
- shi r11, r1, 34 /* Store Pos1 */
-
-/* Hi (Store2) + B * H + C * G + D * F ==> Store3 ==> Pos0 and Pos1 */
- mul r9, r21, r27 /* B * H */
- mul r10, r22, r26 /* C * G */
- mul r7, r23, r25 /* D * F */
- add r9, r9, r11
- add r9, r9, r10
- add r9, r9, r7
- swi r9, r1, 32 /* Pos0 and Pos1 */
-
-/* Hi (Store3) + A * H + B * G + C * F + D * E ==> Store3 ==> Pos0 */
- lhui r11, r1, 32 /* Pos0 */
- mul r9, r20, r27 /* A * H */
- mul r10, r21, r26 /* B * G */
- mul r7, r22, r25 /* C * F */
- mul r8, r23, r24 /* D * E */
- add r9, r9, r11
- add r9, r9, r10
- add r9, r9, r7
- add r9, r9, r8
- sext16 r9, r9 /* Sign extend the MSB */
- shi r9, r1, 32
-
-/* Move results to r3 and r4 */
- lhui r3, r1, 32
- add r3, r3, r12
- shi r3, r1, 32
- lwi r3, r1, 32 /* Hi Part */
- lwi r4, r1, 36 /* Lo Part */
-
-/* Restore Callee saved registers */
- lw r20, r1, r0
- lwi r21, r1, 4
- lwi r22, r1, 8
- lwi r23, r1, 12
- lwi r24, r1, 16
- lwi r25, r1, 20
- lwi r26, r1, 24
- lwi r27, r1, 28
-
-/* Restore Frame and return */
- rtsd r15, 8
- addi r1, r1, 40
-
-.size __muldi3, . - __muldi3
-.end __muldi3
diff --git a/arch/microblaze/lib/muldi3.c b/arch/microblaze/lib/muldi3.c
new file mode 100644
index 0000000..d4860e1
--- /dev/null
+++ b/arch/microblaze/lib/muldi3.c
@@ -0,0 +1,60 @@
+#include <linux/module.h>
+
+#include "libgcc.h"
+
+#define DWtype long long
+#define UWtype unsigned long
+#define UHWtype unsigned short
+
+#define W_TYPE_SIZE 32
+
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+/* If we still don't have umul_ppmm, define it using plain C. */
+#if !defined(umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __x0, __x1, __x2, __x3; \
+ UHWtype __ul, __vl, __uh, __vh; \
+ \
+ __ul = __ll_lowpart(u); \
+ __uh = __ll_highpart(u); \
+ __vl = __ll_lowpart(v); \
+ __vh = __ll_highpart(v); \
+ \
+ __x0 = (UWtype) __ul * __vl; \
+ __x1 = (UWtype) __ul * __vh; \
+ __x2 = (UWtype) __uh * __vl; \
+ __x3 = (UWtype) __uh * __vh; \
+ \
+ __x1 += __ll_highpart(__x0); /* this can't give carry */\
+ __x1 += __x2; /* but this indeed can */ \
+ if (__x1 < __x2) /* did we get it? */ \
+ __x3 += __ll_B; /* yes, add it in the proper pos */ \
+ \
+ (w1) = __x3 + __ll_highpart(__x1); \
+ (w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0);\
+ } while (0)
+#endif
+
+#if !defined(__umulsidi3)
+#define __umulsidi3(u, v) ({ \
+ DWunion __w; \
+ umul_ppmm(__w.s.high, __w.s.low, u, v); \
+ __w.ll; \
+ })
+#endif
+
+DWtype __muldi3(DWtype u, DWtype v)
+{
+ const DWunion uu = {.ll = u};
+ const DWunion vv = {.ll = v};
+ DWunion w = {.ll = __umulsidi3(uu.s.low, vv.s.low)};
+
+ w.s.high += ((UWtype) uu.s.low * (UWtype) vv.s.high
+ + (UWtype) uu.s.high * (UWtype) vv.s.low);
+
+ return w.ll;
+}
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 55d106b..211ca3f 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,17 +185,16 @@ struct bootnode;
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
-extern int acpi_get_nodes(struct bootnode *physnodes);
+extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end);
extern int acpi_scan_nodes(unsigned long start, unsigned long end);
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+#ifdef CONFIG_NUMA_EMU
extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
int num_nodes);
-#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
- int num_nodes)
-{
-}
#endif
+#endif /* CONFIG_ACPI_NUMA */
#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 6aee50d..64dc82e 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,16 +3,27 @@
#include <linux/pci.h>
+struct amd_nb_bus_dev_range {
+ u8 bus;
+ u8 dev_base;
+ u8 dev_limit;
+};
+
extern struct pci_device_id amd_nb_misc_ids[];
+extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
struct bootnode;
extern int early_is_amd_nb(u32 value);
extern int amd_cache_northbridges(void);
extern void amd_flush_garts(void);
-extern int amd_get_nodes(struct bootnode *nodes);
extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int amd_scan_nodes(void);
+#ifdef CONFIG_NUMA_EMU
+extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
+extern void amd_get_nodes(struct bootnode *nodes);
+#endif
+
struct amd_northbridge {
struct pci_dev *misc;
};
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 0141b23..4729b2b 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -116,11 +116,11 @@ enum fixed_addresses {
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
- __end_of_permanent_fixed_addresses,
-
#ifdef CONFIG_X86_MRST
FIX_LNW_VRTC,
#endif
+ __end_of_permanent_fixed_addresses,
+
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 49dbfdf..91d915a 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio)
return __gpio_cansleep(gpio);
}
-/*
- * Not implemented, yet.
- */
static inline int gpio_to_irq(unsigned int gpio)
{
- return -ENOSYS;
+ return __gpio_to_irq(gpio);
}
static inline int irq_to_gpio(unsigned int irq)
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f23eb25..ca242d3 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
DIE_TRAP,
DIE_GPF,
DIE_CALL,
- DIE_NMI_IPI,
DIE_PAGE_FAULT,
DIE_NMIUNKNOWN,
};
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f792060..72a8b52 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
#include <asm/mc146818rtc.h>
+#define NMI_REASON_PORT 0x61
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+#define NMI_REASON_CLEAR_SERR 0x04
+#define NMI_REASON_CLEAR_IOCHK 0x08
+#define NMI_REASON_CLEAR_MASK 0x0f
+
static inline unsigned char get_nmi_reason(void)
{
- return inb(0x61);
+ return inb(NMI_REASON_PORT);
}
static inline void reassert_nmi(void)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c4021b9..c76f5b9 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -23,6 +23,26 @@ void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
+/*
+ * Define some priorities for the nmi notifier call chain.
+ *
+ * Create a local nmi bit that has a higher priority than
+ * external nmis, because the local ones are more frequent.
+ *
+ * Also setup some default high/normal/low settings for
+ * subsystems to registers with. Using 4 bits to seperate
+ * the priorities. This can go alot higher if needed be.
+ */
+
+#define NMI_LOCAL_SHIFT 16 /* randomly picked */
+#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
+#define NMI_HIGH_PRIOR (1ULL << 8)
+#define NMI_NORMAL_PRIOR (1ULL << 4)
+#define NMI_LOW_PRIOR (1ULL << 0)
+#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
+#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
+#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
+
void stop_nmi(void);
void restart_nmi(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070..5ae8728 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -38,7 +38,7 @@ extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
#endif /* CONFIG_NUMA_EMU */
#else
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 295e2ff..e2f6a99 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -20,6 +20,9 @@
#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
#define ARCH_P4_MAX_CCCR (18)
+#define ARCH_P4_CNTRVAL_BITS (40)
+#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
+
#define P4_ESCR_EVENT_MASK 0x7e000000U
#define P4_ESCR_EVENT_SHIFT 25
#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index affacb5..0a99f71 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,13 @@ struct pci_device_id amd_nb_misc_ids[] = {
};
EXPORT_SYMBOL(amd_nb_misc_ids);
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
struct amd_northbridge_info amd_northbridges;
EXPORT_SYMBOL(amd_northbridges);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index dcd7c83..5955a78 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -39,18 +39,6 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-struct bus_dev_range {
- int bus;
- int dev_base;
- int dev_limit;
-};
-
-static struct bus_dev_range bus_dev_ranges[] __initdata = {
- { 0x00, 0x18, 0x20},
- { 0xff, 0x00, 0x20},
- { 0xfe, 0x00, 0x20}
-};
-
static struct resource gart_resource = {
.name = "GART",
.flags = IORESOURCE_MEM,
@@ -294,13 +282,13 @@ void __init early_gart_iommu_check(void)
search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -349,13 +337,13 @@ void __init early_gart_iommu_check(void)
return;
/* disable them all at first */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -390,14 +378,14 @@ int __init gart_iommu_hole_init(void)
fix = 0;
node = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
u32 ctl;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
@@ -505,7 +493,7 @@ out:
}
/* Fix up the north bridges */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus, dev_base, dev_limit;
/*
@@ -514,9 +502,9 @@ out:
*/
u32 ctl = DISTLBWALKPRB | aper_order << 1;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a51345b..06c196d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -684,7 +684,7 @@ static int __init calibrate_APIC_clock(void)
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 72ec29e..79fd43c 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -68,7 +68,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
default:
@@ -96,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
static __read_mostly struct notifier_block backtrace_notifier = {
.notifier_call = arch_trigger_all_cpu_backtrace_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static int __init register_trigger_all_cpu_backtrace(void)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ecca5f4..bd16b58 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -378,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = {
static __cpuinit void set_x2apic_extra_bits(int pnode)
{
- __this_cpu_write(x2apic_extra_bits, (pnode << 6));
+ __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
}
/*
@@ -641,7 +641,7 @@ void __cpuinit uv_cpu_init(void)
*/
int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
- if (reason != DIE_NMI_IPI)
+ if (reason != DIE_NMIUNKNOWN)
return NOTIFY_OK;
if (in_crash_kexec)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7..a779719 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
struct die_args *args = (struct die_args *)data;
int cpu = smp_processor_id();
struct mce *m = &__get_cpu_var(injectm);
- if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
+ if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
return NOTIFY_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
static struct notifier_block mce_raise_nb = {
.notifier_call = mce_raise_notify,
- .priority = 1000,
+ .priority = NMI_LOCAL_NORMAL_PRIOR,
};
/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0492101..9d977a2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1267,7 +1267,6 @@ perf_event_nmi_handler(struct notifier_block *self,
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
case DIE_NMIUNKNOWN:
this_nmi = percpu_read(irq_stat.__nmi_count);
@@ -1317,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self,
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static struct event_constraint unconstrained;
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 81400b9..e56b9bf 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,21 @@ out:
static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
- int overflow = 0;
- u32 low, high;
+ u64 v;
- rdmsr(hwc->config_base + hwc->idx, low, high);
-
- /* we need to check high bit for unflagged overflows */
- if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
- overflow = 1;
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)low) & ~P4_CCCR_OVF);
+ /* an official way for overflow indication */
+ rdmsrl(hwc->config_base + hwc->idx, v);
+ if (v & P4_CCCR_OVF) {
+ wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
+ return 1;
}
- return overflow;
+ /* it might be unflagged overflow */
+ rdmsrl(hwc->event_base + hwc->idx, v);
+ if (!(v & ARCH_P4_CNTRVAL_MASK))
+ return 1;
+
+ return 0;
}
static void p4_pmu_disable_pebs(void)
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = {
*/
.num_counters = ARCH_P4_MAX_CCCR,
.apic = 1,
- .cntval_bits = 40,
- .cntval_mask = (1ULL << 40) - 1,
- .max_period = (1ULL << 39) - 1,
+ .cntval_bits = ARCH_P4_CNTRVAL_BITS,
+ .cntval_mask = ARCH_P4_CNTRVAL_MASK,
+ .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
/*
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 8474c99..d6fb146 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -197,14 +197,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
*/
void dump_stack(void)
{
- unsigned long bp = 0;
unsigned long stack;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e3ba417..d3b895f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -299,17 +299,21 @@ ENDPROC(native_usergs_sysret64)
ENTRY(save_args)
XCPT_FRAME
cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ /*
+ * start from rbp in pt_regs and jump over
+ * return address.
+ */
+ movq_cfi rdi, RDI+8-RBP
+ movq_cfi rsi, RSI+8-RBP
+ movq_cfi rdx, RDX+8-RBP
+ movq_cfi rcx, RCX+8-RBP
+ movq_cfi rax, RAX+8-RBP
+ movq_cfi r8, R8+8-RBP
+ movq_cfi r9, R9+8-RBP
+ movq_cfi r10, R10+8-RBP
+ movq_cfi r11, R11+8-RBP
+
+ leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
movq_cfi rbp, 8 /* push %rbp */
leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
testl $3, CS(%rdi)
@@ -782,8 +786,9 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $ORIG_RAX-ARGOFFSET+8, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
+ /* reserve pt_regs for scratch regs and rbp */
+ subq $ORIG_RAX-RBP, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
call save_args
PARTIAL_FRAME 0
call \func
@@ -808,9 +813,14 @@ ret_from_intr:
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
leaveq
+
CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
+
+ /* we did not save rbx, restore only from ARGOFFSET */
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index cd21b65..a413000 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
#include <asm/apicdef.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
@@ -525,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
}
return NOTIFY_DONE;
- case DIE_NMI_IPI:
- /* Just ignore, we will handle the roundup on DIE_NMI. */
- return NOTIFY_DONE;
-
case DIE_NMIUNKNOWN:
if (was_in_debug_nmi[raw_smp_processor_id()]) {
was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -606,7 +603,7 @@ static struct notifier_block kgdb_notifier = {
/*
* Lowest-prio notifier priority, we want to be notified last:
*/
- .priority = -INT_MAX,
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
/**
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8d..fc7aae1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
#include <asm/pci_x86.h>
#include <asm/virtext.h>
#include <asm/cpu.h>
+#include <asm/nmi.h>
#ifdef CONFIG_X86_32
# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
{
int cpu;
- if (val != DIE_NMI_IPI)
+ if (val != DIE_NMI)
return NOTIFY_OK;
cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
static struct notifier_block crash_nmi_nb = {
.notifier_call = crash_nmi_callback,
+ /* we want to be the first one called */
+ .priority = NMI_LOCAL_HIGH_PRIOR+1,
};
/* Halt all other CPUs, calling the specified function on each of them
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c7149c9..763df77 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
*/
static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
-void cpu_hotplug_driver_lock()
+void cpu_hotplug_driver_lock(void)
{
mutex_lock(&x86_cpu_hotplug_driver_mutex);
}
-void cpu_hotplug_driver_unlock()
+void cpu_hotplug_driver_unlock(void)
{
mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c76aaca..b9b6716 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -84,6 +84,11 @@ EXPORT_SYMBOL_GPL(used_vectors);
static int ignore_nmis;
int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -310,15 +315,15 @@ static int __init setup_unknown_nmi_panic(char *str)
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
-
- printk(KERN_EMERG
- "You have some hardware problem, likely on the PCI bus.\n");
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ /*
+ * On some machines, PCI SERR line is used to report memory
+ * errors. EDAC makes use of it.
+ */
#if defined(CONFIG_EDAC)
if (edac_handler_set()) {
edac_atomic_assert_error();
@@ -329,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -341,15 +346,17 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
show_registers(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
/* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
i = 20000;
while (--i) {
@@ -357,8 +364,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
udelay(100);
}
- reason &= ~8;
- outb(reason, 0x61);
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -377,57 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
return;
}
#endif
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
}
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
- int cpu;
- cpu = smp_processor_id();
-
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu)
- reason = get_nmi_reason();
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
+ /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+ raw_spin_lock(&nmi_reason_lock);
+ reason = get_nmi_reason();
-#ifdef CONFIG_X86_LOCAL_APIC
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
#endif
- unknown_nmi_error(reason, regs);
-
+ raw_spin_unlock(&nmi_reason_lock);
return;
}
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
+ raw_spin_unlock(&nmi_reason_lock);
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered:
- */
- reassert_nmi();
-#endif
+ unknown_nmi_error(reason, regs);
}
dotraplinkage notrace __kprobes void
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 03d2ea8..823f79a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -965,7 +965,7 @@ out:
static int __init init_tsc_clocksource(void)
{
- if (!cpu_has_tsc || tsc_disabled > 0)
+ if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
return 0;
if (tsc_clocksource_reliable)
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 08a0069..f21962c 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -27,6 +27,7 @@
#include <asm/amd_nb.h>
static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
static __init int find_northbridge(void)
@@ -68,19 +69,6 @@ static __init void early_get_boot_cpu_id(void)
#endif
}
-int __init amd_get_nodes(struct bootnode *physnodes)
-{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long start = PFN_PHYS(start_pfn);
@@ -113,7 +101,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
- nodeid = limit & 7;
+ nodeids[i] = nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
pr_info("Skipping disabled node %d\n", i);
@@ -193,6 +181,76 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
return 0;
}
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+void __init amd_get_nodes(struct bootnode *physnodes)
+{
+ int i;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
+ }
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+ int ret = NUMA_NO_NODE;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ if (addr >= nodes[i].start && addr < nodes[i].end) {
+ ret = i;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment. For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality. node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+ unsigned int bits;
+ unsigned int cores;
+ unsigned int apicid_base = 0;
+ int i;
+
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = 1 << bits;
+ early_get_boot_cpu_id();
+ if (boot_cpu_physical_apicid > 0)
+ apicid_base = boot_cpu_physical_apicid;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int index;
+ int nid;
+ int j;
+
+ nid = find_node_by_addr(nodes[i].start);
+ if (nid == NUMA_NO_NODE)
+ continue;
+
+ index = nodeids[nid] << bits;
+ if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+ __acpi_map_pxm_to_node(nid, i);
+#endif
+ }
+ memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
+
int __init amd_scan_nodes(void)
{
unsigned int bits;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7762a51..1e72102 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -260,30 +260,30 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
static char *cmdline __initdata;
static int __init setup_physnodes(unsigned long start, unsigned long end,
int acpi, int amd)
{
- int nr_nodes = 0;
int ret = 0;
int i;
+ memset(physnodes, 0, sizeof(physnodes));
#ifdef CONFIG_ACPI_NUMA
if (acpi)
- nr_nodes = acpi_get_nodes(physnodes);
+ acpi_get_nodes(physnodes, start, end);
#endif
#ifdef CONFIG_AMD_NUMA
if (amd)
- nr_nodes = amd_get_nodes(physnodes);
+ amd_get_nodes(physnodes);
#endif
/*
* Basic sanity checking on the physical node map: there may be errors
* if the SRAT or AMD code incorrectly reported the topology or the mem=
* kernel parameter is used.
*/
- for (i = 0; i < nr_nodes; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
if (physnodes[i].start == physnodes[i].end)
continue;
if (physnodes[i].start > end) {
@@ -298,17 +298,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
physnodes[i].start = start;
if (physnodes[i].end > end)
physnodes[i].end = end;
- }
-
- /*
- * Remove all nodes that have no memory or were truncated because of the
- * limited address range.
- */
- for (i = 0; i < nr_nodes; i++) {
- if (physnodes[i].start == physnodes[i].end)
- continue;
- physnodes[ret].start = physnodes[i].start;
- physnodes[ret].end = physnodes[i].end;
ret++;
}
@@ -324,6 +313,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
return ret;
}
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+ int i;
+
+ BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+ if (acpi)
+ acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ amd_fake_nodes(nodes, nr_nodes);
+#endif
+ if (!acpi && !amd)
+ for (i = 0; i < nr_cpu_ids; i++)
+ numa_set_node(i, 0);
+}
+
/*
* Setups up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +359,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
- int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
@@ -384,7 +390,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
return -1;
}
- for (i = 0; i < nr_phys_nodes; i++)
+ for (i = 0; i < MAX_NUMNODES; i++)
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
@@ -553,11 +559,9 @@ static int __init numa_emulation(unsigned long start_pfn,
{
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
- int num_phys_nodes;
int num_nodes;
int i;
- num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +576,7 @@ static int __init numa_emulation(unsigned long start_pfn,
unsigned long n;
n = simple_strtoul(cmdline, NULL, 0);
- num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+ num_nodes = split_nodes_interleave(addr, max_addr, n);
}
if (num_nodes < 0)
@@ -595,7 +599,8 @@ static int __init numa_emulation(unsigned long start_pfn,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
- acpi_fake_nodes(nodes, num_nodes);
+ setup_physnodes(addr, max_addr, acpi, amd);
+ fake_physnodes(acpi, amd, num_nodes);
numa_init_array();
return 0;
}
@@ -610,8 +615,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
return;
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
@@ -767,6 +776,7 @@ void __cpuinit numa_clear_node(int cpu)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+#ifndef CONFIG_NUMA_EMU
void __cpuinit numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -776,34 +786,115 @@ void __cpuinit numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+ unsigned long addr;
+ u16 apicid;
+ int physnid;
+ int nid = NUMA_NO_NODE;
+
+ apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ if (apicid != BAD_APICID)
+ nid = apicid_to_node[apicid];
+ if (nid == NUMA_NO_NODE)
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ /*
+ * Use the starting address of the emulated node to find which physical
+ * node it is allocated on.
+ */
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ break;
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid) {
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+ }
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ char buf[64];
+
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return NULL;
+ }
+
+ cpulist_scnprintf(buf, sizeof(buf), mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, buf);
+ return mask;
+}
/*
* --------- debug versions of the numa functions ---------
*/
+#ifndef CONFIG_NUMA_EMU
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
- int node = early_cpu_to_node(cpu);
struct cpumask *mask;
- char buf[64];
- mask = node_to_cpumask_map[node];
- if (mask == NULL) {
- printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
- dump_stack();
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
return;
- }
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
+}
+#else
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ int i;
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+ for_each_online_node(i) {
+ unsigned long addr;
+
+ addr = node_start_pfn(i) << PAGE_SHIFT;
+ if (addr < physnodes[node].start ||
+ addr >= physnodes[node].end)
+ continue;
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
+ return;
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+ }
}
+#endif /* CONFIG_NUMA_EMU */
void __cpuinit numa_add_cpu(int cpu)
{
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 171a0aa..603d285 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -349,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
void __init acpi_numa_arch_fixup(void) {}
-int __init acpi_get_nodes(struct bootnode *physnodes)
+#ifdef CONFIG_NUMA_EMU
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end)
{
int i;
- int ret = 0;
for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
+ cutoff_node(i, start, end);
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
}
- return ret;
}
+#endif /* CONFIG_NUMA_EMU */
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -505,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
- "topology.\n");
for (i = 0; i < num_nodes; i++) {
int nid, pxm;
@@ -526,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
fake_apicid_to_node[j] == NUMA_NO_NODE)
fake_apicid_to_node[j] = i;
}
+
+ /*
+ * If there are apicid-to-node mappings for physical nodes that do not
+ * have a corresponding emulated node, it should default to a guaranteed
+ * value.
+ */
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ if (apicid_to_node[i] != NUMA_NO_NODE &&
+ fake_apicid_to_node[i] == NUMA_NO_NODE)
+ fake_apicid_to_node[i] = 0;
+
for (i = 0; i < num_nodes; i++)
__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f24a853..e2b7b0c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- case DIE_NMI_IPI:
if (ctr_running)
model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
else if (!nmi_enabled)
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 2
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static void nmi_cpu_restore_registers(struct op_msrs *msrs)
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index 0636dd9..720bf5a 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
static struct notifier_block profile_timer_exceptions_nb = {
.notifier_call = profile_timer_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = NMI_LOW_PRIOR,
};
static int timer_start(void)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index fc1e8fe..e27dffb 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/range.h>
+#include <asm/amd_nb.h>
#include <asm/pci_x86.h>
#include <asm/pci-direct.h>
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = {
.notifier_call = amd_cpu_notify,
};
+static void __init pci_enable_pci_io_ecs(void)
+{
+#ifdef CONFIG_AMD_NB
+ unsigned int i, n;
+
+ for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
+ u8 bus = amd_nb_bus_dev_ranges[i].bus;
+ u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
+ u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (; slot < limit; ++slot) {
+ u32 val = read_pci_config(bus, slot, 3, 0);
+
+ if (!early_is_amd_nb(val))
+ continue;
+
+ val = read_pci_config(bus, slot, 3, 0x8c);
+ if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
+ val |= ENABLE_CF8_EXT_CFG >> 32;
+ write_pci_config(bus, slot, 3, 0x8c, val);
+ }
+ ++n;
+ }
+ }
+ pr_info("Extended Config Space enabled on %u nodes\n", n);
+#endif
+}
+
static int __init pci_io_ecs_init(void)
{
int cpu;
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void)
if (boot_cpu_data.x86 < 0x10)
return 0;
+ /* Try the PCI method first. */
+ if (early_pci_allowed())
+ pci_enable_pci_io_ecs();
+
register_cpu_notifier(&amd_cpu_notifier);
for_each_online_cpu(cpu)
amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index f4d334f..320668f 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -1081,7 +1081,7 @@ ipmi_nmi(struct notifier_block *self, unsigned long val, void *data)
{
struct die_args *args = data;
- if (val != DIE_NMI)
+ if (val != DIE_NMIUNKNOWN)
return NOTIFY_OK;
/* Hack, if it's a memory or I/O error, ignore it. */
diff --git a/drivers/mfd/sh_mobile_sdhi.c b/drivers/mfd/sh_mobile_sdhi.c
index f1714f9..0a7df44 100644
--- a/drivers/mfd/sh_mobile_sdhi.c
+++ b/drivers/mfd/sh_mobile_sdhi.c
@@ -131,11 +131,17 @@ static int __devinit sh_mobile_sdhi_probe(struct platform_device *pdev)
*/
mmc_data->flags |= TMIO_MMC_BLKSZ_2BYTES;
+ /*
+ * All SDHI blocks support SDIO IRQ signalling.
+ */
+ mmc_data->flags |= TMIO_MMC_SDIO_IRQ;
+
if (p && p->dma_slave_tx >= 0 && p->dma_slave_rx >= 0) {
priv->param_tx.slave_id = p->dma_slave_tx;
priv->param_rx.slave_id = p->dma_slave_rx;
priv->dma_priv.chan_priv_tx = &priv->param_tx;
priv->dma_priv.chan_priv_rx = &priv->param_rx;
+ priv->dma_priv.alignment_shift = 1; /* 2-byte alignment */
mmc_data->dma = &priv->dma_priv;
}
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig
index 57e4416..2a876c4 100644
--- a/drivers/mmc/card/Kconfig
+++ b/drivers/mmc/card/Kconfig
@@ -16,6 +16,7 @@ config MMC_BLOCK
config MMC_BLOCK_MINORS
int "Number of minors per block device"
+ depends on MMC_BLOCK
range 4 256
default 8
help
diff --git a/drivers/mmc/core/Kconfig b/drivers/mmc/core/Kconfig
index bb22ffd..ef10387 100644
--- a/drivers/mmc/core/Kconfig
+++ b/drivers/mmc/core/Kconfig
@@ -16,3 +16,14 @@ config MMC_UNSAFE_RESUME
This option sets a default which can be overridden by the
module parameter "removable=0" or "removable=1".
+
+config MMC_CLKGATE
+ bool "MMC host clock gating (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ This will attempt to aggressively gate the clock to the MMC card.
+ This is done to save power due to gating off the logic and bus
+ noise when the MMC card is not in use. Your host driver has to
+ support handling this in order for it to be of any use.
+
+ If unsure, say N.
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index af8dc6a..63667a8 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -303,14 +303,14 @@ int mmc_add_card(struct mmc_card *card)
type, card->rca);
}
- ret = device_add(&card->dev);
- if (ret)
- return ret;
-
#ifdef CONFIG_DEBUG_FS
mmc_add_card_debugfs(card);
#endif
+ ret = device_add(&card->dev);
+ if (ret)
+ return ret;
+
mmc_card_set_present(card);
return 0;
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index a3a780f..6625c05 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -22,6 +22,7 @@
#include <linux/scatterlist.h>
#include <linux/log2.h>
#include <linux/regulator/consumer.h>
+#include <linux/pm_runtime.h>
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
@@ -130,6 +131,8 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq)
if (mrq->done)
mrq->done(mrq);
+
+ mmc_host_clk_gate(host);
}
}
@@ -190,6 +193,7 @@ mmc_start_request(struct mmc_host *host, struct mmc_request *mrq)
mrq->stop->mrq = mrq;
}
}
+ mmc_host_clk_ungate(host);
host->ops->request(host, mrq);
}
@@ -295,8 +299,9 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card)
unsigned int timeout_us, limit_us;
timeout_us = data->timeout_ns / 1000;
- timeout_us += data->timeout_clks * 1000 /
- (card->host->ios.clock / 1000);
+ if (mmc_host_clk_rate(card->host))
+ timeout_us += data->timeout_clks * 1000 /
+ (mmc_host_clk_rate(card->host) / 1000);
if (data->flags & MMC_DATA_WRITE)
/*
@@ -614,6 +619,8 @@ static inline void mmc_set_ios(struct mmc_host *host)
ios->power_mode, ios->chip_select, ios->vdd,
ios->bus_width, ios->timing);
+ if (ios->clock > 0)
+ mmc_set_ungated(host);
host->ops->set_ios(host, ios);
}
@@ -641,6 +648,61 @@ void mmc_set_clock(struct mmc_host *host, unsigned int hz)
mmc_set_ios(host);
}
+#ifdef CONFIG_MMC_CLKGATE
+/*
+ * This gates the clock by setting it to 0 Hz.
+ */
+void mmc_gate_clock(struct mmc_host *host)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&host->clk_lock, flags);
+ host->clk_old = host->ios.clock;
+ host->ios.clock = 0;
+ host->clk_gated = true;
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ mmc_set_ios(host);
+}
+
+/*
+ * This restores the clock from gating by using the cached
+ * clock value.
+ */
+void mmc_ungate_clock(struct mmc_host *host)
+{
+ /*
+ * We should previously have gated the clock, so the clock shall
+ * be 0 here! The clock may however be 0 during initialization,
+ * when some request operations are performed before setting
+ * the frequency. When ungate is requested in that situation
+ * we just ignore the call.
+ */
+ if (host->clk_old) {
+ BUG_ON(host->ios.clock);
+ /* This call will also set host->clk_gated to false */
+ mmc_set_clock(host, host->clk_old);
+ }
+}
+
+void mmc_set_ungated(struct mmc_host *host)
+{
+ unsigned long flags;
+
+ /*
+ * We've been given a new frequency while the clock is gated,
+ * so make sure we regard this as ungating it.
+ */
+ spin_lock_irqsave(&host->clk_lock, flags);
+ host->clk_gated = false;
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+}
+
+#else
+void mmc_set_ungated(struct mmc_host *host)
+{
+}
+#endif
+
/*
* Change the bus mode (open drain/push-pull) of a host.
*/
@@ -1424,35 +1486,57 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen)
}
EXPORT_SYMBOL(mmc_set_blocklen);
+static int mmc_rescan_try_freq(struct mmc_host *host, unsigned freq)
+{
+ host->f_init = freq;
+
+#ifdef CONFIG_MMC_DEBUG
+ pr_info("%s: %s: trying to init card at %u Hz\n",
+ mmc_hostname(host), __func__, host->f_init);
+#endif
+ mmc_power_up(host);
+ sdio_reset(host);
+ mmc_go_idle(host);
+
+ mmc_send_if_cond(host, host->ocr_avail);
+
+ /* Order's important: probe SDIO, then SD, then MMC */
+ if (!mmc_attach_sdio(host))
+ return 0;
+ if (!mmc_attach_sd(host))
+ return 0;
+ if (!mmc_attach_mmc(host))
+ return 0;
+
+ mmc_power_off(host);
+ return -EIO;
+}
+
void mmc_rescan(struct work_struct *work)
{
+ static const unsigned freqs[] = { 400000, 300000, 200000, 100000 };
struct mmc_host *host =
container_of(work, struct mmc_host, detect.work);
- u32 ocr;
- int err;
- unsigned long flags;
int i;
- const unsigned freqs[] = { 400000, 300000, 200000, 100000 };
-
- spin_lock_irqsave(&host->lock, flags);
- if (host->rescan_disable) {
- spin_unlock_irqrestore(&host->lock, flags);
+ if (host->rescan_disable)
return;
- }
-
- spin_unlock_irqrestore(&host->lock, flags);
-
mmc_bus_get(host);
- /* if there is a card registered, check whether it is still present */
- if ((host->bus_ops != NULL) && host->bus_ops->detect && !host->bus_dead)
+ /*
+ * if there is a _removable_ card registered, check whether it is
+ * still present
+ */
+ if (host->bus_ops && host->bus_ops->detect && !host->bus_dead
+ && mmc_card_is_removable(host))
host->bus_ops->detect(host);
+ /*
+ * Let mmc_bus_put() free the bus/bus_ops if we've found that
+ * the card is no longer present.
+ */
mmc_bus_put(host);
-
-
mmc_bus_get(host);
/* if there still is a card present, stop here */
@@ -1461,8 +1545,6 @@ void mmc_rescan(struct work_struct *work)
goto out;
}
- /* detect a newly inserted card */
-
/*
* Only we can add a new handler, so it's safe to
* release the lock here.
@@ -1472,72 +1554,16 @@ void mmc_rescan(struct work_struct *work)
if (host->ops->get_cd && host->ops->get_cd(host) == 0)
goto out;
+ mmc_claim_host(host);
for (i = 0; i < ARRAY_SIZE(freqs); i++) {
- mmc_claim_host(host);
-
- if (freqs[i] >= host->f_min)
- host->f_init = freqs[i];
- else if (!i || freqs[i-1] > host->f_min)
- host->f_init = host->f_min;
- else {
- mmc_release_host(host);
- goto out;
- }
-#ifdef CONFIG_MMC_DEBUG
- pr_info("%s: %s: trying to init card at %u Hz\n",
- mmc_hostname(host), __func__, host->f_init);
-#endif
- mmc_power_up(host);
- sdio_reset(host);
- mmc_go_idle(host);
-
- mmc_send_if_cond(host, host->ocr_avail);
-
- /*
- * First we search for SDIO...
- */
- err = mmc_send_io_op_cond(host, 0, &ocr);
- if (!err) {
- if (mmc_attach_sdio(host, ocr)) {
- mmc_claim_host(host);
- /*
- * Try SDMEM (but not MMC) even if SDIO
- * is broken.
- */
- if (mmc_send_app_op_cond(host, 0, &ocr))
- goto out_fail;
-
- if (mmc_attach_sd(host, ocr))
- mmc_power_off(host);
- }
- goto out;
- }
-
- /*
- * ...then normal SD...
- */
- err = mmc_send_app_op_cond(host, 0, &ocr);
- if (!err) {
- if (mmc_attach_sd(host, ocr))
- mmc_power_off(host);
- goto out;
- }
-
- /*
- * ...and finally MMC.
- */
- err = mmc_send_op_cond(host, 0, &ocr);
- if (!err) {
- if (mmc_attach_mmc(host, ocr))
- mmc_power_off(host);
- goto out;
- }
-
-out_fail:
- mmc_release_host(host);
- mmc_power_off(host);
+ if (!mmc_rescan_try_freq(host, max(freqs[i], host->f_min)))
+ break;
+ if (freqs[i] < host->f_min)
+ break;
}
-out:
+ mmc_release_host(host);
+
+ out:
if (host->caps & MMC_CAP_NEEDS_POLL)
mmc_schedule_delayed_work(&host->detect, HZ);
}
@@ -1721,6 +1747,18 @@ int mmc_resume_host(struct mmc_host *host)
if (!(host->pm_flags & MMC_PM_KEEP_POWER)) {
mmc_power_up(host);
mmc_select_voltage(host, host->ocr);
+ /*
+ * Tell runtime PM core we just powered up the card,
+ * since it still believes the card is powered off.
+ * Note that currently runtime PM is only enabled
+ * for SDIO cards that are MMC_CAP_POWER_OFF_CARD
+ */
+ if (mmc_card_sdio(host->card) &&
+ (host->caps & MMC_CAP_POWER_OFF_CARD)) {
+ pm_runtime_disable(&host->card->dev);
+ pm_runtime_set_active(&host->card->dev);
+ pm_runtime_enable(&host->card->dev);
+ }
}
BUG_ON(!host->bus_ops->resume);
err = host->bus_ops->resume(host);
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index 77240cd..ca1fdde 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -33,6 +33,9 @@ void mmc_init_erase(struct mmc_card *card);
void mmc_set_chip_select(struct mmc_host *host, int mode);
void mmc_set_clock(struct mmc_host *host, unsigned int hz);
+void mmc_gate_clock(struct mmc_host *host);
+void mmc_ungate_clock(struct mmc_host *host);
+void mmc_set_ungated(struct mmc_host *host);
void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode);
void mmc_set_bus_width(struct mmc_host *host, unsigned int width);
void mmc_set_bus_width_ddr(struct mmc_host *host, unsigned int width,
@@ -54,9 +57,9 @@ void mmc_rescan(struct work_struct *work);
void mmc_start_host(struct mmc_host *host);
void mmc_stop_host(struct mmc_host *host);
-int mmc_attach_mmc(struct mmc_host *host, u32 ocr);
-int mmc_attach_sd(struct mmc_host *host, u32 ocr);
-int mmc_attach_sdio(struct mmc_host *host, u32 ocr);
+int mmc_attach_mmc(struct mmc_host *host);
+int mmc_attach_sd(struct mmc_host *host);
+int mmc_attach_sdio(struct mmc_host *host);
/* Module parameters */
extern int use_spi_crc;
diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c
index eed1405..998797e 100644
--- a/drivers/mmc/core/debugfs.c
+++ b/drivers/mmc/core/debugfs.c
@@ -183,6 +183,11 @@ void mmc_add_host_debugfs(struct mmc_host *host)
&mmc_clock_fops))
goto err_node;
+#ifdef CONFIG_MMC_CLKGATE
+ if (!debugfs_create_u32("clk_delay", (S_IRUSR | S_IWUSR),
+ root, &host->clk_delay))
+ goto err_node;
+#endif
return;
err_node:
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 10b8af2..b3ac6c5 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -3,6 +3,7 @@
*
* Copyright (C) 2003 Russell King, All Rights Reserved.
* Copyright (C) 2007-2008 Pierre Ossman
+ * Copyright (C) 2010 Linus Walleij
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,6 +21,7 @@
#include <linux/suspend.h>
#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
#include "core.h"
#include "host.h"
@@ -50,6 +52,205 @@ void mmc_unregister_host_class(void)
static DEFINE_IDR(mmc_host_idr);
static DEFINE_SPINLOCK(mmc_host_lock);
+#ifdef CONFIG_MMC_CLKGATE
+
+/*
+ * Enabling clock gating will make the core call out to the host
+ * once up and once down when it performs a request or card operation
+ * intermingled in any fashion. The driver will see this through
+ * set_ios() operations with ios.clock field set to 0 to gate (disable)
+ * the block clock, and to the old frequency to enable it again.
+ */
+static void mmc_host_clk_gate_delayed(struct mmc_host *host)
+{
+ unsigned long tick_ns;
+ unsigned long freq = host->ios.clock;
+ unsigned long flags;
+
+ if (!freq) {
+ pr_debug("%s: frequency set to 0 in disable function, "
+ "this means the clock is already disabled.\n",
+ mmc_hostname(host));
+ return;
+ }
+ /*
+ * New requests may have appeared while we were scheduling,
+ * then there is no reason to delay the check before
+ * clk_disable().
+ */
+ spin_lock_irqsave(&host->clk_lock, flags);
+
+ /*
+ * Delay n bus cycles (at least 8 from MMC spec) before attempting
+ * to disable the MCI block clock. The reference count may have
+ * gone up again after this delay due to rescheduling!
+ */
+ if (!host->clk_requests) {
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ tick_ns = DIV_ROUND_UP(1000000000, freq);
+ ndelay(host->clk_delay * tick_ns);
+ } else {
+ /* New users appeared while waiting for this work */
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ return;
+ }
+ mutex_lock(&host->clk_gate_mutex);
+ spin_lock_irqsave(&host->clk_lock, flags);
+ if (!host->clk_requests) {
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ /* This will set host->ios.clock to 0 */
+ mmc_gate_clock(host);
+ spin_lock_irqsave(&host->clk_lock, flags);
+ pr_debug("%s: gated MCI clock\n", mmc_hostname(host));
+ }
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ mutex_unlock(&host->clk_gate_mutex);
+}
+
+/*
+ * Internal work. Work to disable the clock at some later point.
+ */
+static void mmc_host_clk_gate_work(struct work_struct *work)
+{
+ struct mmc_host *host = container_of(work, struct mmc_host,
+ clk_gate_work);
+
+ mmc_host_clk_gate_delayed(host);
+}
+
+/**
+ * mmc_host_clk_ungate - ungate hardware MCI clocks
+ * @host: host to ungate.
+ *
+ * Makes sure the host ios.clock is restored to a non-zero value
+ * past this call. Increase clock reference count and ungate clock
+ * if we're the first user.
+ */
+void mmc_host_clk_ungate(struct mmc_host *host)
+{
+ unsigned long flags;
+
+ mutex_lock(&host->clk_gate_mutex);
+ spin_lock_irqsave(&host->clk_lock, flags);
+ if (host->clk_gated) {
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ mmc_ungate_clock(host);
+ spin_lock_irqsave(&host->clk_lock, flags);
+ pr_debug("%s: ungated MCI clock\n", mmc_hostname(host));
+ }
+ host->clk_requests++;
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ mutex_unlock(&host->clk_gate_mutex);
+}
+
+/**
+ * mmc_host_may_gate_card - check if this card may be gated
+ * @card: card to check.
+ */
+static bool mmc_host_may_gate_card(struct mmc_card *card)
+{
+ /* If there is no card we may gate it */
+ if (!card)
+ return true;
+ /*
+ * Don't gate SDIO cards! These need to be clocked at all times
+ * since they may be independent systems generating interrupts
+ * and other events. The clock requests counter from the core will
+ * go down to zero since the core does not need it, but we will not
+ * gate the clock, because there is somebody out there that may still
+ * be using it.
+ */
+ if (mmc_card_sdio(card))
+ return false;
+
+ return true;
+}
+
+/**
+ * mmc_host_clk_gate - gate off hardware MCI clocks
+ * @host: host to gate.
+ *
+ * Calls the host driver with ios.clock set to zero as often as possible
+ * in order to gate off hardware MCI clocks. Decrease clock reference
+ * count and schedule disabling of clock.
+ */
+void mmc_host_clk_gate(struct mmc_host *host)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&host->clk_lock, flags);
+ host->clk_requests--;
+ if (mmc_host_may_gate_card(host->card) &&
+ !host->clk_requests)
+ schedule_work(&host->clk_gate_work);
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+}
+
+/**
+ * mmc_host_clk_rate - get current clock frequency setting
+ * @host: host to get the clock frequency for.
+ *
+ * Returns current clock frequency regardless of gating.
+ */
+unsigned int mmc_host_clk_rate(struct mmc_host *host)
+{
+ unsigned long freq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&host->clk_lock, flags);
+ if (host->clk_gated)
+ freq = host->clk_old;
+ else
+ freq = host->ios.clock;
+ spin_unlock_irqrestore(&host->clk_lock, flags);
+ return freq;
+}
+
+/**
+ * mmc_host_clk_init - set up clock gating code
+ * @host: host with potential clock to control
+ */
+static inline void mmc_host_clk_init(struct mmc_host *host)
+{
+ host->clk_requests = 0;
+ /* Hold MCI clock for 8 cycles by default */
+ host->clk_delay = 8;
+ host->clk_gated = false;
+ INIT_WORK(&host->clk_gate_work, mmc_host_clk_gate_work);
+ spin_lock_init(&host->clk_lock);
+ mutex_init(&host->clk_gate_mutex);
+}
+
+/**
+ * mmc_host_clk_exit - shut down clock gating code
+ * @host: host with potential clock to control
+ */
+static inline void mmc_host_clk_exit(struct mmc_host *host)
+{
+ /*
+ * Wait for any outstanding gate and then make sure we're
+ * ungated before exiting.
+ */
+ if (cancel_work_sync(&host->clk_gate_work))
+ mmc_host_clk_gate_delayed(host);
+ if (host->clk_gated)
+ mmc_host_clk_ungate(host);
+ /* There should be only one user now */
+ WARN_ON(host->clk_requests > 1);
+}
+
+#else
+
+static inline void mmc_host_clk_init(struct mmc_host *host)
+{
+}
+
+static inline void mmc_host_clk_exit(struct mmc_host *host)
+{
+}
+
+#endif
+
/**
* mmc_alloc_host - initialise the per-host structure.
* @extra: sizeof private data structure
@@ -82,6 +283,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
host->class_dev.class = &mmc_host_class;
device_initialize(&host->class_dev);
+ mmc_host_clk_init(host);
+
spin_lock_init(&host->lock);
init_waitqueue_head(&host->wq);
INIT_DELAYED_WORK(&host->detect, mmc_rescan);
@@ -163,6 +366,8 @@ void mmc_remove_host(struct mmc_host *host)
device_del(&host->class_dev);
led_trigger_unregister_simple(host->led);
+
+ mmc_host_clk_exit(host);
}
EXPORT_SYMBOL(mmc_remove_host);
@@ -183,4 +388,3 @@ void mmc_free_host(struct mmc_host *host)
}
EXPORT_SYMBOL(mmc_free_host);
-
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index 8c87e11..de199f9 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -10,10 +10,31 @@
*/
#ifndef _MMC_CORE_HOST_H
#define _MMC_CORE_HOST_H
+#include <linux/mmc/host.h>
int mmc_register_host_class(void);
void mmc_unregister_host_class(void);
+#ifdef CONFIG_MMC_CLKGATE
+void mmc_host_clk_ungate(struct mmc_host *host);
+void mmc_host_clk_gate(struct mmc_host *host);
+unsigned int mmc_host_clk_rate(struct mmc_host *host);
+
+#else
+static inline void mmc_host_clk_ungate(struct mmc_host *host)
+{
+}
+
+static inline void mmc_host_clk_gate(struct mmc_host *host)
+{
+}
+
+static inline unsigned int mmc_host_clk_rate(struct mmc_host *host)
+{
+ return host->ios.clock;
+}
+#endif
+
void mmc_host_deeper_disable(struct work_struct *work);
#endif
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 77f93c3..16006ef 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -534,39 +534,57 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
*/
if ((card->csd.mmca_vsn >= CSD_SPEC_VER_4) &&
(host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) {
- unsigned ext_csd_bit, bus_width;
-
- if (host->caps & MMC_CAP_8_BIT_DATA) {
- if (ddr)
- ext_csd_bit = EXT_CSD_DDR_BUS_WIDTH_8;
- else
- ext_csd_bit = EXT_CSD_BUS_WIDTH_8;
- bus_width = MMC_BUS_WIDTH_8;
- } else {
- if (ddr)
- ext_csd_bit = EXT_CSD_DDR_BUS_WIDTH_4;
- else
- ext_csd_bit = EXT_CSD_BUS_WIDTH_4;
- bus_width = MMC_BUS_WIDTH_4;
+ static unsigned ext_csd_bits[][2] = {
+ { EXT_CSD_BUS_WIDTH_8, EXT_CSD_DDR_BUS_WIDTH_8 },
+ { EXT_CSD_BUS_WIDTH_4, EXT_CSD_DDR_BUS_WIDTH_4 },
+ { EXT_CSD_BUS_WIDTH_1, EXT_CSD_BUS_WIDTH_1 },
+ };
+ static unsigned bus_widths[] = {
+ MMC_BUS_WIDTH_8,
+ MMC_BUS_WIDTH_4,
+ MMC_BUS_WIDTH_1
+ };
+ unsigned idx, bus_width = 0;
+
+ if (host->caps & MMC_CAP_8_BIT_DATA)
+ idx = 0;
+ else
+ idx = 1;
+ for (; idx < ARRAY_SIZE(bus_widths); idx++) {
+ bus_width = bus_widths[idx];
+ if (bus_width == MMC_BUS_WIDTH_1)
+ ddr = 0; /* no DDR for 1-bit width */
+ err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+ EXT_CSD_BUS_WIDTH,
+ ext_csd_bits[idx][0]);
+ if (!err) {
+ mmc_set_bus_width_ddr(card->host,
+ bus_width, MMC_SDR_MODE);
+ /*
+ * If controller can't handle bus width test,
+ * use the highest bus width to maintain
+ * compatibility with previous MMC behavior.
+ */
+ if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
+ break;
+ err = mmc_bus_test(card, bus_width);
+ if (!err)
+ break;
+ }
}
- err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
- EXT_CSD_BUS_WIDTH, ext_csd_bit);
-
- if (err && err != -EBADMSG)
- goto free_card;
-
+ if (!err && ddr) {
+ err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+ EXT_CSD_BUS_WIDTH,
+ ext_csd_bits[idx][1]);
+ }
if (err) {
printk(KERN_WARNING "%s: switch to bus width %d ddr %d "
- "failed\n", mmc_hostname(card->host),
- 1 << bus_width, ddr);
- err = 0;
- } else {
- if (ddr)
- mmc_card_set_ddr_mode(card);
- else
- ddr = MMC_SDR_MODE;
-
+ "failed\n", mmc_hostname(card->host),
+ 1 << bus_width, ddr);
+ goto free_card;
+ } else if (ddr) {
+ mmc_card_set_ddr_mode(card);
mmc_set_bus_width_ddr(card->host, bus_width, ddr);
}
}
@@ -737,14 +755,21 @@ static void mmc_attach_bus_ops(struct mmc_host *host)
/*
* Starting point for MMC card init.
*/
-int mmc_attach_mmc(struct mmc_host *host, u32 ocr)
+int mmc_attach_mmc(struct mmc_host *host)
{
int err;
+ u32 ocr;
BUG_ON(!host);
WARN_ON(!host->claimed);
+ err = mmc_send_op_cond(host, 0, &ocr);
+ if (err)
+ return err;
+
mmc_attach_bus_ops(host);
+ if (host->ocr_avail_mmc)
+ host->ocr_avail = host->ocr_avail_mmc;
/*
* We need to get OCR a different way for SPI.
@@ -784,20 +809,20 @@ int mmc_attach_mmc(struct mmc_host *host, u32 ocr)
goto err;
mmc_release_host(host);
-
err = mmc_add_card(host->card);
+ mmc_claim_host(host);
if (err)
goto remove_card;
return 0;
remove_card:
+ mmc_release_host(host);
mmc_remove_card(host->card);
- host->card = NULL;
mmc_claim_host(host);
+ host->card = NULL;
err:
mmc_detach_bus(host);
- mmc_release_host(host);
printk(KERN_ERR "%s: error %d whilst initialising MMC card\n",
mmc_hostname(host), err);
diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index 326447c..60842f8 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -462,3 +462,104 @@ int mmc_send_status(struct mmc_card *card, u32 *status)
return 0;
}
+static int
+mmc_send_bus_test(struct mmc_card *card, struct mmc_host *host, u8 opcode,
+ u8 len)
+{
+ struct mmc_request mrq;
+ struct mmc_command cmd;
+ struct mmc_data data;
+ struct scatterlist sg;
+ u8 *data_buf;
+ u8 *test_buf;
+ int i, err;
+ static u8 testdata_8bit[8] = { 0x55, 0xaa, 0, 0, 0, 0, 0, 0 };
+ static u8 testdata_4bit[4] = { 0x5a, 0, 0, 0 };
+
+ /* dma onto stack is unsafe/nonportable, but callers to this
+ * routine normally provide temporary on-stack buffers ...
+ */
+ data_buf = kmalloc(len, GFP_KERNEL);
+ if (!data_buf)
+ return -ENOMEM;
+
+ if (len == 8)
+ test_buf = testdata_8bit;
+ else if (len == 4)
+ test_buf = testdata_4bit;
+ else {
+ printk(KERN_ERR "%s: Invalid bus_width %d\n",
+ mmc_hostname(host), len);
+ kfree(data_buf);
+ return -EINVAL;
+ }
+
+ if (opcode == MMC_BUS_TEST_W)
+ memcpy(data_buf, test_buf, len);
+
+ memset(&mrq, 0, sizeof(struct mmc_request));
+ memset(&cmd, 0, sizeof(struct mmc_command));
+ memset(&data, 0, sizeof(struct mmc_data));
+
+ mrq.cmd = &cmd;
+ mrq.data = &data;
+ cmd.opcode = opcode;
+ cmd.arg = 0;
+
+ /* NOTE HACK: the MMC_RSP_SPI_R1 is always correct here, but we
+ * rely on callers to never use this with "native" calls for reading
+ * CSD or CID. Native versions of those commands use the R2 type,
+ * not R1 plus a data block.
+ */
+ cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC;
+
+ data.blksz = len;
+ data.blocks = 1;
+ if (opcode == MMC_BUS_TEST_R)
+ data.flags = MMC_DATA_READ;
+ else
+ data.flags = MMC_DATA_WRITE;
+
+ data.sg = &sg;
+ data.sg_len = 1;
+ sg_init_one(&sg, data_buf, len);
+ mmc_wait_for_req(host, &mrq);
+ err = 0;
+ if (opcode == MMC_BUS_TEST_R) {
+ for (i = 0; i < len / 4; i++)
+ if ((test_buf[i] ^ data_buf[i]) != 0xff) {
+ err = -EIO;
+ break;
+ }
+ }
+ kfree(data_buf);
+
+ if (cmd.error)
+ return cmd.error;
+ if (data.error)
+ return data.error;
+
+ return err;
+}
+
+int mmc_bus_test(struct mmc_card *card, u8 bus_width)
+{
+ int err, width;
+
+ if (bus_width == MMC_BUS_WIDTH_8)
+ width = 8;
+ else if (bus_width == MMC_BUS_WIDTH_4)
+ width = 4;
+ else if (bus_width == MMC_BUS_WIDTH_1)
+ return 0; /* no need for test */
+ else
+ return -EINVAL;
+
+ /*
+ * Ignore errors from BUS_TEST_W. BUS_TEST_R will fail if there
+ * is a problem. This improves chances that the test will work.
+ */
+ mmc_send_bus_test(card, card->host, MMC_BUS_TEST_W, width);
+ err = mmc_send_bus_test(card, card->host, MMC_BUS_TEST_R, width);
+ return err;
+}
diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h
index 653eb8e..e6d44b8 100644
--- a/drivers/mmc/core/mmc_ops.h
+++ b/drivers/mmc/core/mmc_ops.h
@@ -26,6 +26,7 @@ int mmc_send_cid(struct mmc_host *host, u32 *cid);
int mmc_spi_read_ocr(struct mmc_host *host, int highcap, u32 *ocrp);
int mmc_spi_set_crc(struct mmc_host *host, int use_crc);
int mmc_card_sleepawake(struct mmc_host *host, int sleep);
+int mmc_bus_test(struct mmc_card *card, u8 bus_width);
#endif
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 49da4df..d18c32b 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -764,14 +764,21 @@ static void mmc_sd_attach_bus_ops(struct mmc_host *host)
/*
* Starting point for SD card init.
*/
-int mmc_attach_sd(struct mmc_host *host, u32 ocr)
+int mmc_attach_sd(struct mmc_host *host)
{
int err;
+ u32 ocr;
BUG_ON(!host);
WARN_ON(!host->claimed);
+ err = mmc_send_app_op_cond(host, 0, &ocr);
+ if (err)
+ return err;
+
mmc_sd_attach_bus_ops(host);
+ if (host->ocr_avail_sd)
+ host->ocr_avail = host->ocr_avail_sd;
/*
* We need to get OCR a different way for SPI.
@@ -795,7 +802,8 @@ int mmc_attach_sd(struct mmc_host *host, u32 ocr)
ocr &= ~0x7F;
}
- if (ocr & MMC_VDD_165_195) {
+ if ((ocr & MMC_VDD_165_195) &&
+ !(host->ocr_avail_sd & MMC_VDD_165_195)) {
printk(KERN_WARNING "%s: SD card claims to support the "
"incompletely defined 'low voltage range'. This "
"will be ignored.\n", mmc_hostname(host));
@@ -820,20 +828,20 @@ int mmc_attach_sd(struct mmc_host *host, u32 ocr)
goto err;
mmc_release_host(host);
-
err = mmc_add_card(host->card);
+ mmc_claim_host(host);
if (err)
goto remove_card;
return 0;
remove_card:
+ mmc_release_host(host);
mmc_remove_card(host->card);
host->card = NULL;
mmc_claim_host(host);
err:
mmc_detach_bus(host);
- mmc_release_host(host);
printk(KERN_ERR "%s: error %d whilst initialising SD card\n",
mmc_hostname(host), err);
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index efef5f9..5c4a54d 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -627,15 +627,27 @@ static int mmc_sdio_suspend(struct mmc_host *host)
static int mmc_sdio_resume(struct mmc_host *host)
{
- int i, err;
+ int i, err = 0;
BUG_ON(!host);
BUG_ON(!host->card);
/* Basic card reinitialization. */
mmc_claim_host(host);
- err = mmc_sdio_init_card(host, host->ocr, host->card,
+
+ /* No need to reinitialize powered-resumed nonremovable cards */
+ if (mmc_card_is_removable(host) || !mmc_card_is_powered_resumed(host))
+ err = mmc_sdio_init_card(host, host->ocr, host->card,
(host->pm_flags & MMC_PM_KEEP_POWER));
+ else if (mmc_card_is_powered_resumed(host)) {
+ /* We may have switched to 1-bit mode during suspend */
+ err = sdio_enable_4bit_bus(host->card);
+ if (err > 0) {
+ mmc_set_bus_width(host, MMC_BUS_WIDTH_4);
+ err = 0;
+ }
+ }
+
if (!err && host->sdio_irqs)
mmc_signal_sdio_irq(host);
mmc_release_host(host);
@@ -690,16 +702,22 @@ static const struct mmc_bus_ops mmc_sdio_ops = {
/*
* Starting point for SDIO card init.
*/
-int mmc_attach_sdio(struct mmc_host *host, u32 ocr)
+int mmc_attach_sdio(struct mmc_host *host)
{
- int err;
- int i, funcs;
+ int err, i, funcs;
+ u32 ocr;
struct mmc_card *card;
BUG_ON(!host);
WARN_ON(!host->claimed);
+ err = mmc_send_io_op_cond(host, 0, &ocr);
+ if (err)
+ return err;
+
mmc_attach_bus(host, &mmc_sdio_ops);
+ if (host->ocr_avail_sdio)
+ host->ocr_avail = host->ocr_avail_sdio;
/*
* Sanity check the voltages that the card claims to
@@ -769,12 +787,12 @@ int mmc_attach_sdio(struct mmc_host *host, u32 ocr)
pm_runtime_enable(&card->sdio_func[i]->dev);
}
- mmc_release_host(host);
-
/*
* First add the card to the driver model...
*/
+ mmc_release_host(host);
err = mmc_add_card(host->card);
+ mmc_claim_host(host);
if (err)
goto remove_added;
@@ -792,15 +810,17 @@ int mmc_attach_sdio(struct mmc_host *host, u32 ocr)
remove_added:
/* Remove without lock if the device has been added. */
+ mmc_release_host(host);
mmc_sdio_remove(host);
mmc_claim_host(host);
remove:
/* And with lock if it hasn't been added. */
+ mmc_release_host(host);
if (host->card)
mmc_sdio_remove(host);
+ mmc_claim_host(host);
err:
mmc_detach_bus(host);
- mmc_release_host(host);
printk(KERN_ERR "%s: error %d whilst initialising SDIO card\n",
mmc_hostname(host), err);
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 203da44..d29b9c3 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -197,44 +197,12 @@ out:
#ifdef CONFIG_PM_RUNTIME
-static int sdio_bus_pm_prepare(struct device *dev)
-{
- struct sdio_func *func = dev_to_sdio_func(dev);
-
- /*
- * Resume an SDIO device which was suspended at run time at this
- * point, in order to allow standard SDIO suspend/resume paths
- * to keep working as usual.
- *
- * Ultimately, the SDIO driver itself will decide (in its
- * suspend handler, or lack thereof) whether the card should be
- * removed or kept, and if kept, at what power state.
- *
- * At this point, PM core have increased our use count, so it's
- * safe to directly resume the device. After system is resumed
- * again, PM core will drop back its runtime PM use count, and if
- * needed device will be suspended again.
- *
- * The end result is guaranteed to be a power state that is
- * coherent with the device's runtime PM use count.
- *
- * The return value of pm_runtime_resume is deliberately unchecked
- * since there is little point in failing system suspend if a
- * device can't be resumed.
- */
- if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD)
- pm_runtime_resume(dev);
-
- return 0;
-}
-
static const struct dev_pm_ops sdio_bus_pm_ops = {
SET_RUNTIME_PM_OPS(
pm_generic_runtime_suspend,
pm_generic_runtime_resume,
pm_generic_runtime_idle
)
- .prepare = sdio_bus_pm_prepare,
};
#define SDIO_PM_OPS_PTR (&sdio_bus_pm_ops)
diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index e960a93..c22a4c0 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -142,6 +142,27 @@ config MMC_SDHCI_ESDHC_IMX
If unsure, say N.
+config MMC_SDHCI_DOVE
+ bool "SDHCI support on Marvell's Dove SoC"
+ depends on ARCH_DOVE
+ depends on MMC_SDHCI_PLTFM
+ select MMC_SDHCI_IO_ACCESSORS
+ help
+ This selects the Secure Digital Host Controller Interface in
+ Marvell's Dove SoC.
+
+ If unsure, say N.
+
+config MMC_SDHCI_TEGRA
+ tristate "SDHCI platform support for the Tegra SD/MMC Controller"
+ depends on MMC_SDHCI_PLTFM && ARCH_TEGRA
+ select MMC_SDHCI_IO_ACCESSORS
+ help
+ This selects the Tegra SD/MMC controller. If you have a Tegra
+ platform with SD or MMC devices, say Y or M here.
+
+ If unsure, say N.
+
config MMC_SDHCI_S3C
tristate "SDHCI support on Samsung S3C SoC"
depends on MMC_SDHCI && PLAT_SAMSUNG
@@ -460,6 +481,22 @@ config SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND
help
If you say yes here SD-Cards may work on the EZkit.
+config MMC_DW
+ tristate "Synopsys DesignWare Memory Card Interface"
+ depends on ARM
+ help
+ This selects support for the Synopsys DesignWare Mobile Storage IP
+ block, this provides host support for SD and MMC interfaces, in both
+ PIO and external DMA modes.
+
+config MMC_DW_IDMAC
+ bool "Internal DMAC interface"
+ depends on MMC_DW
+ help
+ This selects support for the internal DMAC block within the Synopsys
+ Designware Mobile Storage IP block. This disables the external DMA
+ interface.
+
config MMC_SH_MMCIF
tristate "SuperH Internal MMCIF support"
depends on MMC_BLOCK && (SUPERH || ARCH_SHMOBILE)
diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile
index 7b645ff..e834fb2 100644
--- a/drivers/mmc/host/Makefile
+++ b/drivers/mmc/host/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_MMC_TMIO) += tmio_mmc.o
obj-$(CONFIG_MMC_CB710) += cb710-mmc.o
obj-$(CONFIG_MMC_VIA_SDMMC) += via-sdmmc.o
obj-$(CONFIG_SDH_BFIN) += bfin_sdh.o
+obj-$(CONFIG_MMC_DW) += dw_mmc.o
obj-$(CONFIG_MMC_SH_MMCIF) += sh_mmcif.o
obj-$(CONFIG_MMC_JZ4740) += jz4740_mmc.o
obj-$(CONFIG_MMC_USHC) += ushc.o
@@ -39,6 +40,8 @@ obj-$(CONFIG_MMC_SDHCI_PLTFM) += sdhci-platform.o
sdhci-platform-y := sdhci-pltfm.o
sdhci-platform-$(CONFIG_MMC_SDHCI_CNS3XXX) += sdhci-cns3xxx.o
sdhci-platform-$(CONFIG_MMC_SDHCI_ESDHC_IMX) += sdhci-esdhc-imx.o
+sdhci-platform-$(CONFIG_MMC_SDHCI_DOVE) += sdhci-dove.o
+sdhci-platform-$(CONFIG_MMC_SDHCI_TEGRA) += sdhci-tegra.o
obj-$(CONFIG_MMC_SDHCI_OF) += sdhci-of.o
sdhci-of-y := sdhci-of-core.o
diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c
index e15547c..0076c74 100644
--- a/drivers/mmc/host/davinci_mmc.c
+++ b/drivers/mmc/host/davinci_mmc.c
@@ -66,8 +66,8 @@
#define DAVINCI_MMCBLNC 0x60
#define DAVINCI_SDIOCTL 0x64
#define DAVINCI_SDIOST0 0x68
-#define DAVINCI_SDIOEN 0x6C
-#define DAVINCI_SDIOST 0x70
+#define DAVINCI_SDIOIEN 0x6C
+#define DAVINCI_SDIOIST 0x70
#define DAVINCI_MMCFIFOCTL 0x74 /* FIFO Control Register */
/* DAVINCI_MMCCTL definitions */
@@ -131,6 +131,14 @@
#define MMCFIFOCTL_ACCWD_2 (2 << 3) /* access width of 2 bytes */
#define MMCFIFOCTL_ACCWD_1 (3 << 3) /* access width of 1 byte */
+/* DAVINCI_SDIOST0 definitions */
+#define SDIOST0_DAT1_HI BIT(0)
+
+/* DAVINCI_SDIOIEN definitions */
+#define SDIOIEN_IOINTEN BIT(0)
+
+/* DAVINCI_SDIOIST definitions */
+#define SDIOIST_IOINT BIT(0)
/* MMCSD Init clock in Hz in opendrain mode */
#define MMCSD_INIT_CLOCK 200000
@@ -164,7 +172,7 @@ struct mmc_davinci_host {
unsigned int mmc_input_clk;
void __iomem *base;
struct resource *mem_res;
- int irq;
+ int mmc_irq, sdio_irq;
unsigned char bus_mode;
#define DAVINCI_MMC_DATADIR_NONE 0
@@ -184,6 +192,7 @@ struct mmc_davinci_host {
u32 rxdma, txdma;
bool use_dma;
bool do_dma;
+ bool sdio_int;
/* Scatterlist DMA uses one or more parameter RAM entries:
* the main one (associated with rxdma or txdma) plus zero or
@@ -480,7 +489,7 @@ static void mmc_davinci_send_dma_request(struct mmc_davinci_host *host,
struct scatterlist *sg;
unsigned sg_len;
unsigned bytes_left = host->bytes_left;
- const unsigned shift = ffs(rw_threshold) - 1;;
+ const unsigned shift = ffs(rw_threshold) - 1;
if (host->data_dir == DAVINCI_MMC_DATADIR_WRITE) {
template = &host->tx_template;
@@ -866,6 +875,19 @@ mmc_davinci_xfer_done(struct mmc_davinci_host *host, struct mmc_data *data)
{
host->data = NULL;
+ if (host->mmc->caps & MMC_CAP_SDIO_IRQ) {
+ /*
+ * SDIO Interrupt Detection work-around as suggested by
+ * Davinci Errata (TMS320DM355 Silicon Revision 1.1 Errata
+ * 2.1.6): Signal SDIO interrupt only if it is enabled by core
+ */
+ if (host->sdio_int && !(readl(host->base + DAVINCI_SDIOST0) &
+ SDIOST0_DAT1_HI)) {
+ writel(SDIOIST_IOINT, host->base + DAVINCI_SDIOIST);
+ mmc_signal_sdio_irq(host->mmc);
+ }
+ }
+
if (host->do_dma) {
davinci_abort_dma(host);
@@ -932,6 +954,21 @@ davinci_abort_data(struct mmc_davinci_host *host, struct mmc_data *data)
mmc_davinci_reset_ctrl(host, 0);
}
+static irqreturn_t mmc_davinci_sdio_irq(int irq, void *dev_id)
+{
+ struct mmc_davinci_host *host = dev_id;
+ unsigned int status;
+
+ status = readl(host->base + DAVINCI_SDIOIST);
+ if (status & SDIOIST_IOINT) {
+ dev_dbg(mmc_dev(host->mmc),
+ "SDIO interrupt status %x\n", status);
+ writel(status | SDIOIST_IOINT, host->base + DAVINCI_SDIOIST);
+ mmc_signal_sdio_irq(host->mmc);
+ }
+ return IRQ_HANDLED;
+}
+
static irqreturn_t mmc_davinci_irq(int irq, void *dev_id)
{
struct mmc_davinci_host *host = (struct mmc_davinci_host *)dev_id;
@@ -1076,11 +1113,32 @@ static int mmc_davinci_get_ro(struct mmc_host *mmc)
return config->get_ro(pdev->id);
}
+static void mmc_davinci_enable_sdio_irq(struct mmc_host *mmc, int enable)
+{
+ struct mmc_davinci_host *host = mmc_priv(mmc);
+
+ if (enable) {
+ if (!(readl(host->base + DAVINCI_SDIOST0) & SDIOST0_DAT1_HI)) {
+ writel(SDIOIST_IOINT, host->base + DAVINCI_SDIOIST);
+ mmc_signal_sdio_irq(host->mmc);
+ } else {
+ host->sdio_int = true;
+ writel(readl(host->base + DAVINCI_SDIOIEN) |
+ SDIOIEN_IOINTEN, host->base + DAVINCI_SDIOIEN);
+ }
+ } else {
+ host->sdio_int = false;
+ writel(readl(host->base + DAVINCI_SDIOIEN) & ~SDIOIEN_IOINTEN,
+ host->base + DAVINCI_SDIOIEN);
+ }
+}
+
static struct mmc_host_ops mmc_davinci_ops = {
.request = mmc_davinci_request,
.set_ios = mmc_davinci_set_ios,
.get_cd = mmc_davinci_get_cd,
.get_ro = mmc_davinci_get_ro,
+ .enable_sdio_irq = mmc_davinci_enable_sdio_irq,
};
/*----------------------------------------------------------------------*/
@@ -1209,7 +1267,8 @@ static int __init davinci_mmcsd_probe(struct platform_device *pdev)
host->nr_sg = MAX_NR_SG;
host->use_dma = use_dma;
- host->irq = irq;
+ host->mmc_irq = irq;
+ host->sdio_irq = platform_get_irq(pdev, 1);
if (host->use_dma && davinci_acquire_dma_channels(host) != 0)
host->use_dma = 0;
@@ -1270,6 +1329,13 @@ static int __init davinci_mmcsd_probe(struct platform_device *pdev)
if (ret)
goto out;
+ if (host->sdio_irq >= 0) {
+ ret = request_irq(host->sdio_irq, mmc_davinci_sdio_irq, 0,
+ mmc_hostname(mmc), host);
+ if (!ret)
+ mmc->caps |= MMC_CAP_SDIO_IRQ;
+ }
+
rename_region(mem, mmc_hostname(mmc));
dev_info(mmc_dev(host->mmc), "Using %s, %d-bit mode\n",
@@ -1313,7 +1379,9 @@ static int __exit davinci_mmcsd_remove(struct platform_device *pdev)
mmc_davinci_cpufreq_deregister(host);
mmc_remove_host(host->mmc);
- free_irq(host->irq, host);
+ free_irq(host->mmc_irq, host);
+ if (host->mmc->caps & MMC_CAP_SDIO_IRQ)
+ free_irq(host->sdio_irq, host);
davinci_release_dma_channels(host);
diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
new file mode 100644
index 0000000..2fcc825
--- /dev/null
+++ b/drivers/mmc/host/dw_mmc.c
@@ -0,0 +1,1796 @@
+/*
+ * Synopsys DesignWare Multimedia Card Interface driver
+ * (Based on NXP driver for lpc 31xx)
+ *
+ * Copyright (C) 2009 NXP Semiconductors
+ * Copyright (C) 2009, 2010 Imagination Technologies Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/clk.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/scatterlist.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/mmc/host.h>
+#include <linux/mmc/mmc.h>
+#include <linux/mmc/dw_mmc.h>
+#include <linux/bitops.h>
+
+#include "dw_mmc.h"
+
+/* Common flag combinations */
+#define DW_MCI_DATA_ERROR_FLAGS (SDMMC_INT_DTO | SDMMC_INT_DCRC | \
+ SDMMC_INT_HTO | SDMMC_INT_SBE | \
+ SDMMC_INT_EBE)
+#define DW_MCI_CMD_ERROR_FLAGS (SDMMC_INT_RTO | SDMMC_INT_RCRC | \
+ SDMMC_INT_RESP_ERR)
+#define DW_MCI_ERROR_FLAGS (DW_MCI_DATA_ERROR_FLAGS | \
+ DW_MCI_CMD_ERROR_FLAGS | SDMMC_INT_HLE)
+#define DW_MCI_SEND_STATUS 1
+#define DW_MCI_RECV_STATUS 2
+#define DW_MCI_DMA_THRESHOLD 16
+
+#ifdef CONFIG_MMC_DW_IDMAC
+struct idmac_desc {
+ u32 des0; /* Control Descriptor */
+#define IDMAC_DES0_DIC BIT(1)
+#define IDMAC_DES0_LD BIT(2)
+#define IDMAC_DES0_FD BIT(3)
+#define IDMAC_DES0_CH BIT(4)
+#define IDMAC_DES0_ER BIT(5)
+#define IDMAC_DES0_CES BIT(30)
+#define IDMAC_DES0_OWN BIT(31)
+
+ u32 des1; /* Buffer sizes */
+#define IDMAC_SET_BUFFER1_SIZE(d, s) \
+ ((d)->des1 = ((d)->des1 & 0x03ffc000) | ((s) & 0x3fff))
+
+ u32 des2; /* buffer 1 physical address */
+
+ u32 des3; /* buffer 2 physical address */
+};
+#endif /* CONFIG_MMC_DW_IDMAC */
+
+/**
+ * struct dw_mci_slot - MMC slot state
+ * @mmc: The mmc_host representing this slot.
+ * @host: The MMC controller this slot is using.
+ * @ctype: Card type for this slot.
+ * @mrq: mmc_request currently being processed or waiting to be
+ * processed, or NULL when the slot is idle.
+ * @queue_node: List node for placing this node in the @queue list of
+ * &struct dw_mci.
+ * @clock: Clock rate configured by set_ios(). Protected by host->lock.
+ * @flags: Random state bits associated with the slot.
+ * @id: Number of this slot.
+ * @last_detect_state: Most recently observed card detect state.
+ */
+struct dw_mci_slot {
+ struct mmc_host *mmc;
+ struct dw_mci *host;
+
+ u32 ctype;
+
+ struct mmc_request *mrq;
+ struct list_head queue_node;
+
+ unsigned int clock;
+ unsigned long flags;
+#define DW_MMC_CARD_PRESENT 0
+#define DW_MMC_CARD_NEED_INIT 1
+ int id;
+ int last_detect_state;
+};
+
+#if defined(CONFIG_DEBUG_FS)
+static int dw_mci_req_show(struct seq_file *s, void *v)
+{
+ struct dw_mci_slot *slot = s->private;
+ struct mmc_request *mrq;
+ struct mmc_command *cmd;
+ struct mmc_command *stop;
+ struct mmc_data *data;
+
+ /* Make sure we get a consistent snapshot */
+ spin_lock_bh(&slot->host->lock);
+ mrq = slot->mrq;
+
+ if (mrq) {
+ cmd = mrq->cmd;
+ data = mrq->data;
+ stop = mrq->stop;
+
+ if (cmd)
+ seq_printf(s,
+ "CMD%u(0x%x) flg %x rsp %x %x %x %x err %d\n",
+ cmd->opcode, cmd->arg, cmd->flags,
+ cmd->resp[0], cmd->resp[1], cmd->resp[2],
+ cmd->resp[2], cmd->error);
+ if (data)
+ seq_printf(s, "DATA %u / %u * %u flg %x err %d\n",
+ data->bytes_xfered, data->blocks,
+ data->blksz, data->flags, data->error);
+ if (stop)
+ seq_printf(s,
+ "CMD%u(0x%x) flg %x rsp %x %x %x %x err %d\n",
+ stop->opcode, stop->arg, stop->flags,
+ stop->resp[0], stop->resp[1], stop->resp[2],
+ stop->resp[2], stop->error);
+ }
+
+ spin_unlock_bh(&slot->host->lock);
+
+ return 0;
+}
+
+static int dw_mci_req_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dw_mci_req_show, inode->i_private);
+}
+
+static const struct file_operations dw_mci_req_fops = {
+ .owner = THIS_MODULE,
+ .open = dw_mci_req_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dw_mci_regs_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "STATUS:\t0x%08x\n", SDMMC_STATUS);
+ seq_printf(s, "RINTSTS:\t0x%08x\n", SDMMC_RINTSTS);
+ seq_printf(s, "CMD:\t0x%08x\n", SDMMC_CMD);
+ seq_printf(s, "CTRL:\t0x%08x\n", SDMMC_CTRL);
+ seq_printf(s, "INTMASK:\t0x%08x\n", SDMMC_INTMASK);
+ seq_printf(s, "CLKENA:\t0x%08x\n", SDMMC_CLKENA);
+
+ return 0;
+}
+
+static int dw_mci_regs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dw_mci_regs_show, inode->i_private);
+}
+
+static const struct file_operations dw_mci_regs_fops = {
+ .owner = THIS_MODULE,
+ .open = dw_mci_regs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void dw_mci_init_debugfs(struct dw_mci_slot *slot)
+{
+ struct mmc_host *mmc = slot->mmc;
+ struct dw_mci *host = slot->host;
+ struct dentry *root;
+ struct dentry *node;
+
+ root = mmc->debugfs_root;
+ if (!root)
+ return;
+
+ node = debugfs_create_file("regs", S_IRUSR, root, host,
+ &dw_mci_regs_fops);
+ if (!node)
+ goto err;
+
+ node = debugfs_create_file("req", S_IRUSR, root, slot,
+ &dw_mci_req_fops);
+ if (!node)
+ goto err;
+
+ node = debugfs_create_u32("state", S_IRUSR, root, (u32 *)&host->state);
+ if (!node)
+ goto err;
+
+ node = debugfs_create_x32("pending_events", S_IRUSR, root,
+ (u32 *)&host->pending_events);
+ if (!node)
+ goto err;
+
+ node = debugfs_create_x32("completed_events", S_IRUSR, root,
+ (u32 *)&host->completed_events);
+ if (!node)
+ goto err;
+
+ return;
+
+err:
+ dev_err(&mmc->class_dev, "failed to initialize debugfs for slot\n");
+}
+#endif /* defined(CONFIG_DEBUG_FS) */
+
+static void dw_mci_set_timeout(struct dw_mci *host)
+{
+ /* timeout (maximum) */
+ mci_writel(host, TMOUT, 0xffffffff);
+}
+
+static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd)
+{
+ struct mmc_data *data;
+ u32 cmdr;
+ cmd->error = -EINPROGRESS;
+
+ cmdr = cmd->opcode;
+
+ if (cmdr == MMC_STOP_TRANSMISSION)
+ cmdr |= SDMMC_CMD_STOP;
+ else
+ cmdr |= SDMMC_CMD_PRV_DAT_WAIT;
+
+ if (cmd->flags & MMC_RSP_PRESENT) {
+ /* We expect a response, so set this bit */
+ cmdr |= SDMMC_CMD_RESP_EXP;
+ if (cmd->flags & MMC_RSP_136)
+ cmdr |= SDMMC_CMD_RESP_LONG;
+ }
+
+ if (cmd->flags & MMC_RSP_CRC)
+ cmdr |= SDMMC_CMD_RESP_CRC;
+
+ data = cmd->data;
+ if (data) {
+ cmdr |= SDMMC_CMD_DAT_EXP;
+ if (data->flags & MMC_DATA_STREAM)
+ cmdr |= SDMMC_CMD_STRM_MODE;
+ if (data->flags & MMC_DATA_WRITE)
+ cmdr |= SDMMC_CMD_DAT_WR;
+ }
+
+ return cmdr;
+}
+
+static void dw_mci_start_command(struct dw_mci *host,
+ struct mmc_command *cmd, u32 cmd_flags)
+{
+ host->cmd = cmd;
+ dev_vdbg(&host->pdev->dev,
+ "start command: ARGR=0x%08x CMDR=0x%08x\n",
+ cmd->arg, cmd_flags);
+
+ mci_writel(host, CMDARG, cmd->arg);
+ wmb();
+
+ mci_writel(host, CMD, cmd_flags | SDMMC_CMD_START);
+}
+
+static void send_stop_cmd(struct dw_mci *host, struct mmc_data *data)
+{
+ dw_mci_start_command(host, data->stop, host->stop_cmdr);
+}
+
+/* DMA interface functions */
+static void dw_mci_stop_dma(struct dw_mci *host)
+{
+ if (host->use_dma) {
+ host->dma_ops->stop(host);
+ host->dma_ops->cleanup(host);
+ } else {
+ /* Data transfer was stopped by the interrupt handler */
+ set_bit(EVENT_XFER_COMPLETE, &host->pending_events);
+ }
+}
+
+#ifdef CONFIG_MMC_DW_IDMAC
+static void dw_mci_dma_cleanup(struct dw_mci *host)
+{
+ struct mmc_data *data = host->data;
+
+ if (data)
+ dma_unmap_sg(&host->pdev->dev, data->sg, data->sg_len,
+ ((data->flags & MMC_DATA_WRITE)
+ ? DMA_TO_DEVICE : DMA_FROM_DEVICE));
+}
+
+static void dw_mci_idmac_stop_dma(struct dw_mci *host)
+{
+ u32 temp;
+
+ /* Disable and reset the IDMAC interface */
+ temp = mci_readl(host, CTRL);
+ temp &= ~SDMMC_CTRL_USE_IDMAC;
+ temp |= SDMMC_CTRL_DMA_RESET;
+ mci_writel(host, CTRL, temp);
+
+ /* Stop the IDMAC running */
+ temp = mci_readl(host, BMOD);
+ temp &= ~SDMMC_IDMAC_ENABLE;
+ mci_writel(host, BMOD, temp);
+}
+
+static void dw_mci_idmac_complete_dma(struct dw_mci *host)
+{
+ struct mmc_data *data = host->data;
+
+ dev_vdbg(&host->pdev->dev, "DMA complete\n");
+
+ host->dma_ops->cleanup(host);
+
+ /*
+ * If the card was removed, data will be NULL. No point in trying to
+ * send the stop command or waiting for NBUSY in this case.
+ */
+ if (data) {
+ set_bit(EVENT_XFER_COMPLETE, &host->pending_events);
+ tasklet_schedule(&host->tasklet);
+ }
+}
+
+static void dw_mci_translate_sglist(struct dw_mci *host, struct mmc_data *data,
+ unsigned int sg_len)
+{
+ int i;
+ struct idmac_desc *desc = host->sg_cpu;
+
+ for (i = 0; i < sg_len; i++, desc++) {
+ unsigned int length = sg_dma_len(&data->sg[i]);
+ u32 mem_addr = sg_dma_address(&data->sg[i]);
+
+ /* Set the OWN bit and disable interrupts for this descriptor */
+ desc->des0 = IDMAC_DES0_OWN | IDMAC_DES0_DIC | IDMAC_DES0_CH;
+
+ /* Buffer length */
+ IDMAC_SET_BUFFER1_SIZE(desc, length);
+
+ /* Physical address to DMA to/from */
+ desc->des2 = mem_addr;
+ }
+
+ /* Set first descriptor */
+ desc = host->sg_cpu;
+ desc->des0 |= IDMAC_DES0_FD;
+
+ /* Set last descriptor */
+ desc = host->sg_cpu + (i - 1) * sizeof(struct idmac_desc);
+ desc->des0 &= ~(IDMAC_DES0_CH | IDMAC_DES0_DIC);
+ desc->des0 |= IDMAC_DES0_LD;
+
+ wmb();
+}
+
+static void dw_mci_idmac_start_dma(struct dw_mci *host, unsigned int sg_len)
+{
+ u32 temp;
+
+ dw_mci_translate_sglist(host, host->data, sg_len);
+
+ /* Select IDMAC interface */
+ temp = mci_readl(host, CTRL);
+ temp |= SDMMC_CTRL_USE_IDMAC;
+ mci_writel(host, CTRL, temp);
+
+ wmb();
+
+ /* Enable the IDMAC */
+ temp = mci_readl(host, BMOD);
+ temp |= SDMMC_IDMAC_ENABLE;
+ mci_writel(host, BMOD, temp);
+
+ /* Start it running */
+ mci_writel(host, PLDMND, 1);
+}
+
+static int dw_mci_idmac_init(struct dw_mci *host)
+{
+ struct idmac_desc *p;
+ int i;
+
+ /* Number of descriptors in the ring buffer */
+ host->ring_size = PAGE_SIZE / sizeof(struct idmac_desc);
+
+ /* Forward link the descriptor list */
+ for (i = 0, p = host->sg_cpu; i < host->ring_size - 1; i++, p++)
+ p->des3 = host->sg_dma + (sizeof(struct idmac_desc) * (i + 1));
+
+ /* Set the last descriptor as the end-of-ring descriptor */
+ p->des3 = host->sg_dma;
+ p->des0 = IDMAC_DES0_ER;
+
+ /* Mask out interrupts - get Tx & Rx complete only */
+ mci_writel(host, IDINTEN, SDMMC_IDMAC_INT_NI | SDMMC_IDMAC_INT_RI |
+ SDMMC_IDMAC_INT_TI);
+
+ /* Set the descriptor base address */
+ mci_writel(host, DBADDR, host->sg_dma);
+ return 0;
+}
+
+static struct dw_mci_dma_ops dw_mci_idmac_ops = {
+ .init = dw_mci_idmac_init,
+ .start = dw_mci_idmac_start_dma,
+ .stop = dw_mci_idmac_stop_dma,
+ .complete = dw_mci_idmac_complete_dma,
+ .cleanup = dw_mci_dma_cleanup,
+};
+#endif /* CONFIG_MMC_DW_IDMAC */
+
+static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data)
+{
+ struct scatterlist *sg;
+ unsigned int i, direction, sg_len;
+ u32 temp;
+
+ /* If we don't have a channel, we can't do DMA */
+ if (!host->use_dma)
+ return -ENODEV;
+
+ /*
+ * We don't do DMA on "complex" transfers, i.e. with
+ * non-word-aligned buffers or lengths. Also, we don't bother
+ * with all the DMA setup overhead for short transfers.
+ */
+ if (data->blocks * data->blksz < DW_MCI_DMA_THRESHOLD)
+ return -EINVAL;
+ if (data->blksz & 3)
+ return -EINVAL;
+
+ for_each_sg(data->sg, sg, data->sg_len, i) {
+ if (sg->offset & 3 || sg->length & 3)
+ return -EINVAL;
+ }
+
+ if (data->flags & MMC_DATA_READ)
+ direction = DMA_FROM_DEVICE;
+ else
+ direction = DMA_TO_DEVICE;
+
+ sg_len = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len,
+ direction);
+
+ dev_vdbg(&host->pdev->dev,
+ "sd sg_cpu: %#lx sg_dma: %#lx sg_len: %d\n",
+ (unsigned long)host->sg_cpu, (unsigned long)host->sg_dma,
+ sg_len);
+
+ /* Enable the DMA interface */
+ temp = mci_readl(host, CTRL);
+ temp |= SDMMC_CTRL_DMA_ENABLE;
+ mci_writel(host, CTRL, temp);
+
+ /* Disable RX/TX IRQs, let DMA handle it */
+ temp = mci_readl(host, INTMASK);
+ temp &= ~(SDMMC_INT_RXDR | SDMMC_INT_TXDR);
+ mci_writel(host, INTMASK, temp);
+
+ host->dma_ops->start(host, sg_len);
+
+ return 0;
+}
+
+static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data)
+{
+ u32 temp;
+
+ data->error = -EINPROGRESS;
+
+ WARN_ON(host->data);
+ host->sg = NULL;
+ host->data = data;
+
+ if (dw_mci_submit_data_dma(host, data)) {
+ host->sg = data->sg;
+ host->pio_offset = 0;
+ if (data->flags & MMC_DATA_READ)
+ host->dir_status = DW_MCI_RECV_STATUS;
+ else
+ host->dir_status = DW_MCI_SEND_STATUS;
+
+ temp = mci_readl(host, INTMASK);
+ temp |= SDMMC_INT_TXDR | SDMMC_INT_RXDR;
+ mci_writel(host, INTMASK, temp);
+
+ temp = mci_readl(host, CTRL);
+ temp &= ~SDMMC_CTRL_DMA_ENABLE;
+ mci_writel(host, CTRL, temp);
+ }
+}
+
+static void mci_send_cmd(struct dw_mci_slot *slot, u32 cmd, u32 arg)
+{
+ struct dw_mci *host = slot->host;
+ unsigned long timeout = jiffies + msecs_to_jiffies(500);
+ unsigned int cmd_status = 0;
+
+ mci_writel(host, CMDARG, arg);
+ wmb();
+ mci_writel(host, CMD, SDMMC_CMD_START | cmd);
+
+ while (time_before(jiffies, timeout)) {
+ cmd_status = mci_readl(host, CMD);
+ if (!(cmd_status & SDMMC_CMD_START))
+ return;
+ }
+ dev_err(&slot->mmc->class_dev,
+ "Timeout sending command (cmd %#x arg %#x status %#x)\n",
+ cmd, arg, cmd_status);
+}
+
+static void dw_mci_setup_bus(struct dw_mci_slot *slot)
+{
+ struct dw_mci *host = slot->host;
+ u32 div;
+
+ if (slot->clock != host->current_speed) {
+ if (host->bus_hz % slot->clock)
+ /*
+ * move the + 1 after the divide to prevent
+ * over-clocking the card.
+ */
+ div = ((host->bus_hz / slot->clock) >> 1) + 1;
+ else
+ div = (host->bus_hz / slot->clock) >> 1;
+
+ dev_info(&slot->mmc->class_dev,
+ "Bus speed (slot %d) = %dHz (slot req %dHz, actual %dHZ"
+ " div = %d)\n", slot->id, host->bus_hz, slot->clock,
+ div ? ((host->bus_hz / div) >> 1) : host->bus_hz, div);
+
+ /* disable clock */
+ mci_writel(host, CLKENA, 0);
+ mci_writel(host, CLKSRC, 0);
+
+ /* inform CIU */
+ mci_send_cmd(slot,
+ SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0);
+
+ /* set clock to desired speed */
+ mci_writel(host, CLKDIV, div);
+
+ /* inform CIU */
+ mci_send_cmd(slot,
+ SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0);
+
+ /* enable clock */
+ mci_writel(host, CLKENA, SDMMC_CLKEN_ENABLE);
+
+ /* inform CIU */
+ mci_send_cmd(slot,
+ SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0);
+
+ host->current_speed = slot->clock;
+ }
+
+ /* Set the current slot bus width */
+ mci_writel(host, CTYPE, slot->ctype);
+}
+
+static void dw_mci_start_request(struct dw_mci *host,
+ struct dw_mci_slot *slot)
+{
+ struct mmc_request *mrq;
+ struct mmc_command *cmd;
+ struct mmc_data *data;
+ u32 cmdflags;
+
+ mrq = slot->mrq;
+ if (host->pdata->select_slot)
+ host->pdata->select_slot(slot->id);
+
+ /* Slot specific timing and width adjustment */
+ dw_mci_setup_bus(slot);
+
+ host->cur_slot = slot;
+ host->mrq = mrq;
+
+ host->pending_events = 0;
+ host->completed_events = 0;
+ host->data_status = 0;
+
+ data = mrq->data;
+ if (data) {
+ dw_mci_set_timeout(host);
+ mci_writel(host, BYTCNT, data->blksz*data->blocks);
+ mci_writel(host, BLKSIZ, data->blksz);
+ }
+
+ cmd = mrq->cmd;
+ cmdflags = dw_mci_prepare_command(slot->mmc, cmd);
+
+ /* this is the first command, send the initialization clock */
+ if (test_and_clear_bit(DW_MMC_CARD_NEED_INIT, &slot->flags))
+ cmdflags |= SDMMC_CMD_INIT;
+
+ if (data) {
+ dw_mci_submit_data(host, data);
+ wmb();
+ }
+
+ dw_mci_start_command(host, cmd, cmdflags);
+
+ if (mrq->stop)
+ host->stop_cmdr = dw_mci_prepare_command(slot->mmc, mrq->stop);
+}
+
+static void dw_mci_queue_request(struct dw_mci *host, struct dw_mci_slot *slot,
+ struct mmc_request *mrq)
+{
+ dev_vdbg(&slot->mmc->class_dev, "queue request: state=%d\n",
+ host->state);
+
+ spin_lock_bh(&host->lock);
+ slot->mrq = mrq;
+
+ if (host->state == STATE_IDLE) {
+ host->state = STATE_SENDING_CMD;
+ dw_mci_start_request(host, slot);
+ } else {
+ list_add_tail(&slot->queue_node, &host->queue);
+ }
+
+ spin_unlock_bh(&host->lock);
+}
+
+static void dw_mci_request(struct mmc_host *mmc, struct mmc_request *mrq)
+{
+ struct dw_mci_slot *slot = mmc_priv(mmc);
+ struct dw_mci *host = slot->host;
+
+ WARN_ON(slot->mrq);
+
+ if (!test_bit(DW_MMC_CARD_PRESENT, &slot->flags)) {
+ mrq->cmd->error = -ENOMEDIUM;
+ mmc_request_done(mmc, mrq);
+ return;
+ }
+
+ /* We don't support multiple blocks of weird lengths. */
+ dw_mci_queue_request(host, slot, mrq);
+}
+
+static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
+{
+ struct dw_mci_slot *slot = mmc_priv(mmc);
+
+ /* set default 1 bit mode */
+ slot->ctype = SDMMC_CTYPE_1BIT;
+
+ switch (ios->bus_width) {
+ case MMC_BUS_WIDTH_1:
+ slot->ctype = SDMMC_CTYPE_1BIT;
+ break;
+ case MMC_BUS_WIDTH_4:
+ slot->ctype = SDMMC_CTYPE_4BIT;
+ break;
+ }
+
+ if (ios->clock) {
+ /*
+ * Use mirror of ios->clock to prevent race with mmc
+ * core ios update when finding the minimum.
+ */
+ slot->clock = ios->clock;
+ }
+
+ switch (ios->power_mode) {
+ case MMC_POWER_UP:
+ set_bit(DW_MMC_CARD_NEED_INIT, &slot->flags);
+ break;
+ default:
+ break;
+ }
+}
+
+static int dw_mci_get_ro(struct mmc_host *mmc)
+{
+ int read_only;
+ struct dw_mci_slot *slot = mmc_priv(mmc);
+ struct dw_mci_board *brd = slot->host->pdata;
+
+ /* Use platform get_ro function, else try on board write protect */
+ if (brd->get_ro)
+ read_only = brd->get_ro(slot->id);
+ else
+ read_only =
+ mci_readl(slot->host, WRTPRT) & (1 << slot->id) ? 1 : 0;
+
+ dev_dbg(&mmc->class_dev, "card is %s\n",
+ read_only ? "read-only" : "read-write");
+
+ return read_only;
+}
+
+static int dw_mci_get_cd(struct mmc_host *mmc)
+{
+ int present;
+ struct dw_mci_slot *slot = mmc_priv(mmc);
+ struct dw_mci_board *brd = slot->host->pdata;
+
+ /* Use platform get_cd function, else try onboard card detect */
+ if (brd->get_cd)
+ present = !brd->get_cd(slot->id);
+ else
+ present = (mci_readl(slot->host, CDETECT) & (1 << slot->id))
+ == 0 ? 1 : 0;
+
+ if (present)
+ dev_dbg(&mmc->class_dev, "card is present\n");
+ else
+ dev_dbg(&mmc->class_dev, "card is not present\n");
+
+ return present;
+}
+
+static const struct mmc_host_ops dw_mci_ops = {
+ .request = dw_mci_request,
+ .set_ios = dw_mci_set_ios,
+ .get_ro = dw_mci_get_ro,
+ .get_cd = dw_mci_get_cd,
+};
+
+static void dw_mci_request_end(struct dw_mci *host, struct mmc_request *mrq)
+ __releases(&host->lock)
+ __acquires(&host->lock)
+{
+ struct dw_mci_slot *slot;
+ struct mmc_host *prev_mmc = host->cur_slot->mmc;
+
+ WARN_ON(host->cmd || host->data);
+
+ host->cur_slot->mrq = NULL;
+ host->mrq = NULL;
+ if (!list_empty(&host->queue)) {
+ slot = list_entry(host->queue.next,
+ struct dw_mci_slot, queue_node);
+ list_del(&slot->queue_node);
+ dev_vdbg(&host->pdev->dev, "list not empty: %s is next\n",
+ mmc_hostname(slot->mmc));
+ host->state = STATE_SENDING_CMD;
+ dw_mci_start_request(host, slot);
+ } else {
+ dev_vdbg(&host->pdev->dev, "list empty\n");
+ host->state = STATE_IDLE;
+ }
+
+ spin_unlock(&host->lock);
+ mmc_request_done(prev_mmc, mrq);
+ spin_lock(&host->lock);
+}
+
+static void dw_mci_command_complete(struct dw_mci *host, struct mmc_command *cmd)
+{
+ u32 status = host->cmd_status;
+
+ host->cmd_status = 0;
+
+ /* Read the response from the card (up to 16 bytes) */
+ if (cmd->flags & MMC_RSP_PRESENT) {
+ if (cmd->flags & MMC_RSP_136) {
+ cmd->resp[3] = mci_readl(host, RESP0);
+ cmd->resp[2] = mci_readl(host, RESP1);
+ cmd->resp[1] = mci_readl(host, RESP2);
+ cmd->resp[0] = mci_readl(host, RESP3);
+ } else {
+ cmd->resp[0] = mci_readl(host, RESP0);
+ cmd->resp[1] = 0;
+ cmd->resp[2] = 0;
+ cmd->resp[3] = 0;
+ }
+ }
+
+ if (status & SDMMC_INT_RTO)
+ cmd->error = -ETIMEDOUT;
+ else if ((cmd->flags & MMC_RSP_CRC) && (status & SDMMC_INT_RCRC))
+ cmd->error = -EILSEQ;
+ else if (status & SDMMC_INT_RESP_ERR)
+ cmd->error = -EIO;
+ else
+ cmd->error = 0;
+
+ if (cmd->error) {
+ /* newer ip versions need a delay between retries */
+ if (host->quirks & DW_MCI_QUIRK_RETRY_DELAY)
+ mdelay(20);
+
+ if (cmd->data) {
+ host->data = NULL;
+ dw_mci_stop_dma(host);
+ }
+ }
+}
+
+static void dw_mci_tasklet_func(unsigned long priv)
+{
+ struct dw_mci *host = (struct dw_mci *)priv;
+ struct mmc_data *data;
+ struct mmc_command *cmd;
+ enum dw_mci_state state;
+ enum dw_mci_state prev_state;
+ u32 status;
+
+ spin_lock(&host->lock);
+
+ state = host->state;
+ data = host->data;
+
+ do {
+ prev_state = state;
+
+ switch (state) {
+ case STATE_IDLE:
+ break;
+
+ case STATE_SENDING_CMD:
+ if (!test_and_clear_bit(EVENT_CMD_COMPLETE,
+ &host->pending_events))
+ break;
+
+ cmd = host->cmd;
+ host->cmd = NULL;
+ set_bit(EVENT_CMD_COMPLETE, &host->completed_events);
+ dw_mci_command_complete(host, host->mrq->cmd);
+ if (!host->mrq->data || cmd->error) {
+ dw_mci_request_end(host, host->mrq);
+ goto unlock;
+ }
+
+ prev_state = state = STATE_SENDING_DATA;
+ /* fall through */
+
+ case STATE_SENDING_DATA:
+ if (test_and_clear_bit(EVENT_DATA_ERROR,
+ &host->pending_events)) {
+ dw_mci_stop_dma(host);
+ if (data->stop)
+ send_stop_cmd(host, data);
+ state = STATE_DATA_ERROR;
+ break;
+ }
+
+ if (!test_and_clear_bit(EVENT_XFER_COMPLETE,
+ &host->pending_events))
+ break;
+
+ set_bit(EVENT_XFER_COMPLETE, &host->completed_events);
+ prev_state = state = STATE_DATA_BUSY;
+ /* fall through */
+
+ case STATE_DATA_BUSY:
+ if (!test_and_clear_bit(EVENT_DATA_COMPLETE,
+ &host->pending_events))
+ break;
+
+ host->data = NULL;
+ set_bit(EVENT_DATA_COMPLETE, &host->completed_events);
+ status = host->data_status;
+
+ if (status & DW_MCI_DATA_ERROR_FLAGS) {
+ if (status & SDMMC_INT_DTO) {
+ dev_err(&host->pdev->dev,
+ "data timeout error\n");
+ data->error = -ETIMEDOUT;
+ } else if (status & SDMMC_INT_DCRC) {
+ dev_err(&host->pdev->dev,
+ "data CRC error\n");
+ data->error = -EILSEQ;
+ } else {
+ dev_err(&host->pdev->dev,
+ "data FIFO error "
+ "(status=%08x)\n",
+ status);
+ data->error = -EIO;
+ }
+ } else {
+ data->bytes_xfered = data->blocks * data->blksz;
+ data->error = 0;
+ }
+
+ if (!data->stop) {
+ dw_mci_request_end(host, host->mrq);
+ goto unlock;
+ }
+
+ prev_state = state = STATE_SENDING_STOP;
+ if (!data->error)
+ send_stop_cmd(host, data);
+ /* fall through */
+
+ case STATE_SENDING_STOP:
+ if (!test_and_clear_bit(EVENT_CMD_COMPLETE,
+ &host->pending_events))
+ break;
+
+ host->cmd = NULL;
+ dw_mci_command_complete(host, host->mrq->stop);
+ dw_mci_request_end(host, host->mrq);
+ goto unlock;
+
+ case STATE_DATA_ERROR:
+ if (!test_and_clear_bit(EVENT_XFER_COMPLETE,
+ &host->pending_events))
+ break;
+
+ state = STATE_DATA_BUSY;
+ break;
+ }
+ } while (state != prev_state);
+
+ host->state = state;
+unlock:
+ spin_unlock(&host->lock);
+
+}
+
+static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt)
+{
+ u16 *pdata = (u16 *)buf;
+
+ WARN_ON(cnt % 2 != 0);
+
+ cnt = cnt >> 1;
+ while (cnt > 0) {
+ mci_writew(host, DATA, *pdata++);
+ cnt--;
+ }
+}
+
+static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt)
+{
+ u16 *pdata = (u16 *)buf;
+
+ WARN_ON(cnt % 2 != 0);
+
+ cnt = cnt >> 1;
+ while (cnt > 0) {
+ *pdata++ = mci_readw(host, DATA);
+ cnt--;
+ }
+}
+
+static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt)
+{
+ u32 *pdata = (u32 *)buf;
+
+ WARN_ON(cnt % 4 != 0);
+ WARN_ON((unsigned long)pdata & 0x3);
+
+ cnt = cnt >> 2;
+ while (cnt > 0) {
+ mci_writel(host, DATA, *pdata++);
+ cnt--;
+ }
+}
+
+static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt)
+{
+ u32 *pdata = (u32 *)buf;
+
+ WARN_ON(cnt % 4 != 0);
+ WARN_ON((unsigned long)pdata & 0x3);
+
+ cnt = cnt >> 2;
+ while (cnt > 0) {
+ *pdata++ = mci_readl(host, DATA);
+ cnt--;
+ }
+}
+
+static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt)
+{
+ u64 *pdata = (u64 *)buf;
+
+ WARN_ON(cnt % 8 != 0);
+
+ cnt = cnt >> 3;
+ while (cnt > 0) {
+ mci_writeq(host, DATA, *pdata++);
+ cnt--;
+ }
+}
+
+static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt)
+{
+ u64 *pdata = (u64 *)buf;
+
+ WARN_ON(cnt % 8 != 0);
+
+ cnt = cnt >> 3;
+ while (cnt > 0) {
+ *pdata++ = mci_readq(host, DATA);
+ cnt--;
+ }
+}
+
+static void dw_mci_read_data_pio(struct dw_mci *host)
+{
+ struct scatterlist *sg = host->sg;
+ void *buf = sg_virt(sg);
+ unsigned int offset = host->pio_offset;
+ struct mmc_data *data = host->data;
+ int shift = host->data_shift;
+ u32 status;
+ unsigned int nbytes = 0, len, old_len, count = 0;
+
+ do {
+ len = SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift;
+ if (count == 0)
+ old_len = len;
+
+ if (offset + len <= sg->length) {
+ host->pull_data(host, (void *)(buf + offset), len);
+
+ offset += len;
+ nbytes += len;
+
+ if (offset == sg->length) {
+ flush_dcache_page(sg_page(sg));
+ host->sg = sg = sg_next(sg);
+ if (!sg)
+ goto done;
+
+ offset = 0;
+ buf = sg_virt(sg);
+ }
+ } else {
+ unsigned int remaining = sg->length - offset;
+ host->pull_data(host, (void *)(buf + offset),
+ remaining);
+ nbytes += remaining;
+
+ flush_dcache_page(sg_page(sg));
+ host->sg = sg = sg_next(sg);
+ if (!sg)
+ goto done;
+
+ offset = len - remaining;
+ buf = sg_virt(sg);
+ host->pull_data(host, buf, offset);
+ nbytes += offset;
+ }
+
+ status = mci_readl(host, MINTSTS);
+ mci_writel(host, RINTSTS, SDMMC_INT_RXDR);
+ if (status & DW_MCI_DATA_ERROR_FLAGS) {
+ host->data_status = status;
+ data->bytes_xfered += nbytes;
+ smp_wmb();
+
+ set_bit(EVENT_DATA_ERROR, &host->pending_events);
+
+ tasklet_schedule(&host->tasklet);
+ return;
+ }
+ count++;
+ } while (status & SDMMC_INT_RXDR); /*if the RXDR is ready read again*/
+ len = SDMMC_GET_FCNT(mci_readl(host, STATUS));
+ host->pio_offset = offset;
+ data->bytes_xfered += nbytes;
+ return;
+
+done:
+ data->bytes_xfered += nbytes;
+ smp_wmb();
+ set_bit(EVENT_XFER_COMPLETE, &host->pending_events);
+}
+
+static void dw_mci_write_data_pio(struct dw_mci *host)
+{
+ struct scatterlist *sg = host->sg;
+ void *buf = sg_virt(sg);
+ unsigned int offset = host->pio_offset;
+ struct mmc_data *data = host->data;
+ int shift = host->data_shift;
+ u32 status;
+ unsigned int nbytes = 0, len;
+
+ do {
+ len = SDMMC_FIFO_SZ -
+ (SDMMC_GET_FCNT(mci_readl(host, STATUS)) << shift);
+ if (offset + len <= sg->length) {
+ host->push_data(host, (void *)(buf + offset), len);
+
+ offset += len;
+ nbytes += len;
+ if (offset == sg->length) {
+ host->sg = sg = sg_next(sg);
+ if (!sg)
+ goto done;
+
+ offset = 0;
+ buf = sg_virt(sg);
+ }
+ } else {
+ unsigned int remaining = sg->length - offset;
+
+ host->push_data(host, (void *)(buf + offset),
+ remaining);
+ nbytes += remaining;
+
+ host->sg = sg = sg_next(sg);
+ if (!sg)
+ goto done;
+
+ offset = len - remaining;
+ buf = sg_virt(sg);
+ host->push_data(host, (void *)buf, offset);
+ nbytes += offset;
+ }
+
+ status = mci_readl(host, MINTSTS);
+ mci_writel(host, RINTSTS, SDMMC_INT_TXDR);
+ if (status & DW_MCI_DATA_ERROR_FLAGS) {
+ host->data_status = status;
+ data->bytes_xfered += nbytes;
+
+ smp_wmb();
+
+ set_bit(EVENT_DATA_ERROR, &host->pending_events);
+
+ tasklet_schedule(&host->tasklet);
+ return;
+ }
+ } while (status & SDMMC_INT_TXDR); /* if TXDR write again */
+
+ host->pio_offset = offset;
+ data->bytes_xfered += nbytes;
+
+ return;
+
+done:
+ data->bytes_xfered += nbytes;
+ smp_wmb();
+ set_bit(EVENT_XFER_COMPLETE, &host->pending_events);
+}
+
+static void dw_mci_cmd_interrupt(struct dw_mci *host, u32 status)
+{
+ if (!host->cmd_status)
+ host->cmd_status = status;
+
+ smp_wmb();
+
+ set_bit(EVENT_CMD_COMPLETE, &host->pending_events);
+ tasklet_schedule(&host->tasklet);
+}
+
+static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
+{
+ struct dw_mci *host = dev_id;
+ u32 status, pending;
+ unsigned int pass_count = 0;
+
+ do {
+ status = mci_readl(host, RINTSTS);
+ pending = mci_readl(host, MINTSTS); /* read-only mask reg */
+
+ /*
+ * DTO fix - version 2.10a and below, and only if internal DMA
+ * is configured.
+ */
+ if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO) {
+ if (!pending &&
+ ((mci_readl(host, STATUS) >> 17) & 0x1fff))
+ pending |= SDMMC_INT_DATA_OVER;
+ }
+
+ if (!pending)
+ break;
+
+ if (pending & DW_MCI_CMD_ERROR_FLAGS) {
+ mci_writel(host, RINTSTS, DW_MCI_CMD_ERROR_FLAGS);
+ host->cmd_status = status;
+ smp_wmb();
+ set_bit(EVENT_CMD_COMPLETE, &host->pending_events);
+ tasklet_schedule(&host->tasklet);
+ }
+
+ if (pending & DW_MCI_DATA_ERROR_FLAGS) {
+ /* if there is an error report DATA_ERROR */
+ mci_writel(host, RINTSTS, DW_MCI_DATA_ERROR_FLAGS);
+ host->data_status = status;
+ smp_wmb();
+ set_bit(EVENT_DATA_ERROR, &host->pending_events);
+ tasklet_schedule(&host->tasklet);
+ }
+
+ if (pending & SDMMC_INT_DATA_OVER) {
+ mci_writel(host, RINTSTS, SDMMC_INT_DATA_OVER);
+ if (!host->data_status)
+ host->data_status = status;
+ smp_wmb();
+ if (host->dir_status == DW_MCI_RECV_STATUS) {
+ if (host->sg != NULL)
+ dw_mci_read_data_pio(host);
+ }
+ set_bit(EVENT_DATA_COMPLETE, &host->pending_events);
+ tasklet_schedule(&host->tasklet);
+ }
+
+ if (pending & SDMMC_INT_RXDR) {
+ mci_writel(host, RINTSTS, SDMMC_INT_RXDR);
+ if (host->sg)
+ dw_mci_read_data_pio(host);
+ }
+
+ if (pending & SDMMC_INT_TXDR) {
+ mci_writel(host, RINTSTS, SDMMC_INT_TXDR);
+ if (host->sg)
+ dw_mci_write_data_pio(host);
+ }
+
+ if (pending & SDMMC_INT_CMD_DONE) {
+ mci_writel(host, RINTSTS, SDMMC_INT_CMD_DONE);
+ dw_mci_cmd_interrupt(host, status);
+ }
+
+ if (pending & SDMMC_INT_CD) {
+ mci_writel(host, RINTSTS, SDMMC_INT_CD);
+ tasklet_schedule(&host->card_tasklet);
+ }
+
+ } while (pass_count++ < 5);
+
+#ifdef CONFIG_MMC_DW_IDMAC
+ /* Handle DMA interrupts */
+ pending = mci_readl(host, IDSTS);
+ if (pending & (SDMMC_IDMAC_INT_TI | SDMMC_IDMAC_INT_RI)) {
+ mci_writel(host, IDSTS, SDMMC_IDMAC_INT_TI | SDMMC_IDMAC_INT_RI);
+ mci_writel(host, IDSTS, SDMMC_IDMAC_INT_NI);
+ set_bit(EVENT_DATA_COMPLETE, &host->pending_events);
+ host->dma_ops->complete(host);
+ }
+#endif
+
+ return IRQ_HANDLED;
+}
+
+static void dw_mci_tasklet_card(unsigned long data)
+{
+ struct dw_mci *host = (struct dw_mci *)data;
+ int i;
+
+ for (i = 0; i < host->num_slots; i++) {
+ struct dw_mci_slot *slot = host->slot[i];
+ struct mmc_host *mmc = slot->mmc;
+ struct mmc_request *mrq;
+ int present;
+ u32 ctrl;
+
+ present = dw_mci_get_cd(mmc);
+ while (present != slot->last_detect_state) {
+ spin_lock(&host->lock);
+
+ dev_dbg(&slot->mmc->class_dev, "card %s\n",
+ present ? "inserted" : "removed");
+
+ /* Card change detected */
+ slot->last_detect_state = present;
+
+ /* Power up slot */
+ if (present != 0) {
+ if (host->pdata->setpower)
+ host->pdata->setpower(slot->id,
+ mmc->ocr_avail);
+
+ set_bit(DW_MMC_CARD_PRESENT, &slot->flags);
+ }
+
+ /* Clean up queue if present */
+ mrq = slot->mrq;
+ if (mrq) {
+ if (mrq == host->mrq) {
+ host->data = NULL;
+ host->cmd = NULL;
+
+ switch (host->state) {
+ case STATE_IDLE:
+ break;
+ case STATE_SENDING_CMD:
+ mrq->cmd->error = -ENOMEDIUM;
+ if (!mrq->data)
+ break;
+ /* fall through */
+ case STATE_SENDING_DATA:
+ mrq->data->error = -ENOMEDIUM;
+ dw_mci_stop_dma(host);
+ break;
+ case STATE_DATA_BUSY:
+ case STATE_DATA_ERROR:
+ if (mrq->data->error == -EINPROGRESS)
+ mrq->data->error = -ENOMEDIUM;
+ if (!mrq->stop)
+ break;
+ /* fall through */
+ case STATE_SENDING_STOP:
+ mrq->stop->error = -ENOMEDIUM;
+ break;
+ }
+
+ dw_mci_request_end(host, mrq);
+ } else {
+ list_del(&slot->queue_node);
+ mrq->cmd->error = -ENOMEDIUM;
+ if (mrq->data)
+ mrq->data->error = -ENOMEDIUM;
+ if (mrq->stop)
+ mrq->stop->error = -ENOMEDIUM;
+
+ spin_unlock(&host->lock);
+ mmc_request_done(slot->mmc, mrq);
+ spin_lock(&host->lock);
+ }
+ }
+
+ /* Power down slot */
+ if (present == 0) {
+ if (host->pdata->setpower)
+ host->pdata->setpower(slot->id, 0);
+ clear_bit(DW_MMC_CARD_PRESENT, &slot->flags);
+
+ /*
+ * Clear down the FIFO - doing so generates a
+ * block interrupt, hence setting the
+ * scatter-gather pointer to NULL.
+ */
+ host->sg = NULL;
+
+ ctrl = mci_readl(host, CTRL);
+ ctrl |= SDMMC_CTRL_FIFO_RESET;
+ mci_writel(host, CTRL, ctrl);
+
+#ifdef CONFIG_MMC_DW_IDMAC
+ ctrl = mci_readl(host, BMOD);
+ ctrl |= 0x01; /* Software reset of DMA */
+ mci_writel(host, BMOD, ctrl);
+#endif
+
+ }
+
+ spin_unlock(&host->lock);
+ present = dw_mci_get_cd(mmc);
+ }
+
+ mmc_detect_change(slot->mmc,
+ msecs_to_jiffies(host->pdata->detect_delay_ms));
+ }
+}
+
+static int __init dw_mci_init_slot(struct dw_mci *host, unsigned int id)
+{
+ struct mmc_host *mmc;
+ struct dw_mci_slot *slot;
+
+ mmc = mmc_alloc_host(sizeof(struct dw_mci_slot), &host->pdev->dev);
+ if (!mmc)
+ return -ENOMEM;
+
+ slot = mmc_priv(mmc);
+ slot->id = id;
+ slot->mmc = mmc;
+ slot->host = host;
+
+ mmc->ops = &dw_mci_ops;
+ mmc->f_min = DIV_ROUND_UP(host->bus_hz, 510);
+ mmc->f_max = host->bus_hz;
+
+ if (host->pdata->get_ocr)
+ mmc->ocr_avail = host->pdata->get_ocr(id);
+ else
+ mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34;
+
+ /*
+ * Start with slot power disabled, it will be enabled when a card
+ * is detected.
+ */
+ if (host->pdata->setpower)
+ host->pdata->setpower(id, 0);
+
+ mmc->caps = 0;
+ if (host->pdata->get_bus_wd)
+ if (host->pdata->get_bus_wd(slot->id) >= 4)
+ mmc->caps |= MMC_CAP_4_BIT_DATA;
+
+ if (host->pdata->quirks & DW_MCI_QUIRK_HIGHSPEED)
+ mmc->caps |= MMC_CAP_SD_HIGHSPEED;
+
+#ifdef CONFIG_MMC_DW_IDMAC
+ mmc->max_segs = host->ring_size;
+ mmc->max_blk_size = 65536;
+ mmc->max_blk_count = host->ring_size;
+ mmc->max_seg_size = 0x1000;
+ mmc->max_req_size = mmc->max_seg_size * mmc->max_blk_count;
+#else
+ if (host->pdata->blk_settings) {
+ mmc->max_segs = host->pdata->blk_settings->max_segs;
+ mmc->max_blk_size = host->pdata->blk_settings->max_blk_size;
+ mmc->max_blk_count = host->pdata->blk_settings->max_blk_count;
+ mmc->max_req_size = host->pdata->blk_settings->max_req_size;
+ mmc->max_seg_size = host->pdata->blk_settings->max_seg_size;
+ } else {
+ /* Useful defaults if platform data is unset. */
+ mmc->max_segs = 64;
+ mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
+ mmc->max_blk_count = 512;
+ mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count;
+ mmc->max_seg_size = mmc->max_req_size;
+ }
+#endif /* CONFIG_MMC_DW_IDMAC */
+
+ if (dw_mci_get_cd(mmc))
+ set_bit(DW_MMC_CARD_PRESENT, &slot->flags);
+ else
+ clear_bit(DW_MMC_CARD_PRESENT, &slot->flags);
+
+ host->slot[id] = slot;
+ mmc_add_host(mmc);
+
+#if defined(CONFIG_DEBUG_FS)
+ dw_mci_init_debugfs(slot);
+#endif
+
+ /* Card initially undetected */
+ slot->last_detect_state = 0;
+
+ return 0;
+}
+
+static void dw_mci_cleanup_slot(struct dw_mci_slot *slot, unsigned int id)
+{
+ /* Shutdown detect IRQ */
+ if (slot->host->pdata->exit)
+ slot->host->pdata->exit(id);
+
+ /* Debugfs stuff is cleaned up by mmc core */
+ mmc_remove_host(slot->mmc);
+ slot->host->slot[id] = NULL;
+ mmc_free_host(slot->mmc);
+}
+
+static void dw_mci_init_dma(struct dw_mci *host)
+{
+ /* Alloc memory for sg translation */
+ host->sg_cpu = dma_alloc_coherent(&host->pdev->dev, PAGE_SIZE,
+ &host->sg_dma, GFP_KERNEL);
+ if (!host->sg_cpu) {
+ dev_err(&host->pdev->dev, "%s: could not alloc DMA memory\n",
+ __func__);
+ goto no_dma;
+ }
+
+ /* Determine which DMA interface to use */
+#ifdef CONFIG_MMC_DW_IDMAC
+ host->dma_ops = &dw_mci_idmac_ops;
+ dev_info(&host->pdev->dev, "Using internal DMA controller.\n");
+#endif
+
+ if (!host->dma_ops)
+ goto no_dma;
+
+ if (host->dma_ops->init) {
+ if (host->dma_ops->init(host)) {
+ dev_err(&host->pdev->dev, "%s: Unable to initialize "
+ "DMA Controller.\n", __func__);
+ goto no_dma;
+ }
+ } else {
+ dev_err(&host->pdev->dev, "DMA initialization not found.\n");
+ goto no_dma;
+ }
+
+ host->use_dma = 1;
+ return;
+
+no_dma:
+ dev_info(&host->pdev->dev, "Using PIO mode.\n");
+ host->use_dma = 0;
+ return;
+}
+
+static bool mci_wait_reset(struct device *dev, struct dw_mci *host)
+{
+ unsigned long timeout = jiffies + msecs_to_jiffies(500);
+ unsigned int ctrl;
+
+ mci_writel(host, CTRL, (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET |
+ SDMMC_CTRL_DMA_RESET));
+
+ /* wait till resets clear */
+ do {
+ ctrl = mci_readl(host, CTRL);
+ if (!(ctrl & (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET |
+ SDMMC_CTRL_DMA_RESET)))
+ return true;
+ } while (time_before(jiffies, timeout));
+
+ dev_err(dev, "Timeout resetting block (ctrl %#x)\n", ctrl);
+
+ return false;
+}
+
+static int dw_mci_probe(struct platform_device *pdev)
+{
+ struct dw_mci *host;
+ struct resource *regs;
+ struct dw_mci_board *pdata;
+ int irq, ret, i, width;
+ u32 fifo_size;
+
+ regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!regs)
+ return -ENXIO;
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ host = kzalloc(sizeof(struct dw_mci), GFP_KERNEL);
+ if (!host)
+ return -ENOMEM;
+
+ host->pdev = pdev;
+ host->pdata = pdata = pdev->dev.platform_data;
+ if (!pdata || !pdata->init) {
+ dev_err(&pdev->dev,
+ "Platform data must supply init function\n");
+ ret = -ENODEV;
+ goto err_freehost;
+ }
+
+ if (!pdata->select_slot && pdata->num_slots > 1) {
+ dev_err(&pdev->dev,
+ "Platform data must supply select_slot function\n");
+ ret = -ENODEV;
+ goto err_freehost;
+ }
+
+ if (!pdata->bus_hz) {
+ dev_err(&pdev->dev,
+ "Platform data must supply bus speed\n");
+ ret = -ENODEV;
+ goto err_freehost;
+ }
+
+ host->bus_hz = pdata->bus_hz;
+ host->quirks = pdata->quirks;
+
+ spin_lock_init(&host->lock);
+ INIT_LIST_HEAD(&host->queue);
+
+ ret = -ENOMEM;
+ host->regs = ioremap(regs->start, regs->end - regs->start + 1);
+ if (!host->regs)
+ goto err_freehost;
+
+ host->dma_ops = pdata->dma_ops;
+ dw_mci_init_dma(host);
+
+ /*
+ * Get the host data width - this assumes that HCON has been set with
+ * the correct values.
+ */
+ i = (mci_readl(host, HCON) >> 7) & 0x7;
+ if (!i) {
+ host->push_data = dw_mci_push_data16;
+ host->pull_data = dw_mci_pull_data16;
+ width = 16;
+ host->data_shift = 1;
+ } else if (i == 2) {
+ host->push_data = dw_mci_push_data64;
+ host->pull_data = dw_mci_pull_data64;
+ width = 64;
+ host->data_shift = 3;
+ } else {
+ /* Check for a reserved value, and warn if it is */
+ WARN((i != 1),
+ "HCON reports a reserved host data width!\n"
+ "Defaulting to 32-bit access.\n");
+ host->push_data = dw_mci_push_data32;
+ host->pull_data = dw_mci_pull_data32;
+ width = 32;
+ host->data_shift = 2;
+ }
+
+ /* Reset all blocks */
+ if (!mci_wait_reset(&pdev->dev, host)) {
+ ret = -ENODEV;
+ goto err_dmaunmap;
+ }
+
+ /* Clear the interrupts for the host controller */
+ mci_writel(host, RINTSTS, 0xFFFFFFFF);
+ mci_writel(host, INTMASK, 0); /* disable all mmc interrupt first */
+
+ /* Put in max timeout */
+ mci_writel(host, TMOUT, 0xFFFFFFFF);
+
+ /*
+ * FIFO threshold settings RxMark = fifo_size / 2 - 1,
+ * Tx Mark = fifo_size / 2 DMA Size = 8
+ */
+ fifo_size = mci_readl(host, FIFOTH);
+ fifo_size = (fifo_size >> 16) & 0x7ff;
+ mci_writel(host, FIFOTH, ((0x2 << 28) | ((fifo_size/2 - 1) << 16) |
+ ((fifo_size/2) << 0)));
+
+ /* disable clock to CIU */
+ mci_writel(host, CLKENA, 0);
+ mci_writel(host, CLKSRC, 0);
+
+ tasklet_init(&host->tasklet, dw_mci_tasklet_func, (unsigned long)host);
+ tasklet_init(&host->card_tasklet,
+ dw_mci_tasklet_card, (unsigned long)host);
+
+ ret = request_irq(irq, dw_mci_interrupt, 0, "dw-mci", host);
+ if (ret)
+ goto err_dmaunmap;
+
+ platform_set_drvdata(pdev, host);
+
+ if (host->pdata->num_slots)
+ host->num_slots = host->pdata->num_slots;
+ else
+ host->num_slots = ((mci_readl(host, HCON) >> 1) & 0x1F) + 1;
+
+ /* We need at least one slot to succeed */
+ for (i = 0; i < host->num_slots; i++) {
+ ret = dw_mci_init_slot(host, i);
+ if (ret) {
+ ret = -ENODEV;
+ goto err_init_slot;
+ }
+ }
+
+ /*
+ * Enable interrupts for command done, data over, data empty, card det,
+ * receive ready and error such as transmit, receive timeout, crc error
+ */
+ mci_writel(host, RINTSTS, 0xFFFFFFFF);
+ mci_writel(host, INTMASK, SDMMC_INT_CMD_DONE | SDMMC_INT_DATA_OVER |
+ SDMMC_INT_TXDR | SDMMC_INT_RXDR |
+ DW_MCI_ERROR_FLAGS | SDMMC_INT_CD);
+ mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE); /* Enable mci interrupt */
+
+ dev_info(&pdev->dev, "DW MMC controller at irq %d, "
+ "%d bit host data width\n", irq, width);
+ if (host->quirks & DW_MCI_QUIRK_IDMAC_DTO)
+ dev_info(&pdev->dev, "Internal DMAC interrupt fix enabled.\n");
+
+ return 0;
+
+err_init_slot:
+ /* De-init any initialized slots */
+ while (i > 0) {
+ if (host->slot[i])
+ dw_mci_cleanup_slot(host->slot[i], i);
+ i--;
+ }
+ free_irq(irq, host);
+
+err_dmaunmap:
+ if (host->use_dma && host->dma_ops->exit)
+ host->dma_ops->exit(host);
+ dma_free_coherent(&host->pdev->dev, PAGE_SIZE,
+ host->sg_cpu, host->sg_dma);
+ iounmap(host->regs);
+
+err_freehost:
+ kfree(host);
+ return ret;
+}
+
+static int __exit dw_mci_remove(struct platform_device *pdev)
+{
+ struct dw_mci *host = platform_get_drvdata(pdev);
+ int i;
+
+ mci_writel(host, RINTSTS, 0xFFFFFFFF);
+ mci_writel(host, INTMASK, 0); /* disable all mmc interrupt first */
+
+ platform_set_drvdata(pdev, NULL);
+
+ for (i = 0; i < host->num_slots; i++) {
+ dev_dbg(&pdev->dev, "remove slot %d\n", i);
+ if (host->slot[i])
+ dw_mci_cleanup_slot(host->slot[i], i);
+ }
+
+ /* disable clock to CIU */
+ mci_writel(host, CLKENA, 0);
+ mci_writel(host, CLKSRC, 0);
+
+ free_irq(platform_get_irq(pdev, 0), host);
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
+
+ if (host->use_dma && host->dma_ops->exit)
+ host->dma_ops->exit(host);
+
+ iounmap(host->regs);
+
+ kfree(host);
+ return 0;
+}
+
+#ifdef CONFIG_PM
+/*
+ * TODO: we should probably disable the clock to the card in the suspend path.
+ */
+static int dw_mci_suspend(struct platform_device *pdev, pm_message_t mesg)
+{
+ int i, ret;
+ struct dw_mci *host = platform_get_drvdata(pdev);
+
+ for (i = 0; i < host->num_slots; i++) {
+ struct dw_mci_slot *slot = host->slot[i];
+ if (!slot)
+ continue;
+ ret = mmc_suspend_host(slot->mmc);
+ if (ret < 0) {
+ while (--i >= 0) {
+ slot = host->slot[i];
+ if (slot)
+ mmc_resume_host(host->slot[i]->mmc);
+ }
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int dw_mci_resume(struct platform_device *pdev)
+{
+ int i, ret;
+ struct dw_mci *host = platform_get_drvdata(pdev);
+
+ for (i = 0; i < host->num_slots; i++) {
+ struct dw_mci_slot *slot = host->slot[i];
+ if (!slot)
+ continue;
+ ret = mmc_resume_host(host->slot[i]->mmc);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+#else
+#define dw_mci_suspend NULL
+#define dw_mci_resume NULL
+#endif /* CONFIG_PM */
+
+static struct platform_driver dw_mci_driver = {
+ .remove = __exit_p(dw_mci_remove),
+ .suspend = dw_mci_suspend,
+ .resume = dw_mci_resume,
+ .driver = {
+ .name = "dw_mmc",
+ },
+};
+
+static int __init dw_mci_init(void)
+{
+ return platform_driver_probe(&dw_mci_driver, dw_mci_probe);
+}
+
+static void __exit dw_mci_exit(void)
+{
+ platform_driver_unregister(&dw_mci_driver);
+}
+
+module_init(dw_mci_init);
+module_exit(dw_mci_exit);
+
+MODULE_DESCRIPTION("DW Multimedia Card Interface driver");
+MODULE_AUTHOR("NXP Semiconductor VietNam");
+MODULE_AUTHOR("Imagination Technologies Ltd");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h
new file mode 100644
index 0000000..5dd55a7
--- /dev/null
+++ b/drivers/mmc/host/dw_mmc.h
@@ -0,0 +1,168 @@
+/*
+ * Synopsys DesignWare Multimedia Card Interface driver
+ * (Based on NXP driver for lpc 31xx)
+ *
+ * Copyright (C) 2009 NXP Semiconductors
+ * Copyright (C) 2009, 2010 Imagination Technologies Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _DW_MMC_H_
+#define _DW_MMC_H_
+
+#define SDMMC_CTRL 0x000
+#define SDMMC_PWREN 0x004
+#define SDMMC_CLKDIV 0x008
+#define SDMMC_CLKSRC 0x00c
+#define SDMMC_CLKENA 0x010
+#define SDMMC_TMOUT 0x014
+#define SDMMC_CTYPE 0x018
+#define SDMMC_BLKSIZ 0x01c
+#define SDMMC_BYTCNT 0x020
+#define SDMMC_INTMASK 0x024
+#define SDMMC_CMDARG 0x028
+#define SDMMC_CMD 0x02c
+#define SDMMC_RESP0 0x030
+#define SDMMC_RESP1 0x034
+#define SDMMC_RESP2 0x038
+#define SDMMC_RESP3 0x03c
+#define SDMMC_MINTSTS 0x040
+#define SDMMC_RINTSTS 0x044
+#define SDMMC_STATUS 0x048
+#define SDMMC_FIFOTH 0x04c
+#define SDMMC_CDETECT 0x050
+#define SDMMC_WRTPRT 0x054
+#define SDMMC_GPIO 0x058
+#define SDMMC_TCBCNT 0x05c
+#define SDMMC_TBBCNT 0x060
+#define SDMMC_DEBNCE 0x064
+#define SDMMC_USRID 0x068
+#define SDMMC_VERID 0x06c
+#define SDMMC_HCON 0x070
+#define SDMMC_BMOD 0x080
+#define SDMMC_PLDMND 0x084
+#define SDMMC_DBADDR 0x088
+#define SDMMC_IDSTS 0x08c
+#define SDMMC_IDINTEN 0x090
+#define SDMMC_DSCADDR 0x094
+#define SDMMC_BUFADDR 0x098
+#define SDMMC_DATA 0x100
+#define SDMMC_DATA_ADR 0x100
+
+/* shift bit field */
+#define _SBF(f, v) ((v) << (f))
+
+/* Control register defines */
+#define SDMMC_CTRL_USE_IDMAC BIT(25)
+#define SDMMC_CTRL_CEATA_INT_EN BIT(11)
+#define SDMMC_CTRL_SEND_AS_CCSD BIT(10)
+#define SDMMC_CTRL_SEND_CCSD BIT(9)
+#define SDMMC_CTRL_ABRT_READ_DATA BIT(8)
+#define SDMMC_CTRL_SEND_IRQ_RESP BIT(7)
+#define SDMMC_CTRL_READ_WAIT BIT(6)
+#define SDMMC_CTRL_DMA_ENABLE BIT(5)
+#define SDMMC_CTRL_INT_ENABLE BIT(4)
+#define SDMMC_CTRL_DMA_RESET BIT(2)
+#define SDMMC_CTRL_FIFO_RESET BIT(1)
+#define SDMMC_CTRL_RESET BIT(0)
+/* Clock Enable register defines */
+#define SDMMC_CLKEN_LOW_PWR BIT(16)
+#define SDMMC_CLKEN_ENABLE BIT(0)
+/* time-out register defines */
+#define SDMMC_TMOUT_DATA(n) _SBF(8, (n))
+#define SDMMC_TMOUT_DATA_MSK 0xFFFFFF00
+#define SDMMC_TMOUT_RESP(n) ((n) & 0xFF)
+#define SDMMC_TMOUT_RESP_MSK 0xFF
+/* card-type register defines */
+#define SDMMC_CTYPE_8BIT BIT(16)
+#define SDMMC_CTYPE_4BIT BIT(0)
+#define SDMMC_CTYPE_1BIT 0
+/* Interrupt status & mask register defines */
+#define SDMMC_INT_SDIO BIT(16)
+#define SDMMC_INT_EBE BIT(15)
+#define SDMMC_INT_ACD BIT(14)
+#define SDMMC_INT_SBE BIT(13)
+#define SDMMC_INT_HLE BIT(12)
+#define SDMMC_INT_FRUN BIT(11)
+#define SDMMC_INT_HTO BIT(10)
+#define SDMMC_INT_DTO BIT(9)
+#define SDMMC_INT_RTO BIT(8)
+#define SDMMC_INT_DCRC BIT(7)
+#define SDMMC_INT_RCRC BIT(6)
+#define SDMMC_INT_RXDR BIT(5)
+#define SDMMC_INT_TXDR BIT(4)
+#define SDMMC_INT_DATA_OVER BIT(3)
+#define SDMMC_INT_CMD_DONE BIT(2)
+#define SDMMC_INT_RESP_ERR BIT(1)
+#define SDMMC_INT_CD BIT(0)
+#define SDMMC_INT_ERROR 0xbfc2
+/* Command register defines */
+#define SDMMC_CMD_START BIT(31)
+#define SDMMC_CMD_CCS_EXP BIT(23)
+#define SDMMC_CMD_CEATA_RD BIT(22)
+#define SDMMC_CMD_UPD_CLK BIT(21)
+#define SDMMC_CMD_INIT BIT(15)
+#define SDMMC_CMD_STOP BIT(14)
+#define SDMMC_CMD_PRV_DAT_WAIT BIT(13)
+#define SDMMC_CMD_SEND_STOP BIT(12)
+#define SDMMC_CMD_STRM_MODE BIT(11)
+#define SDMMC_CMD_DAT_WR BIT(10)
+#define SDMMC_CMD_DAT_EXP BIT(9)
+#define SDMMC_CMD_RESP_CRC BIT(8)
+#define SDMMC_CMD_RESP_LONG BIT(7)
+#define SDMMC_CMD_RESP_EXP BIT(6)
+#define SDMMC_CMD_INDX(n) ((n) & 0x1F)
+/* Status register defines */
+#define SDMMC_GET_FCNT(x) (((x)>>17) & 0x1FF)
+#define SDMMC_FIFO_SZ 32
+/* Internal DMAC interrupt defines */
+#define SDMMC_IDMAC_INT_AI BIT(9)
+#define SDMMC_IDMAC_INT_NI BIT(8)
+#define SDMMC_IDMAC_INT_CES BIT(5)
+#define SDMMC_IDMAC_INT_DU BIT(4)
+#define SDMMC_IDMAC_INT_FBE BIT(2)
+#define SDMMC_IDMAC_INT_RI BIT(1)
+#define SDMMC_IDMAC_INT_TI BIT(0)
+/* Internal DMAC bus mode bits */
+#define SDMMC_IDMAC_ENABLE BIT(7)
+#define SDMMC_IDMAC_FB BIT(1)
+#define SDMMC_IDMAC_SWRESET BIT(0)
+
+/* Register access macros */
+#define mci_readl(dev, reg) \
+ __raw_readl(dev->regs + SDMMC_##reg)
+#define mci_writel(dev, reg, value) \
+ __raw_writel((value), dev->regs + SDMMC_##reg)
+
+/* 16-bit FIFO access macros */
+#define mci_readw(dev, reg) \
+ __raw_readw(dev->regs + SDMMC_##reg)
+#define mci_writew(dev, reg, value) \
+ __raw_writew((value), dev->regs + SDMMC_##reg)
+
+/* 64-bit FIFO access macros */
+#ifdef readq
+#define mci_readq(dev, reg) \
+ __raw_readq(dev->regs + SDMMC_##reg)
+#define mci_writeq(dev, reg, value) \
+ __raw_writeq((value), dev->regs + SDMMC_##reg)
+#else
+/*
+ * Dummy readq implementation for architectures that don't define it.
+ *
+ * We would assume that none of these architectures would configure
+ * the IP block with a 64bit FIFO width, so this code will never be
+ * executed on those machines. Defining these macros here keeps the
+ * rest of the code free from ifdefs.
+ */
+#define mci_readq(dev, reg) \
+ (*(volatile u64 __force *)(dev->regs + SDMMC_##reg))
+#define mci_writeq(dev, reg, value) \
+ (*(volatile u64 __force *)(dev->regs + SDMMC_##reg) = value)
+#endif
+
+#endif /* _DW_MMC_H_ */
diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c
index bdd2cbb..4428594 100644
--- a/drivers/mmc/host/mxcmmc.c
+++ b/drivers/mmc/host/mxcmmc.c
@@ -31,6 +31,7 @@
#include <linux/clk.h>
#include <linux/io.h>
#include <linux/gpio.h>
+#include <linux/regulator/consumer.h>
#include <asm/dma.h>
#include <asm/irq.h>
@@ -141,10 +142,49 @@ struct mxcmci_host {
struct work_struct datawork;
spinlock_t lock;
+
+ struct regulator *vcc;
};
static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios);
+static inline void mxcmci_init_ocr(struct mxcmci_host *host)
+{
+ host->vcc = regulator_get(mmc_dev(host->mmc), "vmmc");
+
+ if (IS_ERR(host->vcc)) {
+ host->vcc = NULL;
+ } else {
+ host->mmc->ocr_avail = mmc_regulator_get_ocrmask(host->vcc);
+ if (host->pdata && host->pdata->ocr_avail)
+ dev_warn(mmc_dev(host->mmc),
+ "pdata->ocr_avail will not be used\n");
+ }
+
+ if (host->vcc == NULL) {
+ /* fall-back to platform data */
+ if (host->pdata && host->pdata->ocr_avail)
+ host->mmc->ocr_avail = host->pdata->ocr_avail;
+ else
+ host->mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34;
+ }
+}
+
+static inline void mxcmci_set_power(struct mxcmci_host *host,
+ unsigned char power_mode,
+ unsigned int vdd)
+{
+ if (host->vcc) {
+ if (power_mode == MMC_POWER_UP)
+ mmc_regulator_set_ocr(host->mmc, host->vcc, vdd);
+ else if (power_mode == MMC_POWER_OFF)
+ mmc_regulator_set_ocr(host->mmc, host->vcc, 0);
+ }
+
+ if (host->pdata && host->pdata->setpower)
+ host->pdata->setpower(mmc_dev(host->mmc), vdd);
+}
+
static inline int mxcmci_use_dma(struct mxcmci_host *host)
{
return host->do_dma;
@@ -680,9 +720,9 @@ static void mxcmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
host->cmdat &= ~CMD_DAT_CONT_BUS_WIDTH_4;
if (host->power_mode != ios->power_mode) {
- if (host->pdata && host->pdata->setpower)
- host->pdata->setpower(mmc_dev(mmc), ios->vdd);
+ mxcmci_set_power(host, ios->power_mode, ios->vdd);
host->power_mode = ios->power_mode;
+
if (ios->power_mode == MMC_POWER_ON)
host->cmdat |= CMD_DAT_CONT_INIT;
}
@@ -807,10 +847,7 @@ static int mxcmci_probe(struct platform_device *pdev)
host->pdata = pdev->dev.platform_data;
spin_lock_init(&host->lock);
- if (host->pdata && host->pdata->ocr_avail)
- mmc->ocr_avail = host->pdata->ocr_avail;
- else
- mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34;
+ mxcmci_init_ocr(host);
if (host->pdata && host->pdata->dat3_card_detect)
host->default_irq_mask =
@@ -915,6 +952,9 @@ static int mxcmci_remove(struct platform_device *pdev)
mmc_remove_host(mmc);
+ if (host->vcc)
+ regulator_put(host->vcc);
+
if (host->pdata && host->pdata->exit)
host->pdata->exit(&pdev->dev, mmc);
@@ -927,7 +967,6 @@ static int mxcmci_remove(struct platform_device *pdev)
clk_put(host->clk);
release_mem_region(host->res->start, resource_size(host->res));
- release_resource(host->res);
mmc_free_host(mmc);
diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c
new file mode 100644
index 0000000..2aeef4f
--- /dev/null
+++ b/drivers/mmc/host/sdhci-dove.c
@@ -0,0 +1,70 @@
+/*
+ * sdhci-dove.c Support for SDHCI on Marvell's Dove SoC
+ *
+ * Author: Saeed Bishara <saeed@marvell.com>
+ * Mike Rapoport <mike@compulab.co.il>
+ * Based on sdhci-cns3xxx.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/io.h>
+#include <linux/mmc/host.h>
+
+#include "sdhci.h"
+#include "sdhci-pltfm.h"
+
+static u16 sdhci_dove_readw(struct sdhci_host *host, int reg)
+{
+ u16 ret;
+
+ switch (reg) {
+ case SDHCI_HOST_VERSION:
+ case SDHCI_SLOT_INT_STATUS:
+ /* those registers don't exist */
+ return 0;
+ default:
+ ret = readw(host->ioaddr + reg);
+ }
+ return ret;
+}
+
+static u32 sdhci_dove_readl(struct sdhci_host *host, int reg)
+{
+ u32 ret;
+
+ switch (reg) {
+ case SDHCI_CAPABILITIES:
+ ret = readl(host->ioaddr + reg);
+ /* Mask the support for 3.0V */
+ ret &= ~SDHCI_CAN_VDD_300;
+ break;
+ default:
+ ret = readl(host->ioaddr + reg);
+ }
+ return ret;
+}
+
+static struct sdhci_ops sdhci_dove_ops = {
+ .read_w = sdhci_dove_readw,
+ .read_l = sdhci_dove_readl,
+};
+
+struct sdhci_pltfm_data sdhci_dove_pdata = {
+ .ops = &sdhci_dove_ops,
+ .quirks = SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER |
+ SDHCI_QUIRK_NO_BUSY_IRQ |
+ SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
+ SDHCI_QUIRK_FORCE_DMA,
+};
diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c
index 3d9c246..0dc905b 100644
--- a/drivers/mmc/host/sdhci-pci.c
+++ b/drivers/mmc/host/sdhci-pci.c
@@ -176,6 +176,74 @@ static const struct sdhci_pci_fixes sdhci_intel_mfd_emmc_sdio = {
.quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
};
+/* O2Micro extra registers */
+#define O2_SD_LOCK_WP 0xD3
+#define O2_SD_MULTI_VCC3V 0xEE
+#define O2_SD_CLKREQ 0xEC
+#define O2_SD_CAPS 0xE0
+#define O2_SD_ADMA1 0xE2
+#define O2_SD_ADMA2 0xE7
+#define O2_SD_INF_MOD 0xF1
+
+static int o2_probe(struct sdhci_pci_chip *chip)
+{
+ int ret;
+ u8 scratch;
+
+ switch (chip->pdev->device) {
+ case PCI_DEVICE_ID_O2_8220:
+ case PCI_DEVICE_ID_O2_8221:
+ case PCI_DEVICE_ID_O2_8320:
+ case PCI_DEVICE_ID_O2_8321:
+ /* This extra setup is required due to broken ADMA. */
+ ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch);
+ if (ret)
+ return ret;
+ scratch &= 0x7f;
+ pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch);
+
+ /* Set Multi 3 to VCC3V# */
+ pci_write_config_byte(chip->pdev, O2_SD_MULTI_VCC3V, 0x08);
+
+ /* Disable CLK_REQ# support after media DET */
+ ret = pci_read_config_byte(chip->pdev, O2_SD_CLKREQ, &scratch);
+ if (ret)
+ return ret;
+ scratch |= 0x20;
+ pci_write_config_byte(chip->pdev, O2_SD_CLKREQ, scratch);
+
+ /* Choose capabilities, enable SDMA. We have to write 0x01
+ * to the capabilities register first to unlock it.
+ */
+ ret = pci_read_config_byte(chip->pdev, O2_SD_CAPS, &scratch);
+ if (ret)
+ return ret;
+ scratch |= 0x01;
+ pci_write_config_byte(chip->pdev, O2_SD_CAPS, scratch);
+ pci_write_config_byte(chip->pdev, O2_SD_CAPS, 0x73);
+
+ /* Disable ADMA1/2 */
+ pci_write_config_byte(chip->pdev, O2_SD_ADMA1, 0x39);
+ pci_write_config_byte(chip->pdev, O2_SD_ADMA2, 0x08);
+
+ /* Disable the infinite transfer mode */
+ ret = pci_read_config_byte(chip->pdev, O2_SD_INF_MOD, &scratch);
+ if (ret)
+ return ret;
+ scratch |= 0x08;
+ pci_write_config_byte(chip->pdev, O2_SD_INF_MOD, scratch);
+
+ /* Lock WP */
+ ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch);
+ if (ret)
+ return ret;
+ scratch |= 0x80;
+ pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch);
+ }
+
+ return 0;
+}
+
static int jmicron_pmos(struct sdhci_pci_chip *chip, int on)
{
u8 scratch;
@@ -204,6 +272,7 @@ static int jmicron_pmos(struct sdhci_pci_chip *chip, int on)
static int jmicron_probe(struct sdhci_pci_chip *chip)
{
int ret;
+ u16 mmcdev = 0;
if (chip->pdev->revision == 0) {
chip->quirks |= SDHCI_QUIRK_32BIT_DMA_ADDR |
@@ -225,12 +294,17 @@ static int jmicron_probe(struct sdhci_pci_chip *chip)
* 2. The MMC interface has a lower subfunction number
* than the SD interface.
*/
- if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_SD) {
+ if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_SD)
+ mmcdev = PCI_DEVICE_ID_JMICRON_JMB38X_MMC;
+ else if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_SD)
+ mmcdev = PCI_DEVICE_ID_JMICRON_JMB388_ESD;
+
+ if (mmcdev) {
struct pci_dev *sd_dev;
sd_dev = NULL;
while ((sd_dev = pci_get_device(PCI_VENDOR_ID_JMICRON,
- PCI_DEVICE_ID_JMICRON_JMB38X_MMC, sd_dev)) != NULL) {
+ mmcdev, sd_dev)) != NULL) {
if ((PCI_SLOT(chip->pdev->devfn) ==
PCI_SLOT(sd_dev->devfn)) &&
(chip->pdev->bus == sd_dev->bus))
@@ -290,13 +364,25 @@ static int jmicron_probe_slot(struct sdhci_pci_slot *slot)
slot->host->quirks |= SDHCI_QUIRK_BROKEN_ADMA;
}
+ /* JM388 MMC doesn't support 1.8V while SD supports it */
+ if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) {
+ slot->host->ocr_avail_sd = MMC_VDD_32_33 | MMC_VDD_33_34 |
+ MMC_VDD_29_30 | MMC_VDD_30_31 |
+ MMC_VDD_165_195; /* allow 1.8V */
+ slot->host->ocr_avail_mmc = MMC_VDD_32_33 | MMC_VDD_33_34 |
+ MMC_VDD_29_30 | MMC_VDD_30_31; /* no 1.8V for MMC */
+ }
+
/*
* The secondary interface requires a bit set to get the
* interrupts.
*/
- if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC)
+ if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC ||
+ slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD)
jmicron_enable_mmc(slot->host, 1);
+ slot->host->mmc->caps |= MMC_CAP_BUS_WIDTH_TEST;
+
return 0;
}
@@ -305,7 +391,8 @@ static void jmicron_remove_slot(struct sdhci_pci_slot *slot, int dead)
if (dead)
return;
- if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC)
+ if (slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC ||
+ slot->chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD)
jmicron_enable_mmc(slot->host, 0);
}
@@ -313,7 +400,8 @@ static int jmicron_suspend(struct sdhci_pci_chip *chip, pm_message_t state)
{
int i;
- if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) {
+ if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC ||
+ chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) {
for (i = 0;i < chip->num_slots;i++)
jmicron_enable_mmc(chip->slots[i]->host, 0);
}
@@ -325,7 +413,8 @@ static int jmicron_resume(struct sdhci_pci_chip *chip)
{
int ret, i;
- if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC) {
+ if (chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB38X_MMC ||
+ chip->pdev->device == PCI_DEVICE_ID_JMICRON_JMB388_ESD) {
for (i = 0;i < chip->num_slots;i++)
jmicron_enable_mmc(chip->slots[i]->host, 1);
}
@@ -339,6 +428,10 @@ static int jmicron_resume(struct sdhci_pci_chip *chip)
return 0;
}
+static const struct sdhci_pci_fixes sdhci_o2 = {
+ .probe = o2_probe,
+};
+
static const struct sdhci_pci_fixes sdhci_jmicron = {
.probe = jmicron_probe,
@@ -510,6 +603,22 @@ static const struct pci_device_id pci_ids[] __devinitdata = {
},
{
+ .vendor = PCI_VENDOR_ID_JMICRON,
+ .device = PCI_DEVICE_ID_JMICRON_JMB388_SD,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_jmicron,
+ },
+
+ {
+ .vendor = PCI_VENDOR_ID_JMICRON,
+ .device = PCI_DEVICE_ID_JMICRON_JMB388_ESD,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_jmicron,
+ },
+
+ {
.vendor = PCI_VENDOR_ID_SYSKONNECT,
.device = 0x8000,
.subvendor = PCI_ANY_ID,
@@ -589,6 +698,46 @@ static const struct pci_device_id pci_ids[] __devinitdata = {
.driver_data = (kernel_ulong_t)&sdhci_intel_mfd_emmc_sdio,
},
+ {
+ .vendor = PCI_VENDOR_ID_O2,
+ .device = PCI_DEVICE_ID_O2_8120,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_o2,
+ },
+
+ {
+ .vendor = PCI_VENDOR_ID_O2,
+ .device = PCI_DEVICE_ID_O2_8220,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_o2,
+ },
+
+ {
+ .vendor = PCI_VENDOR_ID_O2,
+ .device = PCI_DEVICE_ID_O2_8221,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_o2,
+ },
+
+ {
+ .vendor = PCI_VENDOR_ID_O2,
+ .device = PCI_DEVICE_ID_O2_8320,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_o2,
+ },
+
+ {
+ .vendor = PCI_VENDOR_ID_O2,
+ .device = PCI_DEVICE_ID_O2_8321,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (kernel_ulong_t)&sdhci_o2,
+ },
+
{ /* Generic SD host controller */
PCI_DEVICE_CLASS((PCI_CLASS_SYSTEM_SDHCI << 8), 0xFFFF00)
},
diff --git a/drivers/mmc/host/sdhci-pltfm.c b/drivers/mmc/host/sdhci-pltfm.c
index 0502f89..dbab040 100644
--- a/drivers/mmc/host/sdhci-pltfm.c
+++ b/drivers/mmc/host/sdhci-pltfm.c
@@ -170,6 +170,12 @@ static const struct platform_device_id sdhci_pltfm_ids[] = {
#ifdef CONFIG_MMC_SDHCI_ESDHC_IMX
{ "sdhci-esdhc-imx", (kernel_ulong_t)&sdhci_esdhc_imx_pdata },
#endif
+#ifdef CONFIG_MMC_SDHCI_DOVE
+ { "sdhci-dove", (kernel_ulong_t)&sdhci_dove_pdata },
+#endif
+#ifdef CONFIG_MMC_SDHCI_TEGRA
+ { "sdhci-tegra", (kernel_ulong_t)&sdhci_tegra_pdata },
+#endif
{ },
};
MODULE_DEVICE_TABLE(platform, sdhci_pltfm_ids);
diff --git a/drivers/mmc/host/sdhci-pltfm.h b/drivers/mmc/host/sdhci-pltfm.h
index c1bfe48..ea2e44d 100644
--- a/drivers/mmc/host/sdhci-pltfm.h
+++ b/drivers/mmc/host/sdhci-pltfm.h
@@ -22,5 +22,7 @@ struct sdhci_pltfm_host {
extern struct sdhci_pltfm_data sdhci_cns3xxx_pdata;
extern struct sdhci_pltfm_data sdhci_esdhc_imx_pdata;
+extern struct sdhci_pltfm_data sdhci_dove_pdata;
+extern struct sdhci_pltfm_data sdhci_tegra_pdata;
#endif /* _DRIVERS_MMC_SDHCI_PLTFM_H */
diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c
index aacb862..1720358 100644
--- a/drivers/mmc/host/sdhci-s3c.c
+++ b/drivers/mmc/host/sdhci-s3c.c
@@ -130,6 +130,15 @@ static unsigned int sdhci_s3c_consider_clock(struct sdhci_s3c *ourhost,
if (!clksrc)
return UINT_MAX;
+ /*
+ * Clock divider's step is different as 1 from that of host controller
+ * when 'clk_type' is S3C_SDHCI_CLK_DIV_EXTERNAL.
+ */
+ if (ourhost->pdata->clk_type) {
+ rate = clk_round_rate(clksrc, wanted);
+ return wanted - rate;
+ }
+
rate = clk_get_rate(clksrc);
for (div = 1; div < 256; div *= 2) {
@@ -232,6 +241,42 @@ static unsigned int sdhci_s3c_get_min_clock(struct sdhci_host *host)
return min;
}
+/* sdhci_cmu_get_max_clk - callback to get maximum clock frequency.*/
+static unsigned int sdhci_cmu_get_max_clock(struct sdhci_host *host)
+{
+ struct sdhci_s3c *ourhost = to_s3c(host);
+
+ return clk_round_rate(ourhost->clk_bus[ourhost->cur_clk], UINT_MAX);
+}
+
+/* sdhci_cmu_get_min_clock - callback to get minimal supported clock value. */
+static unsigned int sdhci_cmu_get_min_clock(struct sdhci_host *host)
+{
+ struct sdhci_s3c *ourhost = to_s3c(host);
+
+ /*
+ * initial clock can be in the frequency range of
+ * 100KHz-400KHz, so we set it as max value.
+ */
+ return clk_round_rate(ourhost->clk_bus[ourhost->cur_clk], 400000);
+}
+
+/* sdhci_cmu_set_clock - callback on clock change.*/
+static void sdhci_cmu_set_clock(struct sdhci_host *host, unsigned int clock)
+{
+ struct sdhci_s3c *ourhost = to_s3c(host);
+
+ /* don't bother if the clock is going off */
+ if (clock == 0)
+ return;
+
+ sdhci_s3c_set_clock(host, clock);
+
+ clk_set_rate(ourhost->clk_bus[ourhost->cur_clk], clock);
+
+ host->clock = clock;
+}
+
static struct sdhci_ops sdhci_s3c_ops = {
.get_max_clock = sdhci_s3c_get_max_clk,
.set_clock = sdhci_s3c_set_clock,
@@ -361,6 +406,13 @@ static int __devinit sdhci_s3c_probe(struct platform_device *pdev)
clks++;
sc->clk_bus[ptr] = clk;
+
+ /*
+ * save current clock index to know which clock bus
+ * is used later in overriding functions.
+ */
+ sc->cur_clk = ptr;
+
clk_enable(clk);
dev_info(dev, "clock source %d: %s (%ld Hz)\n",
@@ -427,6 +479,20 @@ static int __devinit sdhci_s3c_probe(struct platform_device *pdev)
/* HSMMC on Samsung SoCs uses SDCLK as timeout clock */
host->quirks |= SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK;
+ /*
+ * If controller does not have internal clock divider,
+ * we can use overriding functions instead of default.
+ */
+ if (pdata->clk_type) {
+ sdhci_s3c_ops.set_clock = sdhci_cmu_set_clock;
+ sdhci_s3c_ops.get_min_clock = sdhci_cmu_get_min_clock;
+ sdhci_s3c_ops.get_max_clock = sdhci_cmu_get_max_clock;
+ }
+
+ /* It supports additional host capabilities if needed */
+ if (pdata->host_caps)
+ host->mmc->caps |= pdata->host_caps;
+
ret = sdhci_add_host(host);
if (ret) {
dev_err(dev, "sdhci_add_host() failed\n");
diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c
new file mode 100644
index 0000000..4823ee9
--- /dev/null
+++ b/drivers/mmc/host/sdhci-tegra.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/gpio.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/host.h>
+
+#include <mach/gpio.h>
+#include <mach/sdhci.h>
+
+#include "sdhci.h"
+#include "sdhci-pltfm.h"
+
+static u32 tegra_sdhci_readl(struct sdhci_host *host, int reg)
+{
+ u32 val;
+
+ if (unlikely(reg == SDHCI_PRESENT_STATE)) {
+ /* Use wp_gpio here instead? */
+ val = readl(host->ioaddr + reg);
+ return val | SDHCI_WRITE_PROTECT;
+ }
+
+ return readl(host->ioaddr + reg);
+}
+
+static u16 tegra_sdhci_readw(struct sdhci_host *host, int reg)
+{
+ if (unlikely(reg == SDHCI_HOST_VERSION)) {
+ /* Erratum: Version register is invalid in HW. */
+ return SDHCI_SPEC_200;
+ }
+
+ return readw(host->ioaddr + reg);
+}
+
+static void tegra_sdhci_writel(struct sdhci_host *host, u32 val, int reg)
+{
+ /* Seems like we're getting spurious timeout and crc errors, so
+ * disable signalling of them. In case of real errors software
+ * timers should take care of eventually detecting them.
+ */
+ if (unlikely(reg == SDHCI_SIGNAL_ENABLE))
+ val &= ~(SDHCI_INT_TIMEOUT|SDHCI_INT_CRC);
+
+ writel(val, host->ioaddr + reg);
+
+ if (unlikely(reg == SDHCI_INT_ENABLE)) {
+ /* Erratum: Must enable block gap interrupt detection */
+ u8 gap_ctrl = readb(host->ioaddr + SDHCI_BLOCK_GAP_CONTROL);
+ if (val & SDHCI_INT_CARD_INT)
+ gap_ctrl |= 0x8;
+ else
+ gap_ctrl &= ~0x8;
+ writeb(gap_ctrl, host->ioaddr + SDHCI_BLOCK_GAP_CONTROL);
+ }
+}
+
+static unsigned int tegra_sdhci_get_ro(struct sdhci_host *sdhci)
+{
+ struct platform_device *pdev = to_platform_device(mmc_dev(sdhci->mmc));
+ struct tegra_sdhci_platform_data *plat;
+
+ plat = pdev->dev.platform_data;
+
+ if (!gpio_is_valid(plat->wp_gpio))
+ return -1;
+
+ return gpio_get_value(plat->wp_gpio);
+}
+
+static irqreturn_t carddetect_irq(int irq, void *data)
+{
+ struct sdhci_host *sdhost = (struct sdhci_host *)data;
+
+ tasklet_schedule(&sdhost->card_tasklet);
+ return IRQ_HANDLED;
+};
+
+static int tegra_sdhci_8bit(struct sdhci_host *host, int bus_width)
+{
+ struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc));
+ struct tegra_sdhci_platform_data *plat;
+ u32 ctrl;
+
+ plat = pdev->dev.platform_data;
+
+ ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL);
+ if (plat->is_8bit && bus_width == MMC_BUS_WIDTH_8) {
+ ctrl &= ~SDHCI_CTRL_4BITBUS;
+ ctrl |= SDHCI_CTRL_8BITBUS;
+ } else {
+ ctrl &= ~SDHCI_CTRL_8BITBUS;
+ if (bus_width == MMC_BUS_WIDTH_4)
+ ctrl |= SDHCI_CTRL_4BITBUS;
+ else
+ ctrl &= ~SDHCI_CTRL_4BITBUS;
+ }
+ sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL);
+ return 0;
+}
+
+
+static int tegra_sdhci_pltfm_init(struct sdhci_host *host,
+ struct sdhci_pltfm_data *pdata)
+{
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc));
+ struct tegra_sdhci_platform_data *plat;
+ struct clk *clk;
+ int rc;
+
+ plat = pdev->dev.platform_data;
+ if (plat == NULL) {
+ dev_err(mmc_dev(host->mmc), "missing platform data\n");
+ return -ENXIO;
+ }
+
+ if (gpio_is_valid(plat->power_gpio)) {
+ rc = gpio_request(plat->power_gpio, "sdhci_power");
+ if (rc) {
+ dev_err(mmc_dev(host->mmc),
+ "failed to allocate power gpio\n");
+ goto out;
+ }
+ tegra_gpio_enable(plat->power_gpio);
+ gpio_direction_output(plat->power_gpio, 1);
+ }
+
+ if (gpio_is_valid(plat->cd_gpio)) {
+ rc = gpio_request(plat->cd_gpio, "sdhci_cd");
+ if (rc) {
+ dev_err(mmc_dev(host->mmc),
+ "failed to allocate cd gpio\n");
+ goto out_power;
+ }
+ tegra_gpio_enable(plat->cd_gpio);
+ gpio_direction_input(plat->cd_gpio);
+
+ rc = request_irq(gpio_to_irq(plat->cd_gpio), carddetect_irq,
+ IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
+ mmc_hostname(host->mmc), host);
+
+ if (rc) {
+ dev_err(mmc_dev(host->mmc), "request irq error\n");
+ goto out_cd;
+ }
+
+ }
+
+ if (gpio_is_valid(plat->wp_gpio)) {
+ rc = gpio_request(plat->wp_gpio, "sdhci_wp");
+ if (rc) {
+ dev_err(mmc_dev(host->mmc),
+ "failed to allocate wp gpio\n");
+ goto out_cd;
+ }
+ tegra_gpio_enable(plat->wp_gpio);
+ gpio_direction_input(plat->wp_gpio);
+ }
+
+ clk = clk_get(mmc_dev(host->mmc), NULL);
+ if (IS_ERR(clk)) {
+ dev_err(mmc_dev(host->mmc), "clk err\n");
+ rc = PTR_ERR(clk);
+ goto out_wp;
+ }
+ clk_enable(clk);
+ pltfm_host->clk = clk;
+
+ if (plat->is_8bit)
+ host->mmc->caps |= MMC_CAP_8_BIT_DATA;
+
+ return 0;
+
+out_wp:
+ if (gpio_is_valid(plat->wp_gpio)) {
+ tegra_gpio_disable(plat->wp_gpio);
+ gpio_free(plat->wp_gpio);
+ }
+
+out_cd:
+ if (gpio_is_valid(plat->cd_gpio)) {
+ tegra_gpio_disable(plat->cd_gpio);
+ gpio_free(plat->cd_gpio);
+ }
+
+out_power:
+ if (gpio_is_valid(plat->power_gpio)) {
+ tegra_gpio_disable(plat->power_gpio);
+ gpio_free(plat->power_gpio);
+ }
+
+out:
+ return rc;
+}
+
+static void tegra_sdhci_pltfm_exit(struct sdhci_host *host)
+{
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc));
+ struct tegra_sdhci_platform_data *plat;
+
+ plat = pdev->dev.platform_data;
+
+ if (gpio_is_valid(plat->wp_gpio)) {
+ tegra_gpio_disable(plat->wp_gpio);
+ gpio_free(plat->wp_gpio);
+ }
+
+ if (gpio_is_valid(plat->cd_gpio)) {
+ tegra_gpio_disable(plat->cd_gpio);
+ gpio_free(plat->cd_gpio);
+ }
+
+ if (gpio_is_valid(plat->power_gpio)) {
+ tegra_gpio_disable(plat->power_gpio);
+ gpio_free(plat->power_gpio);
+ }
+
+ clk_disable(pltfm_host->clk);
+ clk_put(pltfm_host->clk);
+}
+
+static struct sdhci_ops tegra_sdhci_ops = {
+ .get_ro = tegra_sdhci_get_ro,
+ .read_l = tegra_sdhci_readl,
+ .read_w = tegra_sdhci_readw,
+ .write_l = tegra_sdhci_writel,
+ .platform_8bit_width = tegra_sdhci_8bit,
+};
+
+struct sdhci_pltfm_data sdhci_tegra_pdata = {
+ .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
+ SDHCI_QUIRK_SINGLE_POWER_WRITE |
+ SDHCI_QUIRK_NO_HISPD_BIT |
+ SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC,
+ .ops = &tegra_sdhci_ops,
+ .init = tegra_sdhci_pltfm_init,
+ .exit = tegra_sdhci_pltfm_exit,
+};
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index a25db42..9e15f41 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -23,6 +23,7 @@
#include <linux/leds.h>
+#include <linux/mmc/mmc.h>
#include <linux/mmc/host.h>
#include "sdhci.h"
@@ -77,8 +78,11 @@ static void sdhci_dumpregs(struct sdhci_host *host)
printk(KERN_DEBUG DRIVER_NAME ": AC12 err: 0x%08x | Slot int: 0x%08x\n",
sdhci_readw(host, SDHCI_ACMD12_ERR),
sdhci_readw(host, SDHCI_SLOT_INT_STATUS));
- printk(KERN_DEBUG DRIVER_NAME ": Caps: 0x%08x | Max curr: 0x%08x\n",
+ printk(KERN_DEBUG DRIVER_NAME ": Caps: 0x%08x | Caps_1: 0x%08x\n",
sdhci_readl(host, SDHCI_CAPABILITIES),
+ sdhci_readl(host, SDHCI_CAPABILITIES_1));
+ printk(KERN_DEBUG DRIVER_NAME ": Cmd: 0x%08x | Max curr: 0x%08x\n",
+ sdhci_readw(host, SDHCI_COMMAND),
sdhci_readl(host, SDHCI_MAX_CURRENT));
if (host->flags & SDHCI_USE_ADMA)
@@ -1518,7 +1522,11 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask)
if (intmask & SDHCI_INT_DATA_TIMEOUT)
host->data->error = -ETIMEDOUT;
- else if (intmask & (SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_END_BIT))
+ else if (intmask & SDHCI_INT_DATA_END_BIT)
+ host->data->error = -EILSEQ;
+ else if ((intmask & SDHCI_INT_DATA_CRC) &&
+ SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))
+ != MMC_BUS_TEST_R)
host->data->error = -EILSEQ;
else if (intmask & SDHCI_INT_ADMA_ERROR) {
printk(KERN_ERR "%s: ADMA error\n", mmc_hostname(host->mmc));
@@ -1736,7 +1744,7 @@ EXPORT_SYMBOL_GPL(sdhci_alloc_host);
int sdhci_add_host(struct sdhci_host *host)
{
struct mmc_host *mmc;
- unsigned int caps;
+ unsigned int caps, ocr_avail;
int ret;
WARN_ON(host == NULL);
@@ -1890,13 +1898,26 @@ int sdhci_add_host(struct sdhci_host *host)
mmc_card_is_removable(mmc))
mmc->caps |= MMC_CAP_NEEDS_POLL;
- mmc->ocr_avail = 0;
+ ocr_avail = 0;
if (caps & SDHCI_CAN_VDD_330)
- mmc->ocr_avail |= MMC_VDD_32_33|MMC_VDD_33_34;
+ ocr_avail |= MMC_VDD_32_33 | MMC_VDD_33_34;
if (caps & SDHCI_CAN_VDD_300)
- mmc->ocr_avail |= MMC_VDD_29_30|MMC_VDD_30_31;
+ ocr_avail |= MMC_VDD_29_30 | MMC_VDD_30_31;
if (caps & SDHCI_CAN_VDD_180)
- mmc->ocr_avail |= MMC_VDD_165_195;
+ ocr_avail |= MMC_VDD_165_195;
+
+ mmc->ocr_avail = ocr_avail;
+ mmc->ocr_avail_sdio = ocr_avail;
+ if (host->ocr_avail_sdio)
+ mmc->ocr_avail_sdio &= host->ocr_avail_sdio;
+ mmc->ocr_avail_sd = ocr_avail;
+ if (host->ocr_avail_sd)
+ mmc->ocr_avail_sd &= host->ocr_avail_sd;
+ else /* normal SD controllers don't support 1.8V */
+ mmc->ocr_avail_sd &= ~MMC_VDD_165_195;
+ mmc->ocr_avail_mmc = ocr_avail;
+ if (host->ocr_avail_mmc)
+ mmc->ocr_avail_mmc &= host->ocr_avail_mmc;
if (mmc->ocr_avail == 0) {
printk(KERN_ERR "%s: Hardware doesn't report any "
@@ -1928,10 +1949,14 @@ int sdhci_add_host(struct sdhci_host *host)
* of bytes. When doing hardware scatter/gather, each entry cannot
* be larger than 64 KiB though.
*/
- if (host->flags & SDHCI_USE_ADMA)
- mmc->max_seg_size = 65536;
- else
+ if (host->flags & SDHCI_USE_ADMA) {
+ if (host->quirks & SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC)
+ mmc->max_seg_size = 65535;
+ else
+ mmc->max_seg_size = 65536;
+ } else {
mmc->max_seg_size = mmc->max_req_size;
+ }
/*
* Maximum block size. This varies from controller to controller and
diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h
index e42d7f0..6e0969e 100644
--- a/drivers/mmc/host/sdhci.h
+++ b/drivers/mmc/host/sdhci.h
@@ -52,6 +52,7 @@
#define SDHCI_CMD_RESP_SHORT_BUSY 0x03
#define SDHCI_MAKE_CMD(c, f) (((c & 0xff) << 8) | (f & 0xff))
+#define SDHCI_GET_CMD(c) ((c>>8) & 0x3f)
#define SDHCI_RESPONSE 0x10
@@ -165,7 +166,7 @@
#define SDHCI_CAN_VDD_180 0x04000000
#define SDHCI_CAN_64BIT 0x10000000
-/* 44-47 reserved for more caps */
+#define SDHCI_CAPABILITIES_1 0x44
#define SDHCI_MAX_CURRENT 0x48
diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c
index e7765a8..e3c6ef2 100644
--- a/drivers/mmc/host/tmio_mmc.c
+++ b/drivers/mmc/host/tmio_mmc.c
@@ -25,16 +25,261 @@
* double buffer support
*
*/
-#include <linux/module.h>
-#include <linux/irq.h>
-#include <linux/device.h>
+
#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/dmaengine.h>
-#include <linux/mmc/host.h>
+#include <linux/highmem.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
#include <linux/mfd/core.h>
#include <linux/mfd/tmio.h>
+#include <linux/mmc/host.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/scatterlist.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+#define CTL_SD_CMD 0x00
+#define CTL_ARG_REG 0x04
+#define CTL_STOP_INTERNAL_ACTION 0x08
+#define CTL_XFER_BLK_COUNT 0xa
+#define CTL_RESPONSE 0x0c
+#define CTL_STATUS 0x1c
+#define CTL_IRQ_MASK 0x20
+#define CTL_SD_CARD_CLK_CTL 0x24
+#define CTL_SD_XFER_LEN 0x26
+#define CTL_SD_MEM_CARD_OPT 0x28
+#define CTL_SD_ERROR_DETAIL_STATUS 0x2c
+#define CTL_SD_DATA_PORT 0x30
+#define CTL_TRANSACTION_CTL 0x34
+#define CTL_SDIO_STATUS 0x36
+#define CTL_SDIO_IRQ_MASK 0x38
+#define CTL_RESET_SD 0xe0
+#define CTL_SDIO_REGS 0x100
+#define CTL_CLK_AND_WAIT_CTL 0x138
+#define CTL_RESET_SDIO 0x1e0
+
+/* Definitions for values the CTRL_STATUS register can take. */
+#define TMIO_STAT_CMDRESPEND 0x00000001
+#define TMIO_STAT_DATAEND 0x00000004
+#define TMIO_STAT_CARD_REMOVE 0x00000008
+#define TMIO_STAT_CARD_INSERT 0x00000010
+#define TMIO_STAT_SIGSTATE 0x00000020
+#define TMIO_STAT_WRPROTECT 0x00000080
+#define TMIO_STAT_CARD_REMOVE_A 0x00000100
+#define TMIO_STAT_CARD_INSERT_A 0x00000200
+#define TMIO_STAT_SIGSTATE_A 0x00000400
+#define TMIO_STAT_CMD_IDX_ERR 0x00010000
+#define TMIO_STAT_CRCFAIL 0x00020000
+#define TMIO_STAT_STOPBIT_ERR 0x00040000
+#define TMIO_STAT_DATATIMEOUT 0x00080000
+#define TMIO_STAT_RXOVERFLOW 0x00100000
+#define TMIO_STAT_TXUNDERRUN 0x00200000
+#define TMIO_STAT_CMDTIMEOUT 0x00400000
+#define TMIO_STAT_RXRDY 0x01000000
+#define TMIO_STAT_TXRQ 0x02000000
+#define TMIO_STAT_ILL_FUNC 0x20000000
+#define TMIO_STAT_CMD_BUSY 0x40000000
+#define TMIO_STAT_ILL_ACCESS 0x80000000
+
+/* Definitions for values the CTRL_SDIO_STATUS register can take. */
+#define TMIO_SDIO_STAT_IOIRQ 0x0001
+#define TMIO_SDIO_STAT_EXPUB52 0x4000
+#define TMIO_SDIO_STAT_EXWT 0x8000
+#define TMIO_SDIO_MASK_ALL 0xc007
+
+/* Define some IRQ masks */
+/* This is the mask used at reset by the chip */
+#define TMIO_MASK_ALL 0x837f031d
+#define TMIO_MASK_READOP (TMIO_STAT_RXRDY | TMIO_STAT_DATAEND)
+#define TMIO_MASK_WRITEOP (TMIO_STAT_TXRQ | TMIO_STAT_DATAEND)
+#define TMIO_MASK_CMD (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT | \
+ TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT)
+#define TMIO_MASK_IRQ (TMIO_MASK_READOP | TMIO_MASK_WRITEOP | TMIO_MASK_CMD)
+
+#define enable_mmc_irqs(host, i) \
+ do { \
+ u32 mask;\
+ mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \
+ mask &= ~((i) & TMIO_MASK_IRQ); \
+ sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \
+ } while (0)
+
+#define disable_mmc_irqs(host, i) \
+ do { \
+ u32 mask;\
+ mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \
+ mask |= ((i) & TMIO_MASK_IRQ); \
+ sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \
+ } while (0)
+
+#define ack_mmc_irqs(host, i) \
+ do { \
+ sd_ctrl_write32((host), CTL_STATUS, ~(i)); \
+ } while (0)
+
+/* This is arbitrary, just noone needed any higher alignment yet */
+#define MAX_ALIGN 4
+
+struct tmio_mmc_host {
+ void __iomem *ctl;
+ unsigned long bus_shift;
+ struct mmc_command *cmd;
+ struct mmc_request *mrq;
+ struct mmc_data *data;
+ struct mmc_host *mmc;
+ int irq;
+ unsigned int sdio_irq_enabled;
+
+ /* Callbacks for clock / power control */
+ void (*set_pwr)(struct platform_device *host, int state);
+ void (*set_clk_div)(struct platform_device *host, int state);
+
+ /* pio related stuff */
+ struct scatterlist *sg_ptr;
+ struct scatterlist *sg_orig;
+ unsigned int sg_len;
+ unsigned int sg_off;
+
+ struct platform_device *pdev;
+
+ /* DMA support */
+ struct dma_chan *chan_rx;
+ struct dma_chan *chan_tx;
+ struct tasklet_struct dma_complete;
+ struct tasklet_struct dma_issue;
+#ifdef CONFIG_TMIO_MMC_DMA
+ unsigned int dma_sglen;
+ u8 bounce_buf[PAGE_CACHE_SIZE] __attribute__((aligned(MAX_ALIGN)));
+ struct scatterlist bounce_sg;
+#endif
+
+ /* Track lost interrupts */
+ struct delayed_work delayed_reset_work;
+ spinlock_t lock;
+ unsigned long last_req_ts;
+};
+
+static void tmio_check_bounce_buffer(struct tmio_mmc_host *host);
+
+static u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr)
+{
+ return readw(host->ctl + (addr << host->bus_shift));
+}
+
+static void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr,
+ u16 *buf, int count)
+{
+ readsw(host->ctl + (addr << host->bus_shift), buf, count);
+}
-#include "tmio_mmc.h"
+static u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr)
+{
+ return readw(host->ctl + (addr << host->bus_shift)) |
+ readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16;
+}
+
+static void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val)
+{
+ writew(val, host->ctl + (addr << host->bus_shift));
+}
+
+static void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr,
+ u16 *buf, int count)
+{
+ writesw(host->ctl + (addr << host->bus_shift), buf, count);
+}
+
+static void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val)
+{
+ writew(val, host->ctl + (addr << host->bus_shift));
+ writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift));
+}
+
+static void tmio_mmc_init_sg(struct tmio_mmc_host *host, struct mmc_data *data)
+{
+ host->sg_len = data->sg_len;
+ host->sg_ptr = data->sg;
+ host->sg_orig = data->sg;
+ host->sg_off = 0;
+}
+
+static int tmio_mmc_next_sg(struct tmio_mmc_host *host)
+{
+ host->sg_ptr = sg_next(host->sg_ptr);
+ host->sg_off = 0;
+ return --host->sg_len;
+}
+
+static char *tmio_mmc_kmap_atomic(struct scatterlist *sg, unsigned long *flags)
+{
+ local_irq_save(*flags);
+ return kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset;
+}
+
+static void tmio_mmc_kunmap_atomic(void *virt, unsigned long *flags)
+{
+ kunmap_atomic(virt, KM_BIO_SRC_IRQ);
+ local_irq_restore(*flags);
+}
+
+#ifdef CONFIG_MMC_DEBUG
+
+#define STATUS_TO_TEXT(a) \
+ do { \
+ if (status & TMIO_STAT_##a) \
+ printk(#a); \
+ } while (0)
+
+void pr_debug_status(u32 status)
+{
+ printk(KERN_DEBUG "status: %08x = ", status);
+ STATUS_TO_TEXT(CARD_REMOVE);
+ STATUS_TO_TEXT(CARD_INSERT);
+ STATUS_TO_TEXT(SIGSTATE);
+ STATUS_TO_TEXT(WRPROTECT);
+ STATUS_TO_TEXT(CARD_REMOVE_A);
+ STATUS_TO_TEXT(CARD_INSERT_A);
+ STATUS_TO_TEXT(SIGSTATE_A);
+ STATUS_TO_TEXT(CMD_IDX_ERR);
+ STATUS_TO_TEXT(STOPBIT_ERR);
+ STATUS_TO_TEXT(ILL_FUNC);
+ STATUS_TO_TEXT(CMD_BUSY);
+ STATUS_TO_TEXT(CMDRESPEND);
+ STATUS_TO_TEXT(DATAEND);
+ STATUS_TO_TEXT(CRCFAIL);
+ STATUS_TO_TEXT(DATATIMEOUT);
+ STATUS_TO_TEXT(CMDTIMEOUT);
+ STATUS_TO_TEXT(RXOVERFLOW);
+ STATUS_TO_TEXT(TXUNDERRUN);
+ STATUS_TO_TEXT(RXRDY);
+ STATUS_TO_TEXT(TXRQ);
+ STATUS_TO_TEXT(ILL_ACCESS);
+ printk("\n");
+}
+
+#else
+#define pr_debug_status(s) do { } while (0)
+#endif
+
+static void tmio_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
+{
+ struct tmio_mmc_host *host = mmc_priv(mmc);
+
+ if (enable) {
+ host->sdio_irq_enabled = 1;
+ sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001);
+ sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK,
+ (TMIO_SDIO_MASK_ALL & ~TMIO_SDIO_STAT_IOIRQ));
+ } else {
+ sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, TMIO_SDIO_MASK_ALL);
+ sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0000);
+ host->sdio_irq_enabled = 0;
+ }
+}
static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock)
{
@@ -55,8 +300,23 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, int new_clock)
static void tmio_mmc_clk_stop(struct tmio_mmc_host *host)
{
+ struct mfd_cell *cell = host->pdev->dev.platform_data;
+ struct tmio_mmc_data *pdata = cell->driver_data;
+
+ /*
+ * Testing on sh-mobile showed that SDIO IRQs are unmasked when
+ * CTL_CLK_AND_WAIT_CTL gets written, so we have to disable the
+ * device IRQ here and restore the SDIO IRQ mask before
+ * re-enabling the device IRQ.
+ */
+ if (pdata->flags & TMIO_MMC_SDIO_IRQ)
+ disable_irq(host->irq);
sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0000);
msleep(10);
+ if (pdata->flags & TMIO_MMC_SDIO_IRQ) {
+ tmio_mmc_enable_sdio_irq(host->mmc, host->sdio_irq_enabled);
+ enable_irq(host->irq);
+ }
sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, ~0x0100 &
sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL));
msleep(10);
@@ -64,11 +324,21 @@ static void tmio_mmc_clk_stop(struct tmio_mmc_host *host)
static void tmio_mmc_clk_start(struct tmio_mmc_host *host)
{
+ struct mfd_cell *cell = host->pdev->dev.platform_data;
+ struct tmio_mmc_data *pdata = cell->driver_data;
+
sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, 0x0100 |
sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL));
msleep(10);
+ /* see comment in tmio_mmc_clk_stop above */
+ if (pdata->flags & TMIO_MMC_SDIO_IRQ)
+ disable_irq(host->irq);
sd_ctrl_write16(host, CTL_CLK_AND_WAIT_CTL, 0x0100);
msleep(10);
+ if (pdata->flags & TMIO_MMC_SDIO_IRQ) {
+ tmio_mmc_enable_sdio_irq(host->mmc, host->sdio_irq_enabled);
+ enable_irq(host->irq);
+ }
}
static void reset(struct tmio_mmc_host *host)
@@ -82,15 +352,60 @@ static void reset(struct tmio_mmc_host *host)
msleep(10);
}
+static void tmio_mmc_reset_work(struct work_struct *work)
+{
+ struct tmio_mmc_host *host = container_of(work, struct tmio_mmc_host,
+ delayed_reset_work.work);
+ struct mmc_request *mrq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&host->lock, flags);
+ mrq = host->mrq;
+
+ /* request already finished */
+ if (!mrq
+ || time_is_after_jiffies(host->last_req_ts +
+ msecs_to_jiffies(2000))) {
+ spin_unlock_irqrestore(&host->lock, flags);
+ return;
+ }
+
+ dev_warn(&host->pdev->dev,
+ "timeout waiting for hardware interrupt (CMD%u)\n",
+ mrq->cmd->opcode);
+
+ if (host->data)
+ host->data->error = -ETIMEDOUT;
+ else if (host->cmd)
+ host->cmd->error = -ETIMEDOUT;
+ else
+ mrq->cmd->error = -ETIMEDOUT;
+
+ host->cmd = NULL;
+ host->data = NULL;
+ host->mrq = NULL;
+
+ spin_unlock_irqrestore(&host->lock, flags);
+
+ reset(host);
+
+ mmc_request_done(host->mmc, mrq);
+}
+
static void
tmio_mmc_finish_request(struct tmio_mmc_host *host)
{
struct mmc_request *mrq = host->mrq;
+ if (!mrq)
+ return;
+
host->mrq = NULL;
host->cmd = NULL;
host->data = NULL;
+ cancel_delayed_work(&host->delayed_reset_work);
+
mmc_request_done(host->mmc, mrq);
}
@@ -200,6 +515,7 @@ static void tmio_mmc_pio_irq(struct tmio_mmc_host *host)
return;
}
+/* needs to be called with host->lock held */
static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host)
{
struct mmc_data *data = host->data;
@@ -233,6 +549,8 @@ static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host)
if (data->flags & MMC_DATA_READ) {
if (!host->chan_rx)
disable_mmc_irqs(host, TMIO_MASK_READOP);
+ else
+ tmio_check_bounce_buffer(host);
dev_dbg(&host->pdev->dev, "Complete Rx request %p\n",
host->mrq);
} else {
@@ -254,10 +572,12 @@ static void tmio_mmc_do_data_irq(struct tmio_mmc_host *host)
static void tmio_mmc_data_irq(struct tmio_mmc_host *host)
{
- struct mmc_data *data = host->data;
+ struct mmc_data *data;
+ spin_lock(&host->lock);
+ data = host->data;
if (!data)
- return;
+ goto out;
if (host->chan_tx && (data->flags & MMC_DATA_WRITE)) {
/*
@@ -278,6 +598,8 @@ static void tmio_mmc_data_irq(struct tmio_mmc_host *host)
} else {
tmio_mmc_do_data_irq(host);
}
+out:
+ spin_unlock(&host->lock);
}
static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host,
@@ -286,9 +608,11 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host,
struct mmc_command *cmd = host->cmd;
int i, addr;
+ spin_lock(&host->lock);
+
if (!host->cmd) {
pr_debug("Spurious CMD irq\n");
- return;
+ goto out;
}
host->cmd = NULL;
@@ -324,8 +648,7 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host,
if (!host->chan_rx)
enable_mmc_irqs(host, TMIO_MASK_READOP);
} else {
- struct dma_chan *chan = host->chan_tx;
- if (!chan)
+ if (!host->chan_tx)
enable_mmc_irqs(host, TMIO_MASK_WRITEOP);
else
tasklet_schedule(&host->dma_issue);
@@ -334,13 +657,19 @@ static void tmio_mmc_cmd_irq(struct tmio_mmc_host *host,
tmio_mmc_finish_request(host);
}
+out:
+ spin_unlock(&host->lock);
+
return;
}
static irqreturn_t tmio_mmc_irq(int irq, void *devid)
{
struct tmio_mmc_host *host = devid;
+ struct mfd_cell *cell = host->pdev->dev.platform_data;
+ struct tmio_mmc_data *pdata = cell->driver_data;
unsigned int ireg, irq_mask, status;
+ unsigned int sdio_ireg, sdio_irq_mask, sdio_status;
pr_debug("MMC IRQ begin\n");
@@ -348,6 +677,29 @@ static irqreturn_t tmio_mmc_irq(int irq, void *devid)
irq_mask = sd_ctrl_read32(host, CTL_IRQ_MASK);
ireg = status & TMIO_MASK_IRQ & ~irq_mask;
+ sdio_ireg = 0;
+ if (!ireg && pdata->flags & TMIO_MMC_SDIO_IRQ) {
+ sdio_status = sd_ctrl_read16(host, CTL_SDIO_STATUS);
+ sdio_irq_mask = sd_ctrl_read16(host, CTL_SDIO_IRQ_MASK);
+ sdio_ireg = sdio_status & TMIO_SDIO_MASK_ALL & ~sdio_irq_mask;
+
+ sd_ctrl_write16(host, CTL_SDIO_STATUS, sdio_status & ~TMIO_SDIO_MASK_ALL);
+
+ if (sdio_ireg && !host->sdio_irq_enabled) {
+ pr_warning("tmio_mmc: Spurious SDIO IRQ, disabling! 0x%04x 0x%04x 0x%04x\n",
+ sdio_status, sdio_irq_mask, sdio_ireg);
+ tmio_mmc_enable_sdio_irq(host->mmc, 0);
+ goto out;
+ }
+
+ if (host->mmc->caps & MMC_CAP_SDIO_IRQ &&
+ sdio_ireg & TMIO_SDIO_STAT_IOIRQ)
+ mmc_signal_sdio_irq(host->mmc);
+
+ if (sdio_ireg)
+ goto out;
+ }
+
pr_debug_status(status);
pr_debug_status(ireg);
@@ -375,8 +727,10 @@ static irqreturn_t tmio_mmc_irq(int irq, void *devid)
*/
/* Command completion */
- if (ireg & TMIO_MASK_CMD) {
- ack_mmc_irqs(host, TMIO_MASK_CMD);
+ if (ireg & (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT)) {
+ ack_mmc_irqs(host,
+ TMIO_STAT_CMDRESPEND |
+ TMIO_STAT_CMDTIMEOUT);
tmio_mmc_cmd_irq(host, status);
}
@@ -407,6 +761,16 @@ out:
}
#ifdef CONFIG_TMIO_MMC_DMA
+static void tmio_check_bounce_buffer(struct tmio_mmc_host *host)
+{
+ if (host->sg_ptr == &host->bounce_sg) {
+ unsigned long flags;
+ void *sg_vaddr = tmio_mmc_kmap_atomic(host->sg_orig, &flags);
+ memcpy(sg_vaddr, host->bounce_buf, host->bounce_sg.length);
+ tmio_mmc_kunmap_atomic(sg_vaddr, &flags);
+ }
+}
+
static void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable)
{
#if defined(CONFIG_SUPERH) || defined(CONFIG_ARCH_SHMOBILE)
@@ -427,12 +791,39 @@ static void tmio_dma_complete(void *arg)
enable_mmc_irqs(host, TMIO_STAT_DATAEND);
}
-static int tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
+static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
{
- struct scatterlist *sg = host->sg_ptr;
+ struct scatterlist *sg = host->sg_ptr, *sg_tmp;
struct dma_async_tx_descriptor *desc = NULL;
struct dma_chan *chan = host->chan_rx;
- int ret;
+ struct mfd_cell *cell = host->pdev->dev.platform_data;
+ struct tmio_mmc_data *pdata = cell->driver_data;
+ dma_cookie_t cookie;
+ int ret, i;
+ bool aligned = true, multiple = true;
+ unsigned int align = (1 << pdata->dma->alignment_shift) - 1;
+
+ for_each_sg(sg, sg_tmp, host->sg_len, i) {
+ if (sg_tmp->offset & align)
+ aligned = false;
+ if (sg_tmp->length & align) {
+ multiple = false;
+ break;
+ }
+ }
+
+ if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE ||
+ align >= MAX_ALIGN)) || !multiple) {
+ ret = -EINVAL;
+ goto pio;
+ }
+
+ /* The only sg element can be unaligned, use our bounce buffer then */
+ if (!aligned) {
+ sg_init_one(&host->bounce_sg, host->bounce_buf, sg->length);
+ host->sg_ptr = &host->bounce_sg;
+ sg = host->sg_ptr;
+ }
ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, DMA_FROM_DEVICE);
if (ret > 0) {
@@ -442,21 +833,21 @@ static int tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
}
if (desc) {
- host->desc = desc;
desc->callback = tmio_dma_complete;
desc->callback_param = host;
- host->cookie = desc->tx_submit(desc);
- if (host->cookie < 0) {
- host->desc = NULL;
- ret = host->cookie;
+ cookie = desc->tx_submit(desc);
+ if (cookie < 0) {
+ desc = NULL;
+ ret = cookie;
} else {
chan->device->device_issue_pending(chan);
}
}
dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n",
- __func__, host->sg_len, ret, host->cookie, host->mrq);
+ __func__, host->sg_len, ret, cookie, host->mrq);
- if (!host->desc) {
+pio:
+ if (!desc) {
/* DMA failed, fall back to PIO */
if (ret >= 0)
ret = -EIO;
@@ -471,24 +862,49 @@ static int tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
dev_warn(&host->pdev->dev,
"DMA failed: %d, falling back to PIO\n", ret);
tmio_mmc_enable_dma(host, false);
- reset(host);
- /* Fail this request, let above layers recover */
- host->mrq->cmd->error = ret;
- tmio_mmc_finish_request(host);
}
dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d, sg[%d]\n", __func__,
- desc, host->cookie, host->sg_len);
-
- return ret > 0 ? 0 : ret;
+ desc, cookie, host->sg_len);
}
-static int tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
+static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
{
- struct scatterlist *sg = host->sg_ptr;
+ struct scatterlist *sg = host->sg_ptr, *sg_tmp;
struct dma_async_tx_descriptor *desc = NULL;
struct dma_chan *chan = host->chan_tx;
- int ret;
+ struct mfd_cell *cell = host->pdev->dev.platform_data;
+ struct tmio_mmc_data *pdata = cell->driver_data;
+ dma_cookie_t cookie;
+ int ret, i;
+ bool aligned = true, multiple = true;
+ unsigned int align = (1 << pdata->dma->alignment_shift) - 1;
+
+ for_each_sg(sg, sg_tmp, host->sg_len, i) {
+ if (sg_tmp->offset & align)
+ aligned = false;
+ if (sg_tmp->length & align) {
+ multiple = false;
+ break;
+ }
+ }
+
+ if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE ||
+ align >= MAX_ALIGN)) || !multiple) {
+ ret = -EINVAL;
+ goto pio;
+ }
+
+ /* The only sg element can be unaligned, use our bounce buffer then */
+ if (!aligned) {
+ unsigned long flags;
+ void *sg_vaddr = tmio_mmc_kmap_atomic(sg, &flags);
+ sg_init_one(&host->bounce_sg, host->bounce_buf, sg->length);
+ memcpy(host->bounce_buf, sg_vaddr, host->bounce_sg.length);
+ tmio_mmc_kunmap_atomic(sg_vaddr, &flags);
+ host->sg_ptr = &host->bounce_sg;
+ sg = host->sg_ptr;
+ }
ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, DMA_TO_DEVICE);
if (ret > 0) {
@@ -498,19 +914,19 @@ static int tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
}
if (desc) {
- host->desc = desc;
desc->callback = tmio_dma_complete;
desc->callback_param = host;
- host->cookie = desc->tx_submit(desc);
- if (host->cookie < 0) {
- host->desc = NULL;
- ret = host->cookie;
+ cookie = desc->tx_submit(desc);
+ if (cookie < 0) {
+ desc = NULL;
+ ret = cookie;
}
}
dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n",
- __func__, host->sg_len, ret, host->cookie, host->mrq);
+ __func__, host->sg_len, ret, cookie, host->mrq);
- if (!host->desc) {
+pio:
+ if (!desc) {
/* DMA failed, fall back to PIO */
if (ret >= 0)
ret = -EIO;
@@ -525,30 +941,22 @@ static int tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
dev_warn(&host->pdev->dev,
"DMA failed: %d, falling back to PIO\n", ret);
tmio_mmc_enable_dma(host, false);
- reset(host);
- /* Fail this request, let above layers recover */
- host->mrq->cmd->error = ret;
- tmio_mmc_finish_request(host);
}
dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d\n", __func__,
- desc, host->cookie);
-
- return ret > 0 ? 0 : ret;
+ desc, cookie);
}
-static int tmio_mmc_start_dma(struct tmio_mmc_host *host,
+static void tmio_mmc_start_dma(struct tmio_mmc_host *host,
struct mmc_data *data)
{
if (data->flags & MMC_DATA_READ) {
if (host->chan_rx)
- return tmio_mmc_start_dma_rx(host);
+ tmio_mmc_start_dma_rx(host);
} else {
if (host->chan_tx)
- return tmio_mmc_start_dma_tx(host);
+ tmio_mmc_start_dma_tx(host);
}
-
- return 0;
}
static void tmio_issue_tasklet_fn(unsigned long priv)
@@ -562,6 +970,12 @@ static void tmio_issue_tasklet_fn(unsigned long priv)
static void tmio_tasklet_fn(unsigned long arg)
{
struct tmio_mmc_host *host = (struct tmio_mmc_host *)arg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&host->lock, flags);
+
+ if (!host->data)
+ goto out;
if (host->data->flags & MMC_DATA_READ)
dma_unmap_sg(&host->pdev->dev, host->sg_ptr, host->dma_sglen,
@@ -571,6 +985,8 @@ static void tmio_tasklet_fn(unsigned long arg)
DMA_TO_DEVICE);
tmio_mmc_do_data_irq(host);
+out:
+ spin_unlock_irqrestore(&host->lock, flags);
}
/* It might be necessary to make filter MFD specific */
@@ -584,9 +1000,6 @@ static bool tmio_mmc_filter(struct dma_chan *chan, void *arg)
static void tmio_mmc_request_dma(struct tmio_mmc_host *host,
struct tmio_mmc_data *pdata)
{
- host->cookie = -EINVAL;
- host->desc = NULL;
-
/* We can only either use DMA for both Tx and Rx or not use it at all */
if (pdata->dma) {
dma_cap_mask_t mask;
@@ -632,15 +1045,15 @@ static void tmio_mmc_release_dma(struct tmio_mmc_host *host)
host->chan_rx = NULL;
dma_release_channel(chan);
}
-
- host->cookie = -EINVAL;
- host->desc = NULL;
}
#else
-static int tmio_mmc_start_dma(struct tmio_mmc_host *host,
+static void tmio_check_bounce_buffer(struct tmio_mmc_host *host)
+{
+}
+
+static void tmio_mmc_start_dma(struct tmio_mmc_host *host,
struct mmc_data *data)
{
- return 0;
}
static void tmio_mmc_request_dma(struct tmio_mmc_host *host,
@@ -682,7 +1095,9 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host,
sd_ctrl_write16(host, CTL_SD_XFER_LEN, data->blksz);
sd_ctrl_write16(host, CTL_XFER_BLK_COUNT, data->blocks);
- return tmio_mmc_start_dma(host, data);
+ tmio_mmc_start_dma(host, data);
+
+ return 0;
}
/* Process requests from the MMC layer */
@@ -694,6 +1109,8 @@ static void tmio_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
if (host->mrq)
pr_debug("request not null\n");
+ host->last_req_ts = jiffies;
+ wmb();
host->mrq = mrq;
if (mrq->data) {
@@ -703,10 +1120,14 @@ static void tmio_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
}
ret = tmio_mmc_start_command(host, mrq->cmd);
- if (!ret)
+ if (!ret) {
+ schedule_delayed_work(&host->delayed_reset_work,
+ msecs_to_jiffies(2000));
return;
+ }
fail:
+ host->mrq = NULL;
mrq->cmd->error = ret;
mmc_request_done(mmc, mrq);
}
@@ -780,6 +1201,7 @@ static const struct mmc_host_ops tmio_mmc_ops = {
.set_ios = tmio_mmc_set_ios,
.get_ro = tmio_mmc_get_ro,
.get_cd = tmio_mmc_get_cd,
+ .enable_sdio_irq = tmio_mmc_enable_sdio_irq,
};
#ifdef CONFIG_PM
@@ -864,10 +1286,15 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev)
goto host_free;
mmc->ops = &tmio_mmc_ops;
- mmc->caps = MMC_CAP_4_BIT_DATA;
- mmc->caps |= pdata->capabilities;
+ mmc->caps = MMC_CAP_4_BIT_DATA | pdata->capabilities;
mmc->f_max = pdata->hclk;
mmc->f_min = mmc->f_max / 512;
+ mmc->max_segs = 32;
+ mmc->max_blk_size = 512;
+ mmc->max_blk_count = (PAGE_CACHE_SIZE / mmc->max_blk_size) *
+ mmc->max_segs;
+ mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count;
+ mmc->max_seg_size = mmc->max_req_size;
if (pdata->ocr_mask)
mmc->ocr_avail = pdata->ocr_mask;
else
@@ -890,12 +1317,19 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev)
goto cell_disable;
disable_mmc_irqs(host, TMIO_MASK_ALL);
+ if (pdata->flags & TMIO_MMC_SDIO_IRQ)
+ tmio_mmc_enable_sdio_irq(mmc, 0);
ret = request_irq(host->irq, tmio_mmc_irq, IRQF_DISABLED |
IRQF_TRIGGER_FALLING, dev_name(&dev->dev), host);
if (ret)
goto cell_disable;
+ spin_lock_init(&host->lock);
+
+ /* Init delayed work for request timeouts */
+ INIT_DELAYED_WORK(&host->delayed_reset_work, tmio_mmc_reset_work);
+
/* See if we also get DMA */
tmio_mmc_request_dma(host, pdata);
@@ -934,6 +1368,7 @@ static int __devexit tmio_mmc_remove(struct platform_device *dev)
if (mmc) {
struct tmio_mmc_host *host = mmc_priv(mmc);
mmc_remove_host(mmc);
+ cancel_delayed_work_sync(&host->delayed_reset_work);
tmio_mmc_release_dma(host);
free_irq(host->irq, host);
if (cell->disable)
diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h
deleted file mode 100644
index 0fedc78..0000000
--- a/drivers/mmc/host/tmio_mmc.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Definitons for use with the tmio_mmc.c
- *
- * (c) 2004 Ian Molton <spyro@f2s.com>
- * (c) 2007 Ian Molton <spyro@f2s.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/highmem.h>
-#include <linux/interrupt.h>
-#include <linux/dmaengine.h>
-
-#define CTL_SD_CMD 0x00
-#define CTL_ARG_REG 0x04
-#define CTL_STOP_INTERNAL_ACTION 0x08
-#define CTL_XFER_BLK_COUNT 0xa
-#define CTL_RESPONSE 0x0c
-#define CTL_STATUS 0x1c
-#define CTL_IRQ_MASK 0x20
-#define CTL_SD_CARD_CLK_CTL 0x24
-#define CTL_SD_XFER_LEN 0x26
-#define CTL_SD_MEM_CARD_OPT 0x28
-#define CTL_SD_ERROR_DETAIL_STATUS 0x2c
-#define CTL_SD_DATA_PORT 0x30
-#define CTL_TRANSACTION_CTL 0x34
-#define CTL_RESET_SD 0xe0
-#define CTL_SDIO_REGS 0x100
-#define CTL_CLK_AND_WAIT_CTL 0x138
-#define CTL_RESET_SDIO 0x1e0
-
-/* Definitions for values the CTRL_STATUS register can take. */
-#define TMIO_STAT_CMDRESPEND 0x00000001
-#define TMIO_STAT_DATAEND 0x00000004
-#define TMIO_STAT_CARD_REMOVE 0x00000008
-#define TMIO_STAT_CARD_INSERT 0x00000010
-#define TMIO_STAT_SIGSTATE 0x00000020
-#define TMIO_STAT_WRPROTECT 0x00000080
-#define TMIO_STAT_CARD_REMOVE_A 0x00000100
-#define TMIO_STAT_CARD_INSERT_A 0x00000200
-#define TMIO_STAT_SIGSTATE_A 0x00000400
-#define TMIO_STAT_CMD_IDX_ERR 0x00010000
-#define TMIO_STAT_CRCFAIL 0x00020000
-#define TMIO_STAT_STOPBIT_ERR 0x00040000
-#define TMIO_STAT_DATATIMEOUT 0x00080000
-#define TMIO_STAT_RXOVERFLOW 0x00100000
-#define TMIO_STAT_TXUNDERRUN 0x00200000
-#define TMIO_STAT_CMDTIMEOUT 0x00400000
-#define TMIO_STAT_RXRDY 0x01000000
-#define TMIO_STAT_TXRQ 0x02000000
-#define TMIO_STAT_ILL_FUNC 0x20000000
-#define TMIO_STAT_CMD_BUSY 0x40000000
-#define TMIO_STAT_ILL_ACCESS 0x80000000
-
-/* Define some IRQ masks */
-/* This is the mask used at reset by the chip */
-#define TMIO_MASK_ALL 0x837f031d
-#define TMIO_MASK_READOP (TMIO_STAT_RXRDY | TMIO_STAT_DATAEND)
-#define TMIO_MASK_WRITEOP (TMIO_STAT_TXRQ | TMIO_STAT_DATAEND)
-#define TMIO_MASK_CMD (TMIO_STAT_CMDRESPEND | TMIO_STAT_CMDTIMEOUT | \
- TMIO_STAT_CARD_REMOVE | TMIO_STAT_CARD_INSERT)
-#define TMIO_MASK_IRQ (TMIO_MASK_READOP | TMIO_MASK_WRITEOP | TMIO_MASK_CMD)
-
-
-#define enable_mmc_irqs(host, i) \
- do { \
- u32 mask;\
- mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \
- mask &= ~((i) & TMIO_MASK_IRQ); \
- sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \
- } while (0)
-
-#define disable_mmc_irqs(host, i) \
- do { \
- u32 mask;\
- mask = sd_ctrl_read32((host), CTL_IRQ_MASK); \
- mask |= ((i) & TMIO_MASK_IRQ); \
- sd_ctrl_write32((host), CTL_IRQ_MASK, mask); \
- } while (0)
-
-#define ack_mmc_irqs(host, i) \
- do { \
- sd_ctrl_write32((host), CTL_STATUS, ~(i)); \
- } while (0)
-
-
-struct tmio_mmc_host {
- void __iomem *ctl;
- unsigned long bus_shift;
- struct mmc_command *cmd;
- struct mmc_request *mrq;
- struct mmc_data *data;
- struct mmc_host *mmc;
- int irq;
-
- /* Callbacks for clock / power control */
- void (*set_pwr)(struct platform_device *host, int state);
- void (*set_clk_div)(struct platform_device *host, int state);
-
- /* pio related stuff */
- struct scatterlist *sg_ptr;
- unsigned int sg_len;
- unsigned int sg_off;
-
- struct platform_device *pdev;
-
- /* DMA support */
- struct dma_chan *chan_rx;
- struct dma_chan *chan_tx;
- struct tasklet_struct dma_complete;
- struct tasklet_struct dma_issue;
-#ifdef CONFIG_TMIO_MMC_DMA
- struct dma_async_tx_descriptor *desc;
- unsigned int dma_sglen;
- dma_cookie_t cookie;
-#endif
-};
-
-#include <linux/io.h>
-
-static inline u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr)
-{
- return readw(host->ctl + (addr << host->bus_shift));
-}
-
-static inline void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr,
- u16 *buf, int count)
-{
- readsw(host->ctl + (addr << host->bus_shift), buf, count);
-}
-
-static inline u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr)
-{
- return readw(host->ctl + (addr << host->bus_shift)) |
- readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16;
-}
-
-static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr,
- u16 val)
-{
- writew(val, host->ctl + (addr << host->bus_shift));
-}
-
-static inline void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr,
- u16 *buf, int count)
-{
- writesw(host->ctl + (addr << host->bus_shift), buf, count);
-}
-
-static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr,
- u32 val)
-{
- writew(val, host->ctl + (addr << host->bus_shift));
- writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift));
-}
-
-#include <linux/scatterlist.h>
-#include <linux/blkdev.h>
-
-static inline void tmio_mmc_init_sg(struct tmio_mmc_host *host,
- struct mmc_data *data)
-{
- host->sg_len = data->sg_len;
- host->sg_ptr = data->sg;
- host->sg_off = 0;
-}
-
-static inline int tmio_mmc_next_sg(struct tmio_mmc_host *host)
-{
- host->sg_ptr = sg_next(host->sg_ptr);
- host->sg_off = 0;
- return --host->sg_len;
-}
-
-static inline char *tmio_mmc_kmap_atomic(struct scatterlist *sg,
- unsigned long *flags)
-{
- local_irq_save(*flags);
- return kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset;
-}
-
-static inline void tmio_mmc_kunmap_atomic(void *virt,
- unsigned long *flags)
-{
- kunmap_atomic(virt, KM_BIO_SRC_IRQ);
- local_irq_restore(*flags);
-}
-
-#ifdef CONFIG_MMC_DEBUG
-
-#define STATUS_TO_TEXT(a) \
- do { \
- if (status & TMIO_STAT_##a) \
- printk(#a); \
- } while (0)
-
-void pr_debug_status(u32 status)
-{
- printk(KERN_DEBUG "status: %08x = ", status);
- STATUS_TO_TEXT(CARD_REMOVE);
- STATUS_TO_TEXT(CARD_INSERT);
- STATUS_TO_TEXT(SIGSTATE);
- STATUS_TO_TEXT(WRPROTECT);
- STATUS_TO_TEXT(CARD_REMOVE_A);
- STATUS_TO_TEXT(CARD_INSERT_A);
- STATUS_TO_TEXT(SIGSTATE_A);
- STATUS_TO_TEXT(CMD_IDX_ERR);
- STATUS_TO_TEXT(STOPBIT_ERR);
- STATUS_TO_TEXT(ILL_FUNC);
- STATUS_TO_TEXT(CMD_BUSY);
- STATUS_TO_TEXT(CMDRESPEND);
- STATUS_TO_TEXT(DATAEND);
- STATUS_TO_TEXT(CRCFAIL);
- STATUS_TO_TEXT(DATATIMEOUT);
- STATUS_TO_TEXT(CMDTIMEOUT);
- STATUS_TO_TEXT(RXOVERFLOW);
- STATUS_TO_TEXT(TXUNDERRUN);
- STATUS_TO_TEXT(RXRDY);
- STATUS_TO_TEXT(TXRQ);
- STATUS_TO_TEXT(ILL_ACCESS);
- printk("\n");
-}
-
-#else
-#define pr_debug_status(s) do { } while (0)
-#endif
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index e6539cb..9583cbc 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -16,6 +16,7 @@
#include <linux/kdev_t.h>
#include <linux/idr.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "rtc-core.h"
@@ -152,6 +153,18 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
spin_lock_init(&rtc->irq_task_lock);
init_waitqueue_head(&rtc->irq_queue);
+ /* Init timerqueue */
+ timerqueue_init_head(&rtc->timerqueue);
+ INIT_WORK(&rtc->irqwork, rtc_timer_do_work);
+ /* Init aie timer */
+ rtc_timer_init(&rtc->aie_timer, rtc_aie_update_irq, (void *)rtc);
+ /* Init uie timer */
+ rtc_timer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, (void *)rtc);
+ /* Init pie timer */
+ hrtimer_init(&rtc->pie_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rtc->pie_timer.function = rtc_pie_update_irq;
+ rtc->pie_enabled = 0;
+
strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE);
dev_set_name(&rtc->dev, "rtc%d", id);
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index a0c8162..90384b9 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -14,15 +14,11 @@
#include <linux/rtc.h>
#include <linux/sched.h>
#include <linux/log2.h>
+#include <linux/workqueue.h>
-int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
+static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
{
int err;
-
- err = mutex_lock_interruptible(&rtc->ops_lock);
- if (err)
- return err;
-
if (!rtc->ops)
err = -ENODEV;
else if (!rtc->ops->read_time)
@@ -31,7 +27,18 @@ int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
memset(tm, 0, sizeof(struct rtc_time));
err = rtc->ops->read_time(rtc->dev.parent, tm);
}
+ return err;
+}
+
+int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
+{
+ int err;
+ err = mutex_lock_interruptible(&rtc->ops_lock);
+ if (err)
+ return err;
+
+ err = __rtc_read_time(rtc, tm);
mutex_unlock(&rtc->ops_lock);
return err;
}
@@ -106,188 +113,54 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
}
EXPORT_SYMBOL_GPL(rtc_set_mmss);
-static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
+int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
{
int err;
err = mutex_lock_interruptible(&rtc->ops_lock);
if (err)
return err;
-
- if (rtc->ops == NULL)
- err = -ENODEV;
- else if (!rtc->ops->read_alarm)
- err = -EINVAL;
- else {
- memset(alarm, 0, sizeof(struct rtc_wkalrm));
- err = rtc->ops->read_alarm(rtc->dev.parent, alarm);
- }
-
+ alarm->enabled = rtc->aie_timer.enabled;
+ if (alarm->enabled)
+ alarm->time = rtc_ktime_to_tm(rtc->aie_timer.node.expires);
mutex_unlock(&rtc->ops_lock);
- return err;
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(rtc_read_alarm);
-int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
+int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
{
+ struct rtc_time tm;
+ long now, scheduled;
int err;
- struct rtc_time before, now;
- int first_time = 1;
- unsigned long t_now, t_alm;
- enum { none, day, month, year } missing = none;
- unsigned days;
-
- /* The lower level RTC driver may return -1 in some fields,
- * creating invalid alarm->time values, for reasons like:
- *
- * - The hardware may not be capable of filling them in;
- * many alarms match only on time-of-day fields, not
- * day/month/year calendar data.
- *
- * - Some hardware uses illegal values as "wildcard" match
- * values, which non-Linux firmware (like a BIOS) may try
- * to set up as e.g. "alarm 15 minutes after each hour".
- * Linux uses only oneshot alarms.
- *
- * When we see that here, we deal with it by using values from
- * a current RTC timestamp for any missing (-1) values. The
- * RTC driver prevents "periodic alarm" modes.
- *
- * But this can be racey, because some fields of the RTC timestamp
- * may have wrapped in the interval since we read the RTC alarm,
- * which would lead to us inserting inconsistent values in place
- * of the -1 fields.
- *
- * Reading the alarm and timestamp in the reverse sequence
- * would have the same race condition, and not solve the issue.
- *
- * So, we must first read the RTC timestamp,
- * then read the RTC alarm value,
- * and then read a second RTC timestamp.
- *
- * If any fields of the second timestamp have changed
- * when compared with the first timestamp, then we know
- * our timestamp may be inconsistent with that used by
- * the low-level rtc_read_alarm_internal() function.
- *
- * So, when the two timestamps disagree, we just loop and do
- * the process again to get a fully consistent set of values.
- *
- * This could all instead be done in the lower level driver,
- * but since more than one lower level RTC implementation needs it,
- * then it's probably best best to do it here instead of there..
- */
- /* Get the "before" timestamp */
- err = rtc_read_time(rtc, &before);
- if (err < 0)
+ err = rtc_valid_tm(&alarm->time);
+ if (err)
return err;
- do {
- if (!first_time)
- memcpy(&before, &now, sizeof(struct rtc_time));
- first_time = 0;
-
- /* get the RTC alarm values, which may be incomplete */
- err = rtc_read_alarm_internal(rtc, alarm);
- if (err)
- return err;
- if (!alarm->enabled)
- return 0;
-
- /* full-function RTCs won't have such missing fields */
- if (rtc_valid_tm(&alarm->time) == 0)
- return 0;
-
- /* get the "after" timestamp, to detect wrapped fields */
- err = rtc_read_time(rtc, &now);
- if (err < 0)
- return err;
-
- /* note that tm_sec is a "don't care" value here: */
- } while ( before.tm_min != now.tm_min
- || before.tm_hour != now.tm_hour
- || before.tm_mon != now.tm_mon
- || before.tm_year != now.tm_year);
-
- /* Fill in the missing alarm fields using the timestamp; we
- * know there's at least one since alarm->time is invalid.
- */
- if (alarm->time.tm_sec == -1)
- alarm->time.tm_sec = now.tm_sec;
- if (alarm->time.tm_min == -1)
- alarm->time.tm_min = now.tm_min;
- if (alarm->time.tm_hour == -1)
- alarm->time.tm_hour = now.tm_hour;
-
- /* For simplicity, only support date rollover for now */
- if (alarm->time.tm_mday == -1) {
- alarm->time.tm_mday = now.tm_mday;
- missing = day;
- }
- if (alarm->time.tm_mon == -1) {
- alarm->time.tm_mon = now.tm_mon;
- if (missing == none)
- missing = month;
- }
- if (alarm->time.tm_year == -1) {
- alarm->time.tm_year = now.tm_year;
- if (missing == none)
- missing = year;
- }
-
- /* with luck, no rollover is needed */
- rtc_tm_to_time(&now, &t_now);
- rtc_tm_to_time(&alarm->time, &t_alm);
- if (t_now < t_alm)
- goto done;
-
- switch (missing) {
+ rtc_tm_to_time(&alarm->time, &scheduled);
- /* 24 hour rollover ... if it's now 10am Monday, an alarm that
- * that will trigger at 5am will do so at 5am Tuesday, which
- * could also be in the next month or year. This is a common
- * case, especially for PCs.
- */
- case day:
- dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day");
- t_alm += 24 * 60 * 60;
- rtc_time_to_tm(t_alm, &alarm->time);
- break;
-
- /* Month rollover ... if it's the 31th, an alarm on the 3rd will
- * be next month. An alarm matching on the 30th, 29th, or 28th
- * may end up in the month after that! Many newer PCs support
- * this type of alarm.
+ /* Make sure we're not setting alarms in the past */
+ err = __rtc_read_time(rtc, &tm);
+ rtc_tm_to_time(&tm, &now);
+ if (scheduled <= now)
+ return -ETIME;
+ /*
+ * XXX - We just checked to make sure the alarm time is not
+ * in the past, but there is still a race window where if
+ * the is alarm set for the next second and the second ticks
+ * over right here, before we set the alarm.
*/
- case month:
- dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month");
- do {
- if (alarm->time.tm_mon < 11)
- alarm->time.tm_mon++;
- else {
- alarm->time.tm_mon = 0;
- alarm->time.tm_year++;
- }
- days = rtc_month_days(alarm->time.tm_mon,
- alarm->time.tm_year);
- } while (days < alarm->time.tm_mday);
- break;
-
- /* Year rollover ... easy except for leap years! */
- case year:
- dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year");
- do {
- alarm->time.tm_year++;
- } while (rtc_valid_tm(&alarm->time) != 0);
- break;
-
- default:
- dev_warn(&rtc->dev, "alarm rollover not handled\n");
- }
-done:
- return 0;
+ if (!rtc->ops)
+ err = -ENODEV;
+ else if (!rtc->ops->set_alarm)
+ err = -EINVAL;
+ else
+ err = rtc->ops->set_alarm(rtc->dev.parent, alarm);
+
+ return err;
}
-EXPORT_SYMBOL_GPL(rtc_read_alarm);
int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
{
@@ -300,16 +173,18 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
err = mutex_lock_interruptible(&rtc->ops_lock);
if (err)
return err;
-
- if (!rtc->ops)
- err = -ENODEV;
- else if (!rtc->ops->set_alarm)
- err = -EINVAL;
- else
- err = rtc->ops->set_alarm(rtc->dev.parent, alarm);
-
+ if (rtc->aie_timer.enabled) {
+ rtc_timer_remove(rtc, &rtc->aie_timer);
+ rtc->aie_timer.enabled = 0;
+ }
+ rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time);
+ rtc->aie_timer.period = ktime_set(0, 0);
+ if (alarm->enabled) {
+ rtc->aie_timer.enabled = 1;
+ rtc_timer_enqueue(rtc, &rtc->aie_timer);
+ }
mutex_unlock(&rtc->ops_lock);
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(rtc_set_alarm);
@@ -319,6 +194,16 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled)
if (err)
return err;
+ if (rtc->aie_timer.enabled != enabled) {
+ if (enabled) {
+ rtc->aie_timer.enabled = 1;
+ rtc_timer_enqueue(rtc, &rtc->aie_timer);
+ } else {
+ rtc_timer_remove(rtc, &rtc->aie_timer);
+ rtc->aie_timer.enabled = 0;
+ }
+ }
+
if (!rtc->ops)
err = -ENODEV;
else if (!rtc->ops->alarm_irq_enable)
@@ -337,52 +222,53 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
if (err)
return err;
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
- if (enabled == 0 && rtc->uie_irq_active) {
- mutex_unlock(&rtc->ops_lock);
- return rtc_dev_update_irq_enable_emul(rtc, enabled);
+ /* make sure we're changing state */
+ if (rtc->uie_rtctimer.enabled == enabled)
+ goto out;
+
+ if (enabled) {
+ struct rtc_time tm;
+ ktime_t now, onesec;
+
+ __rtc_read_time(rtc, &tm);
+ onesec = ktime_set(1, 0);
+ now = rtc_tm_to_ktime(tm);
+ rtc->uie_rtctimer.node.expires = ktime_add(now, onesec);
+ rtc->uie_rtctimer.period = ktime_set(1, 0);
+ rtc->uie_rtctimer.enabled = 1;
+ rtc_timer_enqueue(rtc, &rtc->uie_rtctimer);
+ } else {
+ rtc_timer_remove(rtc, &rtc->uie_rtctimer);
+ rtc->uie_rtctimer.enabled = 0;
}
-#endif
-
- if (!rtc->ops)
- err = -ENODEV;
- else if (!rtc->ops->update_irq_enable)
- err = -EINVAL;
- else
- err = rtc->ops->update_irq_enable(rtc->dev.parent, enabled);
+out:
mutex_unlock(&rtc->ops_lock);
-
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
- /*
- * Enable emulation if the driver did not provide
- * the update_irq_enable function pointer or if returned
- * -EINVAL to signal that it has been configured without
- * interrupts or that are not available at the moment.
- */
- if (err == -EINVAL)
- err = rtc_dev_update_irq_enable_emul(rtc, enabled);
-#endif
return err;
+
}
EXPORT_SYMBOL_GPL(rtc_update_irq_enable);
+
/**
- * rtc_update_irq - report RTC periodic, alarm, and/or update irqs
- * @rtc: the rtc device
- * @num: how many irqs are being reported (usually one)
- * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF
- * Context: any
+ * rtc_handle_legacy_irq - AIE, UIE and PIE event hook
+ * @rtc: pointer to the rtc device
+ *
+ * This function is called when an AIE, UIE or PIE mode interrupt
+ * has occured (or been emulated).
+ *
+ * Triggers the registered irq_task function callback.
*/
-void rtc_update_irq(struct rtc_device *rtc,
- unsigned long num, unsigned long events)
+static void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode)
{
unsigned long flags;
+ /* mark one irq of the appropriate mode */
spin_lock_irqsave(&rtc->irq_lock, flags);
- rtc->irq_data = (rtc->irq_data + (num << 8)) | events;
+ rtc->irq_data = (rtc->irq_data + (num << 8)) | (RTC_IRQF|mode);
spin_unlock_irqrestore(&rtc->irq_lock, flags);
+ /* call the task func */
spin_lock_irqsave(&rtc->irq_task_lock, flags);
if (rtc->irq_task)
rtc->irq_task->func(rtc->irq_task->private_data);
@@ -391,6 +277,69 @@ void rtc_update_irq(struct rtc_device *rtc,
wake_up_interruptible(&rtc->irq_queue);
kill_fasync(&rtc->async_queue, SIGIO, POLL_IN);
}
+
+
+/**
+ * rtc_aie_update_irq - AIE mode rtctimer hook
+ * @private: pointer to the rtc_device
+ *
+ * This functions is called when the aie_timer expires.
+ */
+void rtc_aie_update_irq(void *private)
+{
+ struct rtc_device *rtc = (struct rtc_device *)private;
+ rtc_handle_legacy_irq(rtc, 1, RTC_AF);
+}
+
+
+/**
+ * rtc_uie_update_irq - UIE mode rtctimer hook
+ * @private: pointer to the rtc_device
+ *
+ * This functions is called when the uie_timer expires.
+ */
+void rtc_uie_update_irq(void *private)
+{
+ struct rtc_device *rtc = (struct rtc_device *)private;
+ rtc_handle_legacy_irq(rtc, 1, RTC_UF);
+}
+
+
+/**
+ * rtc_pie_update_irq - PIE mode hrtimer hook
+ * @timer: pointer to the pie mode hrtimer
+ *
+ * This function is used to emulate PIE mode interrupts
+ * using an hrtimer. This function is called when the periodic
+ * hrtimer expires.
+ */
+enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer)
+{
+ struct rtc_device *rtc;
+ ktime_t period;
+ int count;
+ rtc = container_of(timer, struct rtc_device, pie_timer);
+
+ period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq);
+ count = hrtimer_forward_now(timer, period);
+
+ rtc_handle_legacy_irq(rtc, count, RTC_PF);
+
+ return HRTIMER_RESTART;
+}
+
+/**
+ * rtc_update_irq - Triggered when a RTC interrupt occurs.
+ * @rtc: the rtc device
+ * @num: how many irqs are being reported (usually one)
+ * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF
+ * Context: any
+ */
+void rtc_update_irq(struct rtc_device *rtc,
+ unsigned long num, unsigned long events)
+{
+ schedule_work(&rtc->irqwork);
+}
EXPORT_SYMBOL_GPL(rtc_update_irq);
static int __rtc_match(struct device *dev, void *data)
@@ -477,18 +426,20 @@ int rtc_irq_set_state(struct rtc_device *rtc, struct rtc_task *task, int enabled
int err = 0;
unsigned long flags;
- if (rtc->ops->irq_set_state == NULL)
- return -ENXIO;
-
spin_lock_irqsave(&rtc->irq_task_lock, flags);
if (rtc->irq_task != NULL && task == NULL)
err = -EBUSY;
if (rtc->irq_task != task)
err = -EACCES;
- spin_unlock_irqrestore(&rtc->irq_task_lock, flags);
- if (err == 0)
- err = rtc->ops->irq_set_state(rtc->dev.parent, enabled);
+ if (enabled) {
+ ktime_t period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq);
+ hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL);
+ } else {
+ hrtimer_cancel(&rtc->pie_timer);
+ }
+ rtc->pie_enabled = enabled;
+ spin_unlock_irqrestore(&rtc->irq_task_lock, flags);
return err;
}
@@ -509,21 +460,194 @@ int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq)
int err = 0;
unsigned long flags;
- if (rtc->ops->irq_set_freq == NULL)
- return -ENXIO;
-
spin_lock_irqsave(&rtc->irq_task_lock, flags);
if (rtc->irq_task != NULL && task == NULL)
err = -EBUSY;
if (rtc->irq_task != task)
err = -EACCES;
- spin_unlock_irqrestore(&rtc->irq_task_lock, flags);
-
if (err == 0) {
- err = rtc->ops->irq_set_freq(rtc->dev.parent, freq);
- if (err == 0)
- rtc->irq_freq = freq;
+ rtc->irq_freq = freq;
+ if (rtc->pie_enabled) {
+ ktime_t period;
+ hrtimer_cancel(&rtc->pie_timer);
+ period = ktime_set(0, NSEC_PER_SEC/rtc->irq_freq);
+ hrtimer_start(&rtc->pie_timer, period,
+ HRTIMER_MODE_REL);
+ }
}
+ spin_unlock_irqrestore(&rtc->irq_task_lock, flags);
return err;
}
EXPORT_SYMBOL_GPL(rtc_irq_set_freq);
+
+/**
+ * rtc_timer_enqueue - Adds a rtc_timer to the rtc_device timerqueue
+ * @rtc rtc device
+ * @timer timer being added.
+ *
+ * Enqueues a timer onto the rtc devices timerqueue and sets
+ * the next alarm event appropriately.
+ *
+ * Must hold ops_lock for proper serialization of timerqueue
+ */
+void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
+{
+ timerqueue_add(&rtc->timerqueue, &timer->node);
+ if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) {
+ struct rtc_wkalrm alarm;
+ int err;
+ alarm.time = rtc_ktime_to_tm(timer->node.expires);
+ alarm.enabled = 1;
+ err = __rtc_set_alarm(rtc, &alarm);
+ if (err == -ETIME)
+ schedule_work(&rtc->irqwork);
+ }
+}
+
+/**
+ * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue
+ * @rtc rtc device
+ * @timer timer being removed.
+ *
+ * Removes a timer onto the rtc devices timerqueue and sets
+ * the next alarm event appropriately.
+ *
+ * Must hold ops_lock for proper serialization of timerqueue
+ */
+void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer)
+{
+ struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue);
+ timerqueue_del(&rtc->timerqueue, &timer->node);
+
+ if (next == &timer->node) {
+ struct rtc_wkalrm alarm;
+ int err;
+ next = timerqueue_getnext(&rtc->timerqueue);
+ if (!next)
+ return;
+ alarm.time = rtc_ktime_to_tm(next->expires);
+ alarm.enabled = 1;
+ err = __rtc_set_alarm(rtc, &alarm);
+ if (err == -ETIME)
+ schedule_work(&rtc->irqwork);
+ }
+}
+
+/**
+ * rtc_timer_do_work - Expires rtc timers
+ * @rtc rtc device
+ * @timer timer being removed.
+ *
+ * Expires rtc timers. Reprograms next alarm event if needed.
+ * Called via worktask.
+ *
+ * Serializes access to timerqueue via ops_lock mutex
+ */
+void rtc_timer_do_work(struct work_struct *work)
+{
+ struct rtc_timer *timer;
+ struct timerqueue_node *next;
+ ktime_t now;
+ struct rtc_time tm;
+
+ struct rtc_device *rtc =
+ container_of(work, struct rtc_device, irqwork);
+
+ mutex_lock(&rtc->ops_lock);
+again:
+ __rtc_read_time(rtc, &tm);
+ now = rtc_tm_to_ktime(tm);
+ while ((next = timerqueue_getnext(&rtc->timerqueue))) {
+ if (next->expires.tv64 > now.tv64)
+ break;
+
+ /* expire timer */
+ timer = container_of(next, struct rtc_timer, node);
+ timerqueue_del(&rtc->timerqueue, &timer->node);
+ timer->enabled = 0;
+ if (timer->task.func)
+ timer->task.func(timer->task.private_data);
+
+ /* Re-add/fwd periodic timers */
+ if (ktime_to_ns(timer->period)) {
+ timer->node.expires = ktime_add(timer->node.expires,
+ timer->period);
+ timer->enabled = 1;
+ timerqueue_add(&rtc->timerqueue, &timer->node);
+ }
+ }
+
+ /* Set next alarm */
+ if (next) {
+ struct rtc_wkalrm alarm;
+ int err;
+ alarm.time = rtc_ktime_to_tm(next->expires);
+ alarm.enabled = 1;
+ err = __rtc_set_alarm(rtc, &alarm);
+ if (err == -ETIME)
+ goto again;
+ }
+
+ mutex_unlock(&rtc->ops_lock);
+}
+
+
+/* rtc_timer_init - Initializes an rtc_timer
+ * @timer: timer to be intiialized
+ * @f: function pointer to be called when timer fires
+ * @data: private data passed to function pointer
+ *
+ * Kernel interface to initializing an rtc_timer.
+ */
+void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data)
+{
+ timerqueue_init(&timer->node);
+ timer->enabled = 0;
+ timer->task.func = f;
+ timer->task.private_data = data;
+}
+
+/* rtc_timer_start - Sets an rtc_timer to fire in the future
+ * @ rtc: rtc device to be used
+ * @ timer: timer being set
+ * @ expires: time at which to expire the timer
+ * @ period: period that the timer will recur
+ *
+ * Kernel interface to set an rtc_timer
+ */
+int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer,
+ ktime_t expires, ktime_t period)
+{
+ int ret = 0;
+ mutex_lock(&rtc->ops_lock);
+ if (timer->enabled)
+ rtc_timer_remove(rtc, timer);
+
+ timer->node.expires = expires;
+ timer->period = period;
+
+ timer->enabled = 1;
+ rtc_timer_enqueue(rtc, timer);
+
+ mutex_unlock(&rtc->ops_lock);
+ return ret;
+}
+
+/* rtc_timer_cancel - Stops an rtc_timer
+ * @ rtc: rtc device to be used
+ * @ timer: timer being set
+ *
+ * Kernel interface to cancel an rtc_timer
+ */
+int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer)
+{
+ int ret = 0;
+ mutex_lock(&rtc->ops_lock);
+ if (timer->enabled)
+ rtc_timer_remove(rtc, timer);
+ timer->enabled = 0;
+ mutex_unlock(&rtc->ops_lock);
+ return ret;
+}
+
+
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 0cc0984..212b16e 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -46,105 +46,6 @@ static int rtc_dev_open(struct inode *inode, struct file *file)
return err;
}
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
-/*
- * Routine to poll RTC seconds field for change as often as possible,
- * after first RTC_UIE use timer to reduce polling
- */
-static void rtc_uie_task(struct work_struct *work)
-{
- struct rtc_device *rtc =
- container_of(work, struct rtc_device, uie_task);
- struct rtc_time tm;
- int num = 0;
- int err;
-
- err = rtc_read_time(rtc, &tm);
-
- spin_lock_irq(&rtc->irq_lock);
- if (rtc->stop_uie_polling || err) {
- rtc->uie_task_active = 0;
- } else if (rtc->oldsecs != tm.tm_sec) {
- num = (tm.tm_sec + 60 - rtc->oldsecs) % 60;
- rtc->oldsecs = tm.tm_sec;
- rtc->uie_timer.expires = jiffies + HZ - (HZ/10);
- rtc->uie_timer_active = 1;
- rtc->uie_task_active = 0;
- add_timer(&rtc->uie_timer);
- } else if (schedule_work(&rtc->uie_task) == 0) {
- rtc->uie_task_active = 0;
- }
- spin_unlock_irq(&rtc->irq_lock);
- if (num)
- rtc_update_irq(rtc, num, RTC_UF | RTC_IRQF);
-}
-static void rtc_uie_timer(unsigned long data)
-{
- struct rtc_device *rtc = (struct rtc_device *)data;
- unsigned long flags;
-
- spin_lock_irqsave(&rtc->irq_lock, flags);
- rtc->uie_timer_active = 0;
- rtc->uie_task_active = 1;
- if ((schedule_work(&rtc->uie_task) == 0))
- rtc->uie_task_active = 0;
- spin_unlock_irqrestore(&rtc->irq_lock, flags);
-}
-
-static int clear_uie(struct rtc_device *rtc)
-{
- spin_lock_irq(&rtc->irq_lock);
- if (rtc->uie_irq_active) {
- rtc->stop_uie_polling = 1;
- if (rtc->uie_timer_active) {
- spin_unlock_irq(&rtc->irq_lock);
- del_timer_sync(&rtc->uie_timer);
- spin_lock_irq(&rtc->irq_lock);
- rtc->uie_timer_active = 0;
- }
- if (rtc->uie_task_active) {
- spin_unlock_irq(&rtc->irq_lock);
- flush_work_sync(&rtc->uie_task);
- spin_lock_irq(&rtc->irq_lock);
- }
- rtc->uie_irq_active = 0;
- }
- spin_unlock_irq(&rtc->irq_lock);
- return 0;
-}
-
-static int set_uie(struct rtc_device *rtc)
-{
- struct rtc_time tm;
- int err;
-
- err = rtc_read_time(rtc, &tm);
- if (err)
- return err;
- spin_lock_irq(&rtc->irq_lock);
- if (!rtc->uie_irq_active) {
- rtc->uie_irq_active = 1;
- rtc->stop_uie_polling = 0;
- rtc->oldsecs = tm.tm_sec;
- rtc->uie_task_active = 1;
- if (schedule_work(&rtc->uie_task) == 0)
- rtc->uie_task_active = 0;
- }
- rtc->irq_data = 0;
- spin_unlock_irq(&rtc->irq_lock);
- return 0;
-}
-
-int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, unsigned int enabled)
-{
- if (enabled)
- return set_uie(rtc);
- else
- return clear_uie(rtc);
-}
-EXPORT_SYMBOL(rtc_dev_update_irq_enable_emul);
-
-#endif /* CONFIG_RTC_INTF_DEV_UIE_EMUL */
static ssize_t
rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
@@ -493,11 +394,6 @@ void rtc_dev_prepare(struct rtc_device *rtc)
rtc->dev.devt = MKDEV(MAJOR(rtc_devt), rtc->id);
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
- INIT_WORK(&rtc->uie_task, rtc_uie_task);
- setup_timer(&rtc->uie_timer, rtc_uie_timer, (unsigned long)rtc);
-#endif
-
cdev_init(&rtc->char_dev, &rtc_dev_fops);
rtc->char_dev.owner = rtc->owner;
}
diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c
index 773851f..075f170 100644
--- a/drivers/rtc/rtc-lib.c
+++ b/drivers/rtc/rtc-lib.c
@@ -117,4 +117,32 @@ int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time)
}
EXPORT_SYMBOL(rtc_tm_to_time);
+/*
+ * Convert rtc_time to ktime
+ */
+ktime_t rtc_tm_to_ktime(struct rtc_time tm)
+{
+ time_t time;
+ rtc_tm_to_time(&tm, &time);
+ return ktime_set(time, 0);
+}
+EXPORT_SYMBOL_GPL(rtc_tm_to_ktime);
+
+/*
+ * Convert ktime to rtc_time
+ */
+struct rtc_time rtc_ktime_to_tm(ktime_t kt)
+{
+ struct timespec ts;
+ struct rtc_time ret;
+
+ ts = ktime_to_timespec(kt);
+ /* Round up any ns */
+ if (ts.tv_nsec)
+ ts.tv_sec++;
+ rtc_time_to_tm(ts.tv_sec, &ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rtc_ktime_to_tm);
+
MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index dea7b5b..24b966d 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -469,7 +469,7 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason,
unsigned long rom_pl;
static int die_nmi_called;
- if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI)
+ if (ulReason != DIE_NMIUNKNOWN)
goto out;
if (!hpwdt_nmi_decoding)
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d84066..ab152c0 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -51,7 +51,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
config OCFS2_FS_STATS
bool "OCFS2 statistics"
- depends on OCFS2_FS
+ depends on OCFS2_FS && DEBUG_FS
default y
help
This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5..e4984e2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
return ret;
}
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
+ osb->truncated_clusters += num_clusters;
bail:
mlog_exit(status);
return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
i--;
}
+ osb->truncated_clusters = 0;
+
bail:
mlog_exit(status);
return status;
@@ -7139,64 +7141,6 @@ bail:
}
/*
- * Expects the inode to already be locked.
- */
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc)
-{
- int status;
- unsigned int new_i_clusters;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct buffer_head *last_eb_bh = NULL;
-
- mlog_entry_void();
-
- *tc = NULL;
-
- new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
- i_size_read(inode));
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
- "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
- (unsigned long long)le64_to_cpu(fe->i_size));
-
- *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
- if (!(*tc)) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
- ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
-
- if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_extent_block(INODE_CACHE(inode),
- le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- }
-
- (*tc)->tc_last_eb_bh = last_eb_bh;
-
- status = 0;
-bail:
- if (status < 0) {
- if (*tc)
- ocfs2_free_truncate_context(*tc);
- *tc = NULL;
- }
- mlog_exit_void();
- return status;
-}
-
-/*
* 'start' is inclusive, 'end' is not.
*/
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
out:
return ret;
}
-
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
-{
- /*
- * The caller is responsible for completing deallocation
- * before freeing the context.
- */
- if (tc->tc_dealloc.c_first_suballocator != NULL)
- mlog(ML_NOTICE,
- "Truncate completion has non-empty dealloc context\n");
-
- brelse(tc->tc_last_eb_bh);
-
- kfree(tc);
-}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b5..3bd08a0 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end);
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0d7c554..1fbb0e2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1630,6 +1630,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ mutex_lock(&osb->osb_tl_inode->i_mutex);
+ truncated_clusters = osb->truncated_clusters;
+ mutex_unlock(&osb->osb_tl_inode->i_mutex);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
int ocfs2_write_begin_nolock(struct file *filp,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1637,7 +1674,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
struct buffer_head *di_bh, struct page *mmap_page)
{
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
- unsigned int clusters_to_alloc, extents_to_split;
+ unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
struct ocfs2_write_ctxt *wc;
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1646,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
struct ocfs2_extent_tree et;
+ int try_free = 1, ret1;
+try_again:
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
mlog_errno(ret);
@@ -1681,6 +1720,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
mlog_errno(ret);
goto out;
} else if (ret == 1) {
+ clusters_need = wc->w_clen;
ret = ocfs2_refcount_cow(inode, filp, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
@@ -1695,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
mlog_errno(ret);
goto out;
}
+ clusters_need += clusters_to_alloc;
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
@@ -1817,6 +1858,22 @@ out:
ocfs2_free_alloc_context(data_ac);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
+
+ if (ret == -ENOSPC && try_free) {
+ /*
+ * Try to free some truncate log so that we can have enough
+ * clusters to allocate.
+ */
+ try_free = 0;
+
+ ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+ if (ret1 == 1)
+ goto try_again;
+
+ if (ret1 < 0)
+ mlog_errno(ret1);
+ }
+
return ret;
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9e3d45b..a6cc053 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -82,6 +82,7 @@ static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
#define O2HB_DB_TYPE_REGION_LIVENODES 4
#define O2HB_DB_TYPE_REGION_NUMBER 5
#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
+#define O2HB_DB_TYPE_REGION_PINNED 7
struct o2hb_debug_buf {
int db_type;
int db_size;
@@ -101,6 +102,7 @@ static struct o2hb_debug_buf *o2hb_db_failedregions;
#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
#define O2HB_DEBUG_REGION_NUMBER "num"
#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED "pinned"
static struct dentry *o2hb_debug_dir;
static struct dentry *o2hb_debug_livenodes;
@@ -132,6 +134,33 @@ char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF 3
+
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
+
/* Only sets a new threshold if there are no active regions.
*
* No locking or otherwise interesting code is required for reading
@@ -186,7 +215,9 @@ struct o2hb_region {
struct config_item hr_item;
struct list_head hr_all_item;
- unsigned hr_unclean_stop:1;
+ unsigned hr_unclean_stop:1,
+ hr_item_pinned:1,
+ hr_item_dropped:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -212,9 +243,11 @@ struct o2hb_region {
struct dentry *hr_debug_livenodes;
struct dentry *hr_debug_regnum;
struct dentry *hr_debug_elapsed_time;
+ struct dentry *hr_debug_pinned;
struct o2hb_debug_buf *hr_db_livenodes;
struct o2hb_debug_buf *hr_db_regnum;
struct o2hb_debug_buf *hr_db_elapsed_time;
+ struct o2hb_debug_buf *hr_db_pinned;
/* let the person setting up hb wait for it to return until it
* has reached a 'steady' state. This will be fixed when we have
@@ -701,6 +734,14 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
config_item_name(&reg->hr_item));
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+
+ /*
+ * If global heartbeat active, unpin all regions if the
+ * region count > CUT_OFF
+ */
+ if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+ o2hb_region_unpin(NULL);
}
static int o2hb_check_slot(struct o2hb_region *reg,
@@ -1041,6 +1082,9 @@ static int o2hb_thread(void *data)
set_user_nice(current, -20);
+ /* Pin node */
+ o2nm_depend_this_node();
+
while (!kthread_should_stop() && !reg->hr_unclean_stop) {
/* We track the time spent inside
* o2hb_do_disk_heartbeat so that we avoid more than
@@ -1090,6 +1134,9 @@ static int o2hb_thread(void *data)
mlog_errno(ret);
}
+ /* Unpin node */
+ o2nm_undepend_this_node();
+
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
return 0;
@@ -1142,6 +1189,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
reg->hr_last_timeout_start));
goto done;
+ case O2HB_DB_TYPE_REGION_PINNED:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ !!reg->hr_item_pinned);
+ goto done;
+
default:
goto done;
}
@@ -1315,6 +1368,8 @@ int o2hb_init(void)
memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+ o2hb_dependent_users = 0;
+
return o2hb_debug_init();
}
@@ -1384,6 +1439,7 @@ static void o2hb_region_release(struct config_item *item)
debugfs_remove(reg->hr_debug_livenodes);
debugfs_remove(reg->hr_debug_regnum);
debugfs_remove(reg->hr_debug_elapsed_time);
+ debugfs_remove(reg->hr_debug_pinned);
debugfs_remove(reg->hr_debug_dir);
spin_lock(&o2hb_live_lock);
@@ -1948,6 +2004,18 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
goto bail;
}
+ reg->hr_debug_pinned =
+ o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+ reg->hr_debug_dir,
+ &(reg->hr_db_pinned),
+ sizeof(*(reg->hr_db_pinned)),
+ O2HB_DB_TYPE_REGION_PINNED,
+ 0, 0, reg);
+ if (!reg->hr_debug_pinned) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
ret = 0;
bail:
return ret;
@@ -2002,15 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
{
struct task_struct *hb_task;
struct o2hb_region *reg = to_o2hb_region(item);
+ int quorum_region = 0;
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
if (o2hb_global_heartbeat_active()) {
clear_bit(reg->hr_region_num, o2hb_region_bitmap);
clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
}
hb_task = reg->hr_task;
reg->hr_task = NULL;
+ reg->hr_item_dropped = 1;
spin_unlock(&o2hb_live_lock);
if (hb_task)
@@ -2028,7 +2101,27 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (o2hb_global_heartbeat_active())
printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
config_item_name(&reg->hr_item));
+
config_item_put(item);
+
+ if (!o2hb_global_heartbeat_active() || !quorum_region)
+ return;
+
+ /*
+ * If global heartbeat active and there are dependent users,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ spin_lock(&o2hb_live_lock);
+
+ if (!o2hb_dependent_users)
+ goto unlock;
+
+ if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
struct o2hb_heartbeat_group_attribute {
@@ -2214,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
}
EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
{
- struct o2hb_region *p, *reg = NULL;
+ int ret = 0, found = 0;
+ struct o2hb_region *reg;
+ char *uuid;
assert_spin_locked(&o2hb_live_lock);
- list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
- if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
- reg = p;
- break;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ uuid = config_item_name(&reg->hr_item);
+
+ /* local heartbeat */
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
+
+ if (reg->hr_item_pinned || reg->hr_item_dropped)
+ goto skip_pin;
+
+ /* Ignore ENOENT only for local hb (userdlm domain) */
+ ret = o2nm_depend_item(&reg->hr_item);
+ if (!ret) {
+ mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+ reg->hr_item_pinned = 1;
+ } else {
+ if (ret == -ENOENT && found)
+ ret = 0;
+ else {
+ mlog(ML_ERROR, "Pin region %s fails with %d\n",
+ uuid, ret);
+ break;
+ }
}
+skip_pin:
+ if (found)
+ break;
}
- return reg;
+ return ret;
}
-static int o2hb_region_get(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
{
- int ret = 0;
struct o2hb_region *reg;
+ char *uuid;
+ int found = 0;
- spin_lock(&o2hb_live_lock);
+ assert_spin_locked(&o2hb_live_lock);
- reg = o2hb_find_region(region_uuid);
- if (!reg)
- ret = -ENOENT;
- spin_unlock(&o2hb_live_lock);
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ uuid = config_item_name(&reg->hr_item);
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
- if (ret)
- goto out;
+ if (reg->hr_item_pinned) {
+ mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+ o2nm_undepend_item(&reg->hr_item);
+ reg->hr_item_pinned = 0;
+ }
+ if (found)
+ break;
+ }
+}
- ret = o2nm_depend_this_node();
- if (ret)
- goto out;
+static int o2hb_region_inc_user(const char *region_uuid)
+{
+ int ret = 0;
- ret = o2nm_depend_item(&reg->hr_item);
- if (ret)
- o2nm_undepend_this_node();
+ spin_lock(&o2hb_live_lock);
-out:
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ ret = o2hb_region_pin(region_uuid);
+ goto unlock;
+ }
+
+ /*
+ * if global heartbeat active and this is the first dependent user,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ o2hb_dependent_users++;
+ if (o2hb_dependent_users > 1)
+ goto unlock;
+
+ if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ ret = o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
return ret;
}
-static void o2hb_region_put(const char *region_uuid)
+void o2hb_region_dec_user(const char *region_uuid)
{
- struct o2hb_region *reg;
-
spin_lock(&o2hb_live_lock);
- reg = o2hb_find_region(region_uuid);
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ o2hb_region_unpin(region_uuid);
+ goto unlock;
+ }
- spin_unlock(&o2hb_live_lock);
+ /*
+ * if global heartbeat active and there are no dependent users,
+ * unpin all quorum regions
+ */
+ o2hb_dependent_users--;
+ if (!o2hb_dependent_users)
+ o2hb_region_unpin(NULL);
- if (reg) {
- o2nm_undepend_item(&reg->hr_item);
- o2nm_undepend_this_node();
- }
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
int o2hb_register_callback(const char *region_uuid,
@@ -2291,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
}
if (region_uuid) {
- ret = o2hb_region_get(region_uuid);
- if (ret)
+ ret = o2hb_region_inc_user(region_uuid);
+ if (ret) {
+ mlog_errno(ret);
goto out;
+ }
}
down_write(&o2hb_callback_sem);
@@ -2311,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
up_write(&o2hb_callback_sem);
ret = 0;
out:
- mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
ret, __builtin_return_address(0), hc);
return ret;
}
@@ -2322,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
{
BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
- mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
__builtin_return_address(0), hc);
/* XXX Can this happen _with_ a region reference? */
@@ -2330,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
return;
if (region_uuid)
- o2hb_region_put(region_uuid);
+ o2hb_region_dec_user(region_uuid);
down_write(&o2hb_callback_sem);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e..3a583590 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
#define O2NET_DEBUG_DIR "o2net"
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
+#define STATS_DEBUG_NAME "stats"
+
+#define SHOW_SOCK_CONTAINERS 0
+#define SHOW_SOCK_STATS 1
static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int nst_seq_show(struct seq_file *seq, void *v)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+ ktime_t now;
+ s64 sock, send, status;
spin_lock(&o2net_debug_lock);
nst = next_nst(dummy_nst);
+ if (!nst)
+ goto out;
- if (nst != NULL) {
- /* get_task_comm isn't exported. oh well. */
- seq_printf(seq, "%p:\n"
- " pid: %lu\n"
- " tgid: %lu\n"
- " process name: %s\n"
- " node: %u\n"
- " sc: %p\n"
- " message id: %d\n"
- " message type: %u\n"
- " message key: 0x%08x\n"
- " sock acquiry: %lu.%ld\n"
- " send start: %lu.%ld\n"
- " wait start: %lu.%ld\n",
- nst, (unsigned long)nst->st_task->pid,
- (unsigned long)nst->st_task->tgid,
- nst->st_task->comm, nst->st_node,
- nst->st_sc, nst->st_id, nst->st_msg_type,
- nst->st_msg_key,
- nst->st_sock_time.tv_sec,
- (long)nst->st_sock_time.tv_usec,
- nst->st_send_time.tv_sec,
- (long)nst->st_send_time.tv_usec,
- nst->st_status_time.tv_sec,
- (long)nst->st_status_time.tv_usec);
- }
+ now = ktime_get();
+ sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
+ send = ktime_to_us(ktime_sub(now, nst->st_send_time));
+ status = ktime_to_us(ktime_sub(now, nst->st_status_time));
+
+ /* get_task_comm isn't exported. oh well. */
+ seq_printf(seq, "%p:\n"
+ " pid: %lu\n"
+ " tgid: %lu\n"
+ " process name: %s\n"
+ " node: %u\n"
+ " sc: %p\n"
+ " message id: %d\n"
+ " message type: %u\n"
+ " message key: 0x%08x\n"
+ " sock acquiry: %lld usecs ago\n"
+ " send start: %lld usecs ago\n"
+ " wait start: %lld usecs ago\n",
+ nst, (unsigned long)task_pid_nr(nst->st_task),
+ (unsigned long)nst->st_task->tgid,
+ nst->st_task->comm, nst->st_node,
+ nst->st_sc, nst->st_id, nst->st_msg_type,
+ nst->st_msg_key,
+ (long long)sock,
+ (long long)send,
+ (long long)status);
+out:
spin_unlock(&o2net_debug_lock);
return 0;
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
spin_unlock(&o2net_debug_lock);
}
+struct o2net_sock_debug {
+ int dbg_ctxt;
+ struct o2net_sock_container *dbg_sock;
+};
+
static struct o2net_sock_container
*next_sc(struct o2net_sock_container *sc_start)
{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return sc; /* unused, just needs to be null when done */
}
-#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s) ((_s)->sc_send_count)
+# define sc_recv_count(_s) ((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s) (0U)
+# define sc_recv_count(_s) (0U)
+# define sc_tv_acquiry_total_ns(_s) (0LL)
+# define sc_tv_send_total_ns(_s) (0LL)
+# define sc_tv_status_total_ns(_s) (0LL)
+# define sc_tv_process_total_ns(_s) (0LL)
+#endif
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION 1
+static void sc_show_sock_stats(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ if (!sc)
+ return;
+
+ seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+ sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
+ (long long)sc_tv_acquiry_total_ns(sc),
+ (long long)sc_tv_send_total_ns(sc),
+ (long long)sc_tv_status_total_ns(sc),
+ (unsigned long)sc_recv_count(sc),
+ (long long)sc_tv_process_total_ns(sc));
+}
+
+static void sc_show_sock_container(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ struct inet_sock *inet = NULL;
+ __be32 saddr = 0, daddr = 0;
+ __be16 sport = 0, dport = 0;
+
+ if (!sc)
+ return;
+
+ if (sc->sc_sock) {
+ inet = inet_sk(sc->sc_sock->sk);
+ /* the stack's structs aren't sparse endian clean */
+ saddr = (__force __be32)inet->inet_saddr;
+ daddr = (__force __be32)inet->inet_daddr;
+ sport = (__force __be16)inet->inet_sport;
+ dport = (__force __be16)inet->inet_dport;
+ }
+
+ /* XXX sigh, inet-> doesn't have sparse annotation so any
+ * use of it here generates a warning with -Wbitwise */
+ seq_printf(seq, "%p:\n"
+ " krefs: %d\n"
+ " sock: %pI4:%u -> "
+ "%pI4:%u\n"
+ " remote node: %s\n"
+ " page off: %zu\n"
+ " handshake ok: %u\n"
+ " timer: %lld usecs\n"
+ " data ready: %lld usecs\n"
+ " advance start: %lld usecs\n"
+ " advance stop: %lld usecs\n"
+ " func start: %lld usecs\n"
+ " func stop: %lld usecs\n"
+ " func key: 0x%08x\n"
+ " func type: %u\n",
+ sc,
+ atomic_read(&sc->sc_kref.refcount),
+ &saddr, inet ? ntohs(sport) : 0,
+ &daddr, inet ? ntohs(dport) : 0,
+ sc->sc_node->nd_name,
+ sc->sc_page_off,
+ sc->sc_handshake_ok,
+ (long long)ktime_to_us(sc->sc_tv_timer),
+ (long long)ktime_to_us(sc->sc_tv_data_ready),
+ (long long)ktime_to_us(sc->sc_tv_advance_start),
+ (long long)ktime_to_us(sc->sc_tv_advance_stop),
+ (long long)ktime_to_us(sc->sc_tv_func_start),
+ (long long)ktime_to_us(sc->sc_tv_func_stop),
+ sc->sc_msg_key,
+ sc->sc_msg_type);
+}
static int sc_seq_show(struct seq_file *seq, void *v)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
- if (sc != NULL) {
- struct inet_sock *inet = NULL;
-
- __be32 saddr = 0, daddr = 0;
- __be16 sport = 0, dport = 0;
-
- if (sc->sc_sock) {
- inet = inet_sk(sc->sc_sock->sk);
- /* the stack's structs aren't sparse endian clean */
- saddr = (__force __be32)inet->inet_saddr;
- daddr = (__force __be32)inet->inet_daddr;
- sport = (__force __be16)inet->inet_sport;
- dport = (__force __be16)inet->inet_dport;
- }
-
- /* XXX sigh, inet-> doesn't have sparse annotation so any
- * use of it here generates a warning with -Wbitwise */
- seq_printf(seq, "%p:\n"
- " krefs: %d\n"
- " sock: %pI4:%u -> "
- "%pI4:%u\n"
- " remote node: %s\n"
- " page off: %zu\n"
- " handshake ok: %u\n"
- " timer: %lu.%ld\n"
- " data ready: %lu.%ld\n"
- " advance start: %lu.%ld\n"
- " advance stop: %lu.%ld\n"
- " func start: %lu.%ld\n"
- " func stop: %lu.%ld\n"
- " func key: %u\n"
- " func type: %u\n",
- sc,
- atomic_read(&sc->sc_kref.refcount),
- &saddr, inet ? ntohs(sport) : 0,
- &daddr, inet ? ntohs(dport) : 0,
- sc->sc_node->nd_name,
- sc->sc_page_off,
- sc->sc_handshake_ok,
- TV_SEC_USEC(sc->sc_tv_timer),
- TV_SEC_USEC(sc->sc_tv_data_ready),
- TV_SEC_USEC(sc->sc_tv_advance_start),
- TV_SEC_USEC(sc->sc_tv_advance_stop),
- TV_SEC_USEC(sc->sc_tv_func_start),
- TV_SEC_USEC(sc->sc_tv_func_stop),
- sc->sc_msg_key,
- sc->sc_msg_type);
+ if (sc) {
+ if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+ sc_show_sock_container(seq, sc);
+ else
+ sc_show_sock_stats(seq, sc);
}
-
spin_unlock(&o2net_debug_lock);
return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
.show = sc_seq_show,
};
-static int sc_fop_open(struct inode *inode, struct file *file)
+static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
{
struct o2net_sock_container *dummy_sc;
struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
goto out;
seq = file->private_data;
- seq->private = dummy_sc;
+ seq->private = sd;
+ sd->dbg_sock = dummy_sc;
o2net_debug_add_sc(dummy_sc);
dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
static int sc_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- struct o2net_sock_container *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *dummy_sc = sd->dbg_sock;
o2net_debug_del_sc(dummy_sc);
return seq_release_private(inode, file);
}
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_STATS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
+static const struct file_operations stats_seq_fops = {
+ .open = stats_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sc_fop_release,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
static const struct file_operations sc_seq_fops = {
.open = sc_fop_open,
.read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
goto bail;
}
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
+ o2net_dentry, NULL,
+ &stats_seq_fops);
+ if (!stats_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
return 0;
bail:
- if (sc_dentry)
- debugfs_remove(sc_dentry);
- if (nst_dentry)
- debugfs_remove(nst_dentry);
- if (o2net_dentry)
- debugfs_remove(o2net_dentry);
+ debugfs_remove(stats_dentry);
+ debugfs_remove(sc_dentry);
+ debugfs_remove(nst_dentry);
+ debugfs_remove(o2net_dentry);
return -ENOMEM;
}
void o2net_debugfs_exit(void)
{
- if (sc_dentry)
- debugfs_remove(sc_dentry);
- if (nst_dentry)
- debugfs_remove(nst_dentry);
- if (o2net_dentry)
- debugfs_remove(o2net_dentry);
+ debugfs_remove(stats_dentry);
+ debugfs_remove(sc_dentry);
+ debugfs_remove(nst_dentry);
+ debugfs_remove(o2net_dentry);
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9aa426e..3b11cb1 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
nst->st_node = node;
}
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_sock_time);
+ nst->st_sock_time = ktime_get();
}
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_send_time);
+ nst->st_send_time = ktime_get();
}
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_status_time);
+ nst->st_status_time = ktime_get();
}
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
{
nst->st_sc = sc;
}
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
{
nst->st_id = msg_id;
}
-#else /* CONFIG_DEBUG_FS */
-
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
{
+ sc->sc_tv_timer = ktime_get();
}
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_data_ready = ktime_get();
}
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_advance_start = ktime_get();
}
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_advance_stop = ktime_get();
}
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_func_start = ktime_get();
}
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
- u32 msg_id)
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_func_stop = ktime_get();
}
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+ return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+#else /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
+# define o2net_get_func_run_time(a) (ktime_t)0
#endif /* CONFIG_DEBUG_FS */
+#ifdef CONFIG_OCFS2_FS_STATS
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+ sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
+ ktime_sub(ktime_get(),
+ nst->st_status_time));
+ sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
+ ktime_sub(nst->st_status_time,
+ nst->st_send_time));
+ sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
+ ktime_sub(nst->st_send_time,
+ nst->st_sock_time));
+ sc->sc_send_count++;
+}
+
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+ o2net_get_func_run_time(sc));
+ sc->sc_recv_count++;
+}
+
+#else
+
+# define o2net_update_send_stats(a, b)
+
+# define o2net_update_recv_stats(sc)
+
+#endif /* CONFIG_OCFS2_FS_STATS */
+
static inline int o2net_reconnect_delay(void)
{
return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
sc->sc_sock = NULL;
}
+ o2nm_undepend_item(&sc->sc_node->nd_item);
o2nm_node_put(sc->sc_node);
sc->sc_node = NULL;
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
{
struct o2net_sock_container *sc, *ret = NULL;
struct page *page = NULL;
+ int status = 0;
page = alloc_page(GFP_NOFS);
sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
o2nm_node_get(node);
sc->sc_node = node;
+ /* pin the node item of the remote node */
+ status = o2nm_depend_item(&node->nd_item);
+ if (status) {
+ mlog_errno(status);
+ o2nm_node_put(node);
+ goto out;
+ }
INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
if (sk->sk_user_data) {
struct o2net_sock_container *sc = sk->sk_user_data;
sclog(sc, "data_ready hit\n");
- do_gettimeofday(&sc->sc_tv_data_ready);
+ o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
ready = sc->sc_data_ready;
} else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
o2net_set_nst_status_time(&nst);
wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+ o2net_update_send_stats(&nst, sc);
+
/* Note that we avoid overwriting the callers status return
* variable if a system error was reported on the other
* side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
if (syserr != O2NET_ERR_NONE)
goto out_respond;
- do_gettimeofday(&sc->sc_tv_func_start);
+ o2net_set_func_start_time(sc);
sc->sc_msg_key = be32_to_cpu(hdr->key);
sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
be16_to_cpu(hdr->data_len),
nmh->nh_func_data, &ret_data);
- do_gettimeofday(&sc->sc_tv_func_stop);
+ o2net_set_func_stop_time(sc);
+
+ o2net_update_recv_stats(sc);
out_respond:
/* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
size_t datalen;
sclog(sc, "receiving\n");
- do_gettimeofday(&sc->sc_tv_advance_start);
+ o2net_set_advance_start_time(sc);
if (unlikely(sc->sc_handshake_ok == 0)) {
if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
out:
sclog(sc, "ret = %d\n", ret);
- do_gettimeofday(&sc->sc_tv_advance_stop);
+ o2net_set_advance_stop_time(sc);
return ret;
}
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
- struct timeval now;
- do_gettimeofday(&now);
+#ifdef CONFIG_DEBUG_FS
+ ktime_t now = ktime_get();
+#endif
printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
"seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
- mlog(ML_NOTICE, "here are some times that might help debug the "
- "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
- "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
- sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
- now.tv_sec, (long) now.tv_usec,
- sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
- sc->sc_tv_advance_start.tv_sec,
- (long) sc->sc_tv_advance_start.tv_usec,
- sc->sc_tv_advance_stop.tv_sec,
- (long) sc->sc_tv_advance_stop.tv_usec,
+
+#ifdef CONFIG_DEBUG_FS
+ mlog(ML_NOTICE, "Here are some times that might help debug the "
+ "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
+ "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
+ (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
+ (long long)ktime_to_us(sc->sc_tv_data_ready),
+ (long long)ktime_to_us(sc->sc_tv_advance_start),
+ (long long)ktime_to_us(sc->sc_tv_advance_stop),
sc->sc_msg_key, sc->sc_msg_type,
- sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
- sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+ (long long)ktime_to_us(sc->sc_tv_func_start),
+ (long long)ktime_to_us(sc->sc_tv_func_stop));
+#endif
/*
* Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
msecs_to_jiffies(o2net_keepalive_delay()));
- do_gettimeofday(&sc->sc_tv_timer);
+ o2net_set_sock_timer(sc);
mod_timer(&sc->sc_idle_timeout,
jiffies + msecs_to_jiffies(o2net_idle_timeout()));
}
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 15fdbdf..4cbcb65 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,18 +166,27 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
void (*sc_data_ready)(struct sock *sk, int bytes);
-#ifdef CONFIG_DEBUG_FS
- struct list_head sc_net_debug_item;
-#endif
- struct timeval sc_tv_timer;
- struct timeval sc_tv_data_ready;
- struct timeval sc_tv_advance_start;
- struct timeval sc_tv_advance_stop;
- struct timeval sc_tv_func_start;
- struct timeval sc_tv_func_stop;
+
u32 sc_msg_key;
u16 sc_msg_type;
+#ifdef CONFIG_DEBUG_FS
+ struct list_head sc_net_debug_item;
+ ktime_t sc_tv_timer;
+ ktime_t sc_tv_data_ready;
+ ktime_t sc_tv_advance_start;
+ ktime_t sc_tv_advance_stop;
+ ktime_t sc_tv_func_start;
+ ktime_t sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+ ktime_t sc_tv_acquiry_total;
+ ktime_t sc_tv_send_total;
+ ktime_t sc_tv_status_total;
+ u32 sc_send_count;
+ u32 sc_recv_count;
+ ktime_t sc_tv_process_total;
+#endif
struct mutex sc_send_lock;
};
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
u32 st_msg_type;
u32 st_msg_key;
u8 st_node;
- struct timeval st_sock_time;
- struct timeval st_send_time;
- struct timeval st_status_time;
+ ktime_t st_sock_time;
+ ktime_t st_send_time;
+ ktime_t st_status_time;
};
#else
struct o2net_send_tracking {
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f449991..3a3ed4b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+ res = lock->lockres;
+
assert_spin_locked(&dlm->ast_lock);
+
if (!list_empty(&lock->ast_list)) {
- mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+ "AST list not empty, pending %d, newlevel %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ast_pending, lock->ml.type);
BUG();
}
if (lock->ast_pending)
- mlog(0, "lock has an ast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
/* check to see if this ast obsoletes the bast */
if (dlm_should_cancel_bast(dlm, lock)) {
- struct dlm_lock_resource *res = lock->lockres;
- mlog(0, "%s: cancelling bast for %.*s\n",
- dlm->name, res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lock->bast_pending = 0;
list_del_init(&lock->bast_list);
lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+
assert_spin_locked(&dlm->ast_lock);
+ res = lock->lockres;
+
BUG_ON(!list_empty(&lock->bast_list));
if (lock->bast_pending)
- mlog(0, "lock has a bast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
dlm_astlockfunc_t *fn;
struct dlm_lockstatus *lksb;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lockstatus *lksb;
int lksbflags;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
{
dlm_bastlockfunc_t *fn = lock->bast;
- mlog_entry_void();
BUG_ON(lock->ml.node != dlm->node_num);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ blocked_type);
+
(*fn)(lock->astdata, blocked_type);
}
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
/* cannot get a proxy ast message if this node owns it */
BUG_ON(res->owner == dlm->node_num);
- mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
- mlog(0, "ast: Adding to granted list... type=%d, "
- "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ lock->ml.type, lock->ml.convert_type);
+
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
size_t veclen = 1;
int status;
- mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
- res->lockname.len, res->lockname.name, lock->ml.node,
- msg_type, blocked_type);
+ mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
+ blocked_type);
memset(&past, 0, sizeof(struct dlm_proxy_ast));
past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
vec[0].iov_base = &past;
if (flags & DLM_LKSB_GET_LVB) {
- mlog(0, "returning requested LVB data\n");
be32_add_cpu(&past.flags, LKM_GET_LVB);
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
- mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
- "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+ mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name, ret,
lock->ml.node);
else {
if (status == DLM_RECOVERING) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index b36d0bf..4bdf7ba 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
enum dlm_mle_type {
- DLM_MLE_BLOCK,
- DLM_MLE_MASTER,
- DLM_MLE_MIGRATION,
- DLM_MLE_NUM_TYPES
+ DLM_MLE_BLOCK = 0,
+ DLM_MLE_MASTER = 1,
+ DLM_MLE_MIGRATION = 2,
+ DLM_MLE_NUM_TYPES = 3,
};
struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
enum dlm_ast_type {
DLM_AST = 0,
- DLM_BAST,
- DLM_ASTUNLOCK
+ DLM_BAST = 1,
+ DLM_ASTUNLOCK = 2,
};
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
enum dlm_ctxt_state {
DLM_CTXT_NEW = 0,
- DLM_CTXT_JOINED,
- DLM_CTXT_IN_SHUTDOWN,
- DLM_CTXT_LEAVING,
+ DLM_CTXT_JOINED = 1,
+ DLM_CTXT_IN_SHUTDOWN = 2,
+ DLM_CTXT_LEAVING = 3,
};
struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
- DLM_CONVERTING_LIST,
- DLM_BLOCKED_LIST
+ DLM_CONVERTING_LIST = 1,
+ DLM_BLOCKED_LIST = 2,
};
static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,27 +427,27 @@ struct dlm_node_iter
enum {
- DLM_MASTER_REQUEST_MSG = 500,
- DLM_UNUSED_MSG1, /* 501 */
- DLM_ASSERT_MASTER_MSG, /* 502 */
- DLM_CREATE_LOCK_MSG, /* 503 */
- DLM_CONVERT_LOCK_MSG, /* 504 */
- DLM_PROXY_AST_MSG, /* 505 */
- DLM_UNLOCK_LOCK_MSG, /* 506 */
- DLM_DEREF_LOCKRES_MSG, /* 507 */
- DLM_MIGRATE_REQUEST_MSG, /* 508 */
- DLM_MIG_LOCKRES_MSG, /* 509 */
- DLM_QUERY_JOIN_MSG, /* 510 */
- DLM_ASSERT_JOINED_MSG, /* 511 */
- DLM_CANCEL_JOIN_MSG, /* 512 */
- DLM_EXIT_DOMAIN_MSG, /* 513 */
- DLM_MASTER_REQUERY_MSG, /* 514 */
- DLM_LOCK_REQUEST_MSG, /* 515 */
- DLM_RECO_DATA_DONE_MSG, /* 516 */
- DLM_BEGIN_RECO_MSG, /* 517 */
- DLM_FINALIZE_RECO_MSG, /* 518 */
- DLM_QUERY_REGION, /* 519 */
- DLM_QUERY_NODEINFO, /* 520 */
+ DLM_MASTER_REQUEST_MSG = 500,
+ DLM_UNUSED_MSG1 = 501,
+ DLM_ASSERT_MASTER_MSG = 502,
+ DLM_CREATE_LOCK_MSG = 503,
+ DLM_CONVERT_LOCK_MSG = 504,
+ DLM_PROXY_AST_MSG = 505,
+ DLM_UNLOCK_LOCK_MSG = 506,
+ DLM_DEREF_LOCKRES_MSG = 507,
+ DLM_MIGRATE_REQUEST_MSG = 508,
+ DLM_MIG_LOCKRES_MSG = 509,
+ DLM_QUERY_JOIN_MSG = 510,
+ DLM_ASSERT_JOINED_MSG = 511,
+ DLM_CANCEL_JOIN_MSG = 512,
+ DLM_EXIT_DOMAIN_MSG = 513,
+ DLM_MASTER_REQUERY_MSG = 514,
+ DLM_LOCK_REQUEST_MSG = 515,
+ DLM_RECO_DATA_DONE_MSG = 516,
+ DLM_BEGIN_RECO_MSG = 517,
+ DLM_FINALIZE_RECO_MSG = 518,
+ DLM_QUERY_REGION = 519,
+ DLM_QUERY_NODEINFO = 520,
};
struct dlm_reco_node_data
@@ -460,19 +460,19 @@ struct dlm_reco_node_data
enum {
DLM_RECO_NODE_DATA_DEAD = -1,
DLM_RECO_NODE_DATA_INIT = 0,
- DLM_RECO_NODE_DATA_REQUESTING,
- DLM_RECO_NODE_DATA_REQUESTED,
- DLM_RECO_NODE_DATA_RECEIVING,
- DLM_RECO_NODE_DATA_DONE,
- DLM_RECO_NODE_DATA_FINALIZE_SENT,
+ DLM_RECO_NODE_DATA_REQUESTING = 1,
+ DLM_RECO_NODE_DATA_REQUESTED = 2,
+ DLM_RECO_NODE_DATA_RECEIVING = 3,
+ DLM_RECO_NODE_DATA_DONE = 4,
+ DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
};
enum {
DLM_MASTER_RESP_NO = 0,
- DLM_MASTER_RESP_YES,
- DLM_MASTER_RESP_MAYBE,
- DLM_MASTER_RESP_ERROR
+ DLM_MASTER_RESP_YES = 1,
+ DLM_MASTER_RESP_MAYBE = 2,
+ DLM_MASTER_RESP_ERROR = 3,
};
@@ -649,9 +649,9 @@ struct dlm_proxy_ast
#define DLM_MOD_KEY (0x666c6172)
enum dlm_query_join_response_code {
JOIN_DISALLOW = 0,
- JOIN_OK,
- JOIN_OK_NO_MAP,
- JOIN_PROTOCOL_MISMATCH,
+ JOIN_OK = 1,
+ JOIN_OK_NO_MAP = 2,
+ JOIN_PROTOCOL_MISMATCH = 3,
};
struct dlm_query_join_packet {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 272ec86..04a32be 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
kref_get(&dc->debug_refcnt);
}
-static struct debug_buffer *debug_buffer_allocate(void)
+static int debug_release(struct inode *inode, struct file *file)
{
- struct debug_buffer *db = NULL;
-
- db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
- if (!db)
- goto bail;
-
- db->len = PAGE_SIZE;
- db->buf = kmalloc(db->len, GFP_KERNEL);
- if (!db->buf)
- goto bail;
-
- return db;
-bail:
- kfree(db);
- return NULL;
-}
-
-static ssize_t debug_buffer_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct debug_buffer *db = file->private_data;
-
- return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
-}
-
-static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
-{
- struct debug_buffer *db = file->private_data;
- loff_t new = -1;
-
- switch (whence) {
- case 0:
- new = off;
- break;
- case 1:
- new = file->f_pos + off;
- break;
- }
-
- if (new < 0 || new > db->len)
- return -EINVAL;
-
- return (file->f_pos = new);
+ free_page((unsigned long)file->private_data);
+ return 0;
}
-static int debug_buffer_release(struct inode *inode, struct file *file)
+static ssize_t debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- struct debug_buffer *db = file->private_data;
-
- if (db)
- kfree(db->buf);
- kfree(db);
-
- return 0;
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
}
/* end - util funcs */
/* begin - purge list funcs */
-static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_lock_resource *res;
int out = 0;
unsigned long total = 0;
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dumping Purgelist for Domain: %s\n", dlm->name);
spin_lock(&dlm->spinlock);
list_for_each_entry(res, &dlm->purge_list, purge) {
++total;
- if (db->len - out < 100)
+ if (len - out < 100)
continue;
spin_lock(&res->spinlock);
out += stringify_lockname(res->lockname.name,
res->lockname.len,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\t%ld\n",
(jiffies - res->last_used)/HZ);
spin_unlock(&res->spinlock);
}
spin_unlock(&dlm->spinlock);
- out += snprintf(db->buf + out, db->len - out,
- "Total on list: %ld\n", total);
+ out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
return out;
}
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_purgelist_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_purgelist_print(dlm, db);
+ i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
@@ -480,14 +434,14 @@ bail:
static const struct file_operations debug_purgelist_fops = {
.open = debug_purgelist_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - purge list funcs */
/* begin - debug mle funcs */
-static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_master_list_entry *mle;
struct hlist_head *bucket;
@@ -495,7 +449,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
int i, out = 0;
unsigned long total = 0, longest = 0, bucket_count = 0;
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
spin_lock(&dlm->master_lock);
@@ -506,16 +460,16 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
master_hash_node);
++total;
++bucket_count;
- if (db->len - out < 200)
+ if (len - out < 200)
continue;
- out += dump_mle(mle, db->buf + out, db->len - out);
+ out += dump_mle(mle, buf + out, len - out);
}
longest = max(longest, bucket_count);
bucket_count = 0;
}
spin_unlock(&dlm->master_lock);
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Total: %ld, Longest: %ld\n", total, longest);
return out;
}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_mle_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_mle_print(dlm, db);
+ i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
@@ -540,9 +494,9 @@ bail:
static const struct file_operations debug_mle_fops = {
.open = debug_mle_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
/* end - debug lockres funcs */
/* begin - debug state funcs */
-static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
{
int out = 0;
struct dlm_reco_node_data *node;
@@ -781,35 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
}
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Domain: %s Key: 0x%08x Protocol: %d.%d\n",
dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Thread Pid: %d Node: %d State: %s\n",
- dlm->dlm_thread_task->pid, dlm->node_num, state);
+ task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
/* Number of Joins: xxx Joining Node: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Number of Joins: %d Joining Node: %d\n",
dlm->num_joins, dlm->joining_node);
/* Domain Map: xx xx xx */
- out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
+ out += snprintf(buf + out, len - out, "Domain Map: ");
out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
- out += snprintf(db->buf + out, db->len - out, "Live Map: ");
+ out += snprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Lock Resources: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Lock Resources: %d (%d)\n",
atomic_read(&dlm->res_cur_count),
atomic_read(&dlm->res_tot_count));
@@ -821,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
cur_mles += atomic_read(&dlm->mle_cur_count[i]);
/* MLEs: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"MLEs: %d (%d)\n", cur_mles, tot_mles);
/* Blocking: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Blocking: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
/* Mastery: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Mastery: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
/* Migration: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Migration: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Lists: Dirty=%s Purge=%s PendingASTs=%s "
"PendingBASTs=%s\n",
(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -852,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
/* Purge Count: xxx Refs: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
atomic_read(&dlm->dlm_refs.refcount));
/* Dead Node: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dead Node: %d\n", dlm->reco.dead_node);
/* What about DLM_RECO_STATE_FINALIZE? */
@@ -867,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
state = "INACTIVE";
/* Recovery Pid: xxxx Master: xxx State: xxxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Recovery Pid: %d Master: %d State: %s\n",
- dlm->dlm_reco_thread_task->pid,
+ task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.new_master, state);
/* Recovery Map: xx xx */
- out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
+ out += snprintf(buf + out, len - out, "Recovery Map: ");
out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Recovery Node State: */
- out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+ out += snprintf(buf + out, len - out, "Recovery Node State:\n");
list_for_each_entry(node, &dlm->reco.node_data, list) {
switch (node->state) {
case DLM_RECO_NODE_DATA_INIT:
@@ -907,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
state = "BAD";
break;
}
- out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+ out += snprintf(buf + out, len - out, "\t%u - %s\n",
node->node_num, state);
}
@@ -919,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_state_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db = NULL;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_state_print(dlm, db);
+ i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
@@ -936,9 +890,9 @@ bail:
static const struct file_operations debug_state_fops = {
.open = debug_state_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - debug state funcs */
@@ -1002,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
if (dc) {
- if (dc->debug_purgelist_dentry)
- debugfs_remove(dc->debug_purgelist_dentry);
- if (dc->debug_mle_dentry)
- debugfs_remove(dc->debug_mle_dentry);
- if (dc->debug_lockres_dentry)
- debugfs_remove(dc->debug_lockres_dentry);
- if (dc->debug_state_dentry)
- debugfs_remove(dc->debug_state_dentry);
+ debugfs_remove(dc->debug_purgelist_dentry);
+ debugfs_remove(dc->debug_mle_dentry);
+ debugfs_remove(dc->debug_lockres_dentry);
+ debugfs_remove(dc->debug_state_dentry);
dlm_debug_put(dc);
}
}
@@ -1040,8 +990,7 @@ bail:
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
- if (dlm->dlm_debugfs_subroot)
- debugfs_remove(dlm->dlm_debugfs_subroot);
+ debugfs_remove(dlm->dlm_debugfs_subroot);
}
/* debugfs root */
@@ -1057,7 +1006,6 @@ int dlm_create_debugfs_root(void)
void dlm_destroy_debugfs_root(void)
{
- if (dlm_debugfs_root)
- debugfs_remove(dlm_debugfs_root);
+ debugfs_remove(dlm_debugfs_root);
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d2..1f27c48 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
struct dentry *debug_purgelist_dentry;
};
-struct debug_buffer {
- int len;
- char *buf;
-};
-
struct debug_lockres {
int dl_len;
char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index cc2aaa9..7e38a07 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -460,8 +460,6 @@ redo_bucket:
}
cond_resched_lock(&dlm->spinlock);
num += n;
- mlog(0, "%s: touched %d lockreses in bucket %d "
- "(tot=%d)\n", dlm->name, n, i, num);
}
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
@@ -1661,8 +1659,8 @@ bail:
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
- o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
- o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
@@ -1674,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
- status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
- status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369..7009292 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
+ if (!dlm_lock_compatible(tmplock->ml.convert_type,
+ lock->ml.type))
+ return 0;
}
return 1;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf..1d6d1d2 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
if (list_empty(&res->purge)) {
- mlog(0, "putting lockres %.*s:%p onto purge list\n",
- res->lockname.len, res->lockname.name, res);
+ mlog(0, "%s: Adding res %.*s to purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
res->last_used = jiffies;
dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
dlm->purge_count++;
}
} else if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
- res->lockname.len, res->lockname.name, res, res->owner);
+ mlog(0, "%s: Removing res %.*s from purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->purge);
dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
master = (res->owner == dlm->node_num);
-
- mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
- res->lockname.name, master);
+ mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, master);
if (!master) {
res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, ret);
if (!dlm_is_host_down(ret))
BUG();
}
- mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
- dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
}
if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s:%p from purgelist, "
- "master = %d\n", res->lockname.len, res->lockname.name,
- res, master);
+ mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
+ dlm->name, res->lockname.len, res->lockname.name, master);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
if (!__dlm_lockres_unused(res)) {
- mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
__dlm_print_one_lock_resource(res);
BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
unused = __dlm_lockres_unused(lockres);
if (!unused ||
(lockres->state & DLM_LOCK_RES_MIGRATING)) {
- mlog(0, "lockres %s:%.*s: is in use or "
- "being remastered, used %d, state %d\n",
- dlm->name, lockres->lockname.len,
- lockres->lockname.name, !unused, lockres->state);
+ mlog(0, "%s: res %.*s is in use or being remastered, "
+ "used %d, state %d\n", dlm->name,
+ lockres->lockname.len, lockres->lockname.name,
+ !unused, lockres->state);
list_move_tail(&dlm->purge_list, &lockres->purge);
spin_unlock(&lockres->spinlock);
continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct list_head *head;
int can_grant = 1;
- //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
- //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
- //mlog(0, "shuffle res %.*s\n", res->lockname.len,
- // res->lockname.name);
-
- /* because this function is called with the lockres
+ /*
+ * Because this function is called with the lockres
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
- * basts right before queueing them all throughout */
+ * basts right before queueing them all throughout
+ */
assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
converting:
if (list_empty(&res->converting))
goto blocked;
- mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
- res->lockname.name);
+ mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
+ res->lockname.len, res->lockname.name);
target = list_entry(res->converting.next, struct dlm_lock, list);
if (target->ml.convert_type == LKM_IVMODE) {
- mlog(ML_ERROR, "%.*s: converting a lock with no "
- "convert_type!\n", res->lockname.len, res->lockname.name);
+ mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
+ dlm->name, res->lockname.len, res->lockname.name);
BUG();
}
head = &res->granted;
@@ -365,9 +356,12 @@ converting:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for converting lock: %.*s, have: %d, "
- "granting: %d, node: %u\n", res->lockname.len,
- res->lockname.name, target->ml.type,
+ mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
+ "%d => %d, node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+ target->ml.type,
target->ml.convert_type, target->ml.node);
target->ml.type = target->ml.convert_type;
@@ -428,11 +422,14 @@ blocked:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
- "node: %u\n", res->lockname.len, res->lockname.name,
+ mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
target->ml.type, target->ml.node);
- // target->ml.type is already correct
+ /* target->ml.type is already correct */
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
/* must have NO locks when calling this with res !=NULL * */
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
if (res) {
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
res->state |= DLM_LOCK_RES_DIRTY;
}
}
+
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
/* Launch the NM thread for the mounted volume */
int dlm_launch_thread(struct dlm_ctxt *dlm)
{
- mlog(0, "starting dlm thread...\n");
+ mlog(0, "Starting dlm_thread...\n");
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
void dlm_complete_thread(struct dlm_ctxt *dlm)
{
if (dlm->dlm_thread_task) {
- mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+ mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
kthread_stop(dlm->dlm_thread_task);
dlm->dlm_thread_task = NULL;
}
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
- mlog(0, "delivering an ast for this lockres\n");
+ mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.node);
BUG_ON(!lock->ast_pending);
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another ast was queued while
* we were delivering the last one */
if (!list_empty(&lock->ast_list)) {
- mlog(0, "aha another ast got queued while "
- "we were finishing the last one. will "
- "keep the ast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, AST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->ast_pending = 0;
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
- mlog(0, "delivering a bast for this lockres "
- "(blocked = %d\n", hi);
+ mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
+ "blocked %d, node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ hi, lock->ml.node);
if (lock->ml.node != dlm->node_num) {
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another bast was queued while
* we were delivering the last one */
if (!list_empty(&lock->bast_list)) {
- mlog(0, "aha another bast got queued while "
- "we were finishing the last one. will "
- "keep the bast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, BAST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->bast_pending = 0;
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
- mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
- res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
- res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
- res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
- res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+ mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
+ " dirty %d\n", dlm->name,
+ !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
+ !!(res->state & DLM_LOCK_RES_MIGRATING),
+ !!(res->state & DLM_LOCK_RES_RECOVERING),
+ !!(res->state & DLM_LOCK_RES_DIRTY));
}
BUG_ON(res->owner != dlm->node_num);
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
spin_unlock(&dlm->ast_lock);
- mlog(0, "delaying list shuffling for in-"
- "progress lockres %.*s, state=%d\n",
+ mlog(0, "%s: res %.*s, inprogress, delay list "
+ "shuffle, state %d\n", dlm->name,
res->lockname.len, res->lockname.name,
res->state);
delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
- mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
- "res=%.*s\n", dlm->name,
- res->lockname.len, res->lockname.name);
-
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
/* unlikely, but we may need to give time to
* other tasks */
if (!--n) {
- mlog(0, "throttling dlm_thread\n");
+ mlog(0, "%s: Throttling dlm thread\n",
+ dlm->name);
break;
}
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d14cad6..30c5231 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1017,8 +1017,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
* An error return must mean that no cluster locks
* were held on function exit.
*/
- if (oi1->ip_blkno != oi2->ip_blkno)
+ if (oi1->ip_blkno != oi2->ip_blkno) {
ocfs2_inode_unlock(inode2, 1);
+ brelse(*bh2);
+ *bh2 = NULL;
+ }
if (status != -ENOENT)
mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 70dd3b1..51cd689 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -420,6 +420,11 @@ struct ocfs2_super
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct delayed_work osb_truncate_log_wq;
+ /*
+ * How many clusters in our truncate log.
+ * It must be protected by osb_tl_inode->i_mutex.
+ */
+ unsigned int truncated_clusters;
struct ocfs2_node_map osb_recovering_orphan_dirs;
unsigned int *osb_orphan_wipes;
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c3..0000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
- wait_queue_head_t waiters;
-} sv_t;
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue_exclusive(&sv->waiters, &wait);
- __set_current_state(TASK_UNINTERRUPTIBLE);
- spin_unlock(lock);
-
- schedule();
-
- remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define sv_init(sv,flag,name) \
- init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
- /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
- _sv_wait(sv, lock)
-#define sv_signal(sv) \
- wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
- wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f612..ec7bbb5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
- IO_READ, /* mapping for a read */
- IO_DELAY, /* mapping covers delalloc region */
- IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
- IO_NEW /* just allocated */
-};
/*
* Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
xfs_inode_t *ip = XFS_I(ioend->io_inode);
xfs_fsize_t isize;
- ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
- ASSERT(ioend->io_type != IO_READ);
-
if (unlikely(ioend->io_error))
return 0;
@@ -244,10 +232,8 @@ xfs_end_io(
* We might have to update the on-disk file size after extending
* writes.
*/
- if (ioend->io_type != IO_READ) {
- error = xfs_setfilesize(ioend);
- ASSERT(!error || error == EAGAIN);
- }
+ error = xfs_setfilesize(ioend);
+ ASSERT(!error || error == EAGAIN);
/*
* If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
xfs_map_blocks(
struct inode *inode,
loff_t offset,
- ssize_t count,
struct xfs_bmbt_irec *imap,
- int flags)
+ int type,
+ int nonblocking)
{
- int nmaps = 1;
- int new = 0;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t count = 1 << inode->i_blkbits;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int bmapi_flags = XFS_BMAPI_ENTIRE;
+ int nimaps = 1;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ if (type == IO_UNWRITTEN)
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ if (nonblocking)
+ return -XFS_ERROR(EAGAIN);
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
- return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ (ip->i_df.if_flags & XFS_IFEXTENTS));
+ ASSERT(offset <= mp->m_maxioffset);
+
+ if (offset + count > mp->m_maxioffset)
+ count = mp->m_maxioffset - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+ bmapi_flags, NULL, 0, imap, &nimaps, NULL);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (error)
+ return -XFS_ERROR(error);
+
+ if (type == IO_DELALLOC &&
+ (!nimaps || isnullstartblock(imap->br_startblock))) {
+ error = xfs_iomap_write_allocate(ip, offset, count, imap);
+ if (!error)
+ trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+ return -XFS_ERROR(error);
+ }
+
+#ifdef DEBUG
+ if (type == IO_UNWRITTEN) {
+ ASSERT(nimaps);
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+ }
+#endif
+ if (nimaps)
+ trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+ return 0;
}
STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
WRITE_SYNC_PLUG : WRITE, bio);
- ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
- bio_put(bio);
}
STATIC struct bio *
xfs_alloc_ioend_bio(
struct buffer_head *bh)
{
- struct bio *bio;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
-
- do {
- bio = bio_alloc(GFP_NOIO, nvecs);
- nvecs >>= 1;
- } while (!bio);
+ struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
ASSERT(bio->bi_private == NULL);
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio_get(bio);
return bio;
}
@@ -470,9 +497,8 @@ xfs_submit_ioend(
/* Pass 1 - start writeback */
do {
next = ioend->io_list;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
xfs_start_buffer_writeback(bh);
- }
} while ((ioend = next) != NULL);
/* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
ASSERT(imap->br_startblock != HOLESTARTBLOCK);
ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
- lock_buffer(bh);
xfs_map_buffer(inode, bh, imap, offset);
- bh->b_bdev = xfs_find_bdev_for_inode(inode);
set_buffer_mapped(bh);
clear_buffer_delay(bh);
clear_buffer_unwritten(bh);
}
/*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
- struct page *page,
- unsigned int pg_offset)
-{
- struct buffer_head *bh, *head;
- int ret = 0;
-
- if (PageWriteback(page))
- return 0;
- if (!PageDirty(page))
- return 0;
- if (!page->mapping)
- return 0;
- if (!page_has_buffers(page))
- return 0;
-
- bh = head = page_buffers(page);
- do {
- if (!buffer_uptodate(bh))
- break;
- if (!buffer_mapped(bh))
- break;
- ret += bh->b_size;
- if (ret >= pg_offset)
- break;
- } while ((bh = bh->b_this_page) != head);
-
- return ret;
-}
-
-STATIC size_t
-xfs_probe_cluster(
- struct inode *inode,
- struct page *startpage,
- struct buffer_head *bh,
- struct buffer_head *head)
-{
- struct pagevec pvec;
- pgoff_t tindex, tlast, tloff;
- size_t total = 0;
- int done = 0, i;
-
- /* First sum forwards in this page */
- do {
- if (!buffer_uptodate(bh) || !buffer_mapped(bh))
- return total;
- total += bh->b_size;
- } while ((bh = bh->b_this_page) != head);
-
- /* if we reached the end of the page, sum forwards in following pages */
- tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
- tindex = startpage->index + 1;
-
- /* Prune this back to avoid pathological behavior */
- tloff = min(tlast, startpage->index + 64);
-
- pagevec_init(&pvec, 0);
- while (!done && tindex <= tloff) {
- unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
- if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- size_t pg_offset, pg_len = 0;
-
- if (tindex == tlast) {
- pg_offset =
- i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
- if (!pg_offset) {
- done = 1;
- break;
- }
- } else
- pg_offset = PAGE_CACHE_SIZE;
-
- if (page->index == tindex && trylock_page(page)) {
- pg_len = xfs_probe_page(page, pg_offset);
- unlock_page(page);
- }
-
- if (!pg_len) {
- done = 1;
- break;
- }
-
- total += pg_len;
- tindex++;
- }
-
- pagevec_release(&pvec);
- cond_resched();
- }
-
- return total;
-}
-
-/*
* Test if a given page is suitable for writing as part of an unwritten
* or delayed allocate extent.
*/
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
if (buffer_unwritten(bh))
acceptable = (type == IO_UNWRITTEN);
else if (buffer_delay(bh))
- acceptable = (type == IO_DELAY);
+ acceptable = (type == IO_DELALLOC);
else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable = (type == IO_NEW);
+ acceptable = (type == IO_OVERWRITE);
else
break;
} while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
loff_t tindex,
struct xfs_bmbt_irec *imap,
xfs_ioend_t **ioendp,
- struct writeback_control *wbc,
- int all_bh)
+ struct writeback_control *wbc)
{
struct buffer_head *bh, *head;
xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
continue;
}
- if (buffer_unwritten(bh) || buffer_delay(bh)) {
+ if (buffer_unwritten(bh) || buffer_delay(bh) ||
+ buffer_mapped(bh)) {
if (buffer_unwritten(bh))
type = IO_UNWRITTEN;
+ else if (buffer_delay(bh))
+ type = IO_DELALLOC;
else
- type = IO_DELAY;
+ type = IO_OVERWRITE;
if (!xfs_imap_valid(inode, imap, offset)) {
done = 1;
continue;
}
- ASSERT(imap->br_startblock != HOLESTARTBLOCK);
- ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
- xfs_map_at_offset(inode, bh, imap, offset);
+ lock_buffer(bh);
+ if (type != IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, imap, offset);
xfs_add_to_ioend(inode, bh, offset, type,
ioendp, done);
page_dirty--;
count++;
} else {
- type = IO_NEW;
- if (buffer_mapped(bh) && all_bh) {
- lock_buffer(bh);
- xfs_add_to_ioend(inode, bh, offset,
- type, ioendp, done);
- count++;
- page_dirty--;
- } else {
- done = 1;
- }
+ done = 1;
}
} while (offset += len, (bh = bh->b_this_page) != head);
@@ -876,7 +790,6 @@ xfs_cluster_write(
struct xfs_bmbt_irec *imap,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
- int all_bh,
pgoff_t tlast)
{
struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
for (i = 0; i < pagevec_count(&pvec); i++) {
done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- imap, ioendp, wbc, all_bh);
+ imap, ioendp, wbc);
if (done)
break;
}
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_is_delayed_page(page, IO_DELAY))
+ if (!xfs_is_delayed_page(page, IO_DELALLOC))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
unsigned int type;
__uint64_t end_offset;
pgoff_t end_index, last_index;
- ssize_t size, len;
- int flags, err, imap_valid = 0, uptodate = 1;
+ ssize_t len;
+ int err, imap_valid = 0, uptodate = 1;
int count = 0;
- int all_bh = 0;
+ int nonblocking = 0;
trace_xfs_writepage(inode, page, 0);
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
bh = head = page_buffers(page);
offset = page_offset(page);
- flags = BMAPI_READ;
- type = IO_NEW;
+ type = IO_OVERWRITE;
+
+ if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+ nonblocking = 1;
do {
+ int new_ioend = 0;
+
if (offset >= end_offset)
break;
if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
continue;
}
- if (imap_valid)
- imap_valid = xfs_imap_valid(inode, &imap, offset);
-
- if (buffer_unwritten(bh) || buffer_delay(bh)) {
- int new_ioend = 0;
-
- /*
- * Make sure we don't use a read-only iomap
- */
- if (flags == BMAPI_READ)
- imap_valid = 0;
-
- if (buffer_unwritten(bh)) {
+ if (buffer_unwritten(bh)) {
+ if (type != IO_UNWRITTEN) {
type = IO_UNWRITTEN;
- flags = BMAPI_WRITE | BMAPI_IGNSTATE;
- } else if (buffer_delay(bh)) {
- type = IO_DELAY;
- flags = BMAPI_ALLOCATE;
-
- if (wbc->sync_mode == WB_SYNC_NONE)
- flags |= BMAPI_TRYLOCK;
- }
-
- if (!imap_valid) {
- /*
- * If we didn't have a valid mapping then we
- * need to ensure that we put the new mapping
- * in a new ioend structure. This needs to be
- * done to ensure that the ioends correctly
- * reflect the block mappings at io completion
- * for unwritten extent conversion.
- */
- new_ioend = 1;
- err = xfs_map_blocks(inode, offset, len,
- &imap, flags);
- if (err)
- goto error;
- imap_valid = xfs_imap_valid(inode, &imap,
- offset);
+ imap_valid = 0;
}
- if (imap_valid) {
- xfs_map_at_offset(inode, bh, &imap, offset);
- xfs_add_to_ioend(inode, bh, offset, type,
- &ioend, new_ioend);
- count++;
+ } else if (buffer_delay(bh)) {
+ if (type != IO_DELALLOC) {
+ type = IO_DELALLOC;
+ imap_valid = 0;
}
} else if (buffer_uptodate(bh)) {
- /*
- * we got here because the buffer is already mapped.
- * That means it must already have extents allocated
- * underneath it. Map the extent by reading it.
- */
- if (!imap_valid || flags != BMAPI_READ) {
- flags = BMAPI_READ;
- size = xfs_probe_cluster(inode, page, bh, head);
- err = xfs_map_blocks(inode, offset, size,
- &imap, flags);
- if (err)
- goto error;
- imap_valid = xfs_imap_valid(inode, &imap,
- offset);
+ if (type != IO_OVERWRITE) {
+ type = IO_OVERWRITE;
+ imap_valid = 0;
}
+ } else {
+ if (PageUptodate(page)) {
+ ASSERT(buffer_mapped(bh));
+ imap_valid = 0;
+ }
+ continue;
+ }
+ if (imap_valid)
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
+ if (!imap_valid) {
/*
- * We set the type to IO_NEW in case we are doing a
- * small write at EOF that is extending the file but
- * without needing an allocation. We need to update the
- * file size on I/O completion in this case so it is
- * the same case as having just allocated a new extent
- * that we are writing into for the first time.
+ * If we didn't have a valid mapping then we need to
+ * put the new mapping into a separate ioend structure.
+ * This ensures non-contiguous extents always have
+ * separate ioends, which is particularly important
+ * for unwritten extent conversion at I/O completion
+ * time.
*/
- type = IO_NEW;
- if (trylock_buffer(bh)) {
- if (imap_valid)
- all_bh = 1;
- xfs_add_to_ioend(inode, bh, offset, type,
- &ioend, !imap_valid);
- count++;
- } else {
- imap_valid = 0;
- }
- } else if (PageUptodate(page)) {
- ASSERT(buffer_mapped(bh));
- imap_valid = 0;
+ new_ioend = 1;
+ err = xfs_map_blocks(inode, offset, &imap, type,
+ nonblocking);
+ if (err)
+ goto error;
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
+ }
+ if (imap_valid) {
+ lock_buffer(bh);
+ if (type != IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, &imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+ new_ioend);
+ count++;
}
if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
end_index = last_index;
xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
- wbc, all_bh, end_index);
+ wbc, end_index);
}
if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
int create,
int direct)
{
- int flags = create ? BMAPI_WRITE : BMAPI_READ;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int lockmode = 0;
struct xfs_bmbt_irec imap;
+ int nimaps = 1;
xfs_off_t offset;
ssize_t size;
- int nimap = 1;
int new = 0;
- int error;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
offset = (xfs_off_t)iblock << inode->i_blkbits;
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
if (!create && direct && offset >= i_size_read(inode))
return 0;
- if (direct && create)
- flags |= BMAPI_DIRECT;
+ if (create) {
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
+ } else {
+ lockmode = xfs_ilock_map_shared(ip);
+ }
+
+ ASSERT(offset <= mp->m_maxioffset);
+ if (offset + size > mp->m_maxioffset)
+ size = mp->m_maxioffset - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
- error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
- &new);
+ error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+ XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
if (error)
- return -error;
- if (nimap == 0)
- return 0;
+ goto out_unlock;
+
+ if (create &&
+ (!nimaps ||
+ (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_startblock == DELAYSTARTBLOCK))) {
+ if (direct) {
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ } else {
+ error = xfs_iomap_write_delay(ip, offset, size, &imap);
+ }
+ if (error)
+ goto out_unlock;
+
+ trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+ } else if (nimaps) {
+ trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ } else {
+ trace_xfs_get_blocks_notfound(ip, offset, size);
+ goto out_unlock;
+ }
+ xfs_iunlock(ip, lockmode);
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
}
return 0;
+
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return -error;
}
int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
ssize_t ret;
if (rw & WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+ iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb..71f721e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+ IO_DIRECT = 0, /* special case for direct I/O ioends */
+ IO_DELALLOC, /* mapping covers delalloc region */
+ IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
+ IO_OVERWRITE, /* mapping covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+ { 0, "" }, \
+ { IO_DELALLOC, "delalloc" }, \
+ { IO_UNWRITTEN, "unwritten" }, \
+ { IO_OVERWRITE, "overwrite" }
+
+/*
* xfs_ioend struct manages large extent writes for XFS.
* It can manage several multi-page bio's at once.
*/
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6..92f1f2a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-static struct shrinker xfs_buf_shake = {
- .shrink = xfsbufd_wakeup,
- .seeks = DEFAULT_SEEKS,
-};
static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
}
/*
- * Internal xfs_buf_t object manipulation
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
*/
+STATIC void
+xfs_buf_lru_add(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
+
+ spin_lock(&btp->bt_lru_lock);
+ if (list_empty(&bp->b_lru)) {
+ atomic_inc(&bp->b_hold);
+ list_add_tail(&bp->b_lru, &btp->bt_lru);
+ btp->bt_lru_nr++;
+ }
+ spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are not
+ * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(). i.e. it removes an unneccessary round trip on the
+ * bt_lru_lock.
+ */
+STATIC void
+xfs_buf_lru_del(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
+
+ if (list_empty(&bp->b_lru))
+ return;
+
+ spin_lock(&btp->bt_lru_lock);
+ if (!list_empty(&bp->b_lru)) {
+ list_del_init(&bp->b_lru);
+ btp->bt_lru_nr--;
+ }
+ spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+ struct xfs_buf *bp)
+{
+ bp->b_flags |= XBF_STALE;
+ atomic_set(&(bp)->b_lru_ref, 0);
+ if (!list_empty(&bp->b_lru)) {
+ struct xfs_buftarg *btp = bp->b_target;
+
+ spin_lock(&btp->bt_lru_lock);
+ if (!list_empty(&bp->b_lru)) {
+ list_del_init(&bp->b_lru);
+ btp->bt_lru_nr--;
+ atomic_dec(&bp->b_hold);
+ }
+ spin_unlock(&btp->bt_lru_lock);
+ }
+ ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
STATIC void
_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
memset(bp, 0, sizeof(xfs_buf_t));
atomic_set(&bp->b_hold, 1);
+ atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
+ INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
{
trace_xfs_buf_free(bp, _RET_IP_);
+ ASSERT(list_empty(&bp->b_lru));
+
if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
uint i;
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
__func__, gfp_mask);
XFS_STATS_INC(xb_page_retries);
- xfsbufd_wakeup(NULL, 0, gfp_mask);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
@@ -828,6 +897,7 @@ xfs_buf_rele(
if (!pag) {
ASSERT(!bp->b_relse);
+ ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold))
xfs_buf_free(bp);
@@ -835,13 +905,19 @@ xfs_buf_rele(
}
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
if (bp->b_relse) {
atomic_inc(&bp->b_hold);
spin_unlock(&pag->pag_buf_lock);
bp->b_relse(bp);
+ } else if (!(bp->b_flags & XBF_STALE) &&
+ atomic_read(&bp->b_lru_ref)) {
+ xfs_buf_lru_add(bp);
+ spin_unlock(&pag->pag_buf_lock);
} else {
+ xfs_buf_lru_del(bp);
ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
*/
/*
- * Wait for any bufs with callbacks that have been submitted but
- * have not yet returned... walk the hash list for the target.
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
*/
void
xfs_wait_buftarg(
struct xfs_buftarg *btp)
{
- struct xfs_perag *pag;
- uint i;
+ struct xfs_buf *bp;
- for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
- pag = xfs_perag_get(btp->bt_mount, i);
- spin_lock(&pag->pag_buf_lock);
- while (rb_first(&pag->pag_buf_tree)) {
- spin_unlock(&pag->pag_buf_lock);
+restart:
+ spin_lock(&btp->bt_lru_lock);
+ while (!list_empty(&btp->bt_lru)) {
+ bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+ if (atomic_read(&bp->b_hold) > 1) {
+ spin_unlock(&btp->bt_lru_lock);
delay(100);
- spin_lock(&pag->pag_buf_lock);
+ goto restart;
}
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
+ /*
+ * clear the LRU reference count so the bufer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ spin_unlock(&btp->bt_lru_lock);
+ xfs_buf_rele(bp);
+ spin_lock(&btp->bt_lru_lock);
}
+ spin_unlock(&btp->bt_lru_lock);
}
-/*
- * buftarg list for delwrite queue processing
- */
-static LIST_HEAD(xfs_buftarg_list);
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
-
-STATIC void
-xfs_register_buftarg(
- xfs_buftarg_t *btp)
+int
+xfs_buftarg_shrink(
+ struct shrinker *shrink,
+ int nr_to_scan,
+ gfp_t mask)
{
- spin_lock(&xfs_buftarg_lock);
- list_add(&btp->bt_list, &xfs_buftarg_list);
- spin_unlock(&xfs_buftarg_lock);
-}
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ struct xfs_buf *bp;
+ LIST_HEAD(dispose);
-STATIC void
-xfs_unregister_buftarg(
- xfs_buftarg_t *btp)
-{
- spin_lock(&xfs_buftarg_lock);
- list_del(&btp->bt_list);
- spin_unlock(&xfs_buftarg_lock);
+ if (!nr_to_scan)
+ return btp->bt_lru_nr;
+
+ spin_lock(&btp->bt_lru_lock);
+ while (!list_empty(&btp->bt_lru)) {
+ if (nr_to_scan-- <= 0)
+ break;
+
+ bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ list_move_tail(&bp->b_lru, &btp->bt_lru);
+ continue;
+ }
+
+ /*
+ * remove the buffer from the LRU now to avoid needing another
+ * lock round trip inside xfs_buf_rele().
+ */
+ list_move(&bp->b_lru, &dispose);
+ btp->bt_lru_nr--;
+ }
+ spin_unlock(&btp->bt_lru_lock);
+
+ while (!list_empty(&dispose)) {
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ xfs_buf_rele(bp);
+ }
+
+ return btp->bt_lru_nr;
}
void
@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
struct xfs_mount *mp,
struct xfs_buftarg *btp)
{
+ unregister_shrinker(&btp->bt_shrinker);
+
xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
iput(btp->bt_mapping->host);
- /* Unregister the buftarg first so that we don't get a
- * wakeup finding a non-existent task
- */
- xfs_unregister_buftarg(btp);
kthread_stop(btp->bt_task);
-
kmem_free(btp);
}
@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
xfs_buftarg_t *btp,
const char *fsname)
{
- int error = 0;
-
- INIT_LIST_HEAD(&btp->bt_list);
INIT_LIST_HEAD(&btp->bt_delwrite_queue);
spin_lock_init(&btp->bt_delwrite_lock);
btp->bt_flags = 0;
btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
- if (IS_ERR(btp->bt_task)) {
- error = PTR_ERR(btp->bt_task);
- goto out_error;
- }
- xfs_register_buftarg(btp);
-out_error:
- return error;
+ if (IS_ERR(btp->bt_task))
+ return PTR_ERR(btp->bt_task);
+ return 0;
}
xfs_buftarg_t *
@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
+ INIT_LIST_HEAD(&btp->bt_lru);
+ spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
if (xfs_mapping_buftarg(btp, bdev))
goto error;
if (xfs_alloc_delwrite_queue(btp, fsname))
goto error;
+ btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+ btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&btp->bt_shrinker);
return btp;
error:
@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
flush_workqueue(queue);
}
-STATIC int
-xfsbufd_wakeup(
- struct shrinker *shrink,
- int priority,
- gfp_t mask)
-{
- xfs_buftarg_t *btp;
-
- spin_lock(&xfs_buftarg_lock);
- list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
- if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
- continue;
- if (list_empty(&btp->bt_delwrite_queue))
- continue;
- set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
- wake_up_process(btp->bt_task);
- }
- spin_unlock(&xfs_buftarg_lock);
- return 0;
-}
-
/*
* Move as many buffers as specified to the supplied list
* idicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
if (!xfsconvertd_workqueue)
goto out_destroy_xfsdatad_workqueue;
- register_shrinker(&xfs_buf_shake);
return 0;
out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- unregister_shrinker(&xfs_buf_shake);
destroy_workqueue(xfsconvertd_workqueue);
destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f3..a76c242 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
/* per device delwri queue */
struct task_struct *bt_task;
- struct list_head bt_list;
struct list_head bt_delwrite_queue;
spinlock_t bt_delwrite_lock;
unsigned long bt_flags;
+
+ /* LRU control structures */
+ struct shrinker bt_shrinker;
+ struct list_head bt_lru;
+ spinlock_t bt_lru_lock;
+ unsigned int bt_lru_nr;
} xfs_buftarg_t;
/*
@@ -164,9 +169,11 @@ typedef struct xfs_buf {
xfs_off_t b_file_offset; /* offset in file */
size_t b_buffer_length;/* size of buffer in bytes */
atomic_t b_hold; /* reference count */
+ atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
+ struct list_head b_lru; /* lru list */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
-#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0)
+static inline void
+xfs_buf_set_ref(
+ struct xfs_buf *bp,
+ int lru_ref)
+{
+ atomic_set(&bp->b_lru_ref, lru_ref);
+}
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74..fc0114d 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
else
fileid_type = FILEID_INO32_GEN_PARENT;
- /* filesystem may contain 64bit inode numbers */
- if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
+ /*
+ * If the the filesystem may contain 64bit inode numbers, we need
+ * to use larger file handles that can represent them.
+ *
+ * While we only allocate inodes that do not fit into 32 bits any
+ * large enough filesystem may contain them, thus the slightly
+ * confusing looking conditional below.
+ */
+ if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+ (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
fileid_type |= XFS_FILEID_TYPE_64FLAG;
/*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd7..0964949 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
#include <kmem.h>
#include <mrlock.h>
-#include <sv.h>
#include <time.h>
#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964..c51faaa 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -834,8 +834,11 @@ xfsaild_wakeup(
struct xfs_ail *ailp,
xfs_lsn_t threshold_lsn)
{
- ailp->xa_target = threshold_lsn;
- wake_up_process(ailp->xa_task);
+ /* only ever move the target forwards */
+ if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
+ ailp->xa_target = threshold_lsn;
+ wake_up_process(ailp->xa_task);
+ }
}
STATIC int
@@ -847,8 +850,17 @@ xfsaild(
long tout = 0; /* milliseconds */
while (!kthread_should_stop()) {
- schedule_timeout_interruptible(tout ?
- msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+ /*
+ * for short sleeps indicating congestion, don't allow us to
+ * get woken early. Otherwise all we do is bang on the AIL lock
+ * without making progress.
+ */
+ if (tout && tout <= 20)
+ __set_current_state(TASK_KILLABLE);
+ else
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(tout ?
+ msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
/* swsusp */
try_to_freeze();
@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
*/
ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+ &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
xfs_inactive(ip);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7c..a02480d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
{
struct inode *inode = VFS_I(ip);
+ ASSERT(rcu_read_lock_held());
+
+ /*
+ * check for stale RCU freed inode
+ *
+ * If the inode has been reallocated, it doesn't matter if it's not in
+ * the AG we are walking - we are walking for writeback, so if it
+ * passes all the "valid inode" checks and is dirty, then we'll write
+ * it back anyway. If it has been reallocated and still being
+ * initialised, the XFS_INEW check below will catch it.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino)
+ goto out_unlock_noent;
+
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ goto out_unlock_noent;
+ spin_unlock(&ip->i_flags_lock);
+
/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return EFSCORRUPTED;
- /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
- return ENOENT;
-
/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
/* inode is valid */
return 0;
+
+out_unlock_noent:
+ spin_unlock(&ip->i_flags_lock);
+ return ENOENT;
}
STATIC int
@@ -98,12 +118,12 @@ restart:
int error = 0;
int i;
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH);
if (!nr_found) {
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
break;
}
@@ -118,18 +138,26 @@ restart:
batch[i] = NULL;
/*
- * Update the index for the next lookup. Catch overflows
- * into the next AG range which can occur if we have inodes
- * in the last block of the AG and we are currently
- * pointing to the last inode.
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
*/
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1;
}
/* unlock now we've grabbed the inodes. */
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
for (i = 0; i < nr_found; i++) {
if (!batch[i])
@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- write_lock(&pag->pag_ici_lock);
+ spin_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
__xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
+ spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
}
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
struct xfs_inode *ip,
int flags)
{
+ ASSERT(rcu_read_lock_held());
+
+ /* quick check for stale RCU freed inode */
+ if (!ip->i_ino)
+ return 1;
/*
- * do some unlocked checks first to avoid unnecceary lock traffic.
+ * do some unlocked checks first to avoid unnecessary lock traffic.
* The first is a flush lock check, the second is a already in reclaim
* check. Only do these checks if we are not going to block on locks.
*/
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
* The radix tree lock here protects a thread in xfs_iget from racing
* with us starting reclaim on the inode. Once we have the
* XFS_IRECLAIM flag set it will not touch us.
+ *
+ * Due to RCU lookup, we may find inodes that have been freed and only
+ * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
+ * aren't candidates for reclaim at all, so we must check the
+ * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
*/
spin_lock(&ip->i_flags_lock);
- ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
- if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
- /* ignore as it is already under reclaim */
+ if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock);
return 1;
}
@@ -795,12 +833,12 @@ reclaim:
* added to the tree assert that it's been there before to catch
* problems with the inode life time early on.
*/
- write_lock(&pag->pag_ici_lock);
+ spin_lock(&pag->pag_ici_lock);
if (!radix_tree_delete(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
ASSERT(0);
__xfs_inode_clear_reclaim(pag, ip);
- write_unlock(&pag->pag_ici_lock);
+ spin_unlock(&pag->pag_ici_lock);
/*
* Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +902,14 @@ restart:
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i;
- write_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
nr_found = radix_tree_gang_lookup_tag(
&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH,
XFS_ICI_RECLAIM_TAG);
if (!nr_found) {
- write_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
break;
}
@@ -891,14 +929,24 @@ restart:
* occur if we have inodes in the last block of
* the AG and we are currently pointing to the
* last inode.
+ *
+ * Because we may see inodes that are from the
+ * wrong AG due to RCU freeing and
+ * reallocation, only update the index if it
+ * lies in this AG. It was a race that lead us
+ * to see this inode, so another lookup from
+ * the same index will not find it again.
*/
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+ pag->pag_agno)
+ continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1;
}
/* unlock now we've grabbed the inodes. */
- write_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
for (i = 0; i < nr_found; i++) {
if (!batch[i])
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e9..647af2a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__field(int, curr_res)
__field(int, unit_res)
__field(unsigned int, flags)
- __field(void *, reserve_headq)
- __field(void *, write_headq)
+ __field(int, reserveq)
+ __field(int, writeq)
__field(int, grant_reserve_cycle)
__field(int, grant_reserve_bytes)
__field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_res = tic->t_curr_res;
__entry->unit_res = tic->t_unit_res;
__entry->flags = tic->t_flags;
- __entry->reserve_headq = log->l_reserve_headq;
- __entry->write_headq = log->l_write_headq;
- __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
- __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
- __entry->grant_write_cycle = log->l_grant_write_cycle;
- __entry->grant_write_bytes = log->l_grant_write_bytes;
+ __entry->reserveq = list_empty(&log->l_reserveq);
+ __entry->writeq = list_empty(&log->l_writeq);
+ xlog_crack_grant_head(&log->l_grant_reserve_head,
+ &__entry->grant_reserve_cycle,
+ &__entry->grant_reserve_bytes);
+ xlog_crack_grant_head(&log->l_grant_write_head,
+ &__entry->grant_write_cycle,
+ &__entry->grant_write_bytes);
__entry->curr_cycle = log->l_curr_cycle;
__entry->curr_block = log->l_curr_block;
- __entry->tail_lsn = log->l_tail_lsn;
+ __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
),
TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
- "t_unit_res %u t_flags %s reserve_headq 0x%p "
- "write_headq 0x%p grant_reserve_cycle %d "
+ "t_unit_res %u t_flags %s reserveq %s "
+ "writeq %s grant_reserve_cycle %d "
"grant_reserve_bytes %d grant_write_cycle %d "
"grant_write_bytes %d curr_cycle %d curr_block %d "
"tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_res,
__entry->unit_res,
__print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
- __entry->reserve_headq,
- __entry->write_headq,
+ __entry->reserveq ? "empty" : "active",
+ __entry->writeq ? "empty" : "active",
__entry->grant_reserve_cycle,
__entry->grant_reserve_bytes,
__entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
-DECLARE_EVENT_CLASS(xfs_iomap_class,
+DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int flags, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, flags, irec),
+ int type, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, type, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
__field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, flags)
+ __field(int, type)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
__entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
- __entry->flags = flags;
+ __entry->type = type;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
- "offset 0x%llx count %zd flags %s "
+ "offset 0x%llx count %zd type %s "
"startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
__entry->new_size,
__entry->offset,
__entry->count,
- __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+ __print_symbolic(__entry->type, XFS_IO_TYPES),
__entry->startoff,
(__int64_t)__entry->startblock,
__entry->blockcount)
)
#define DEFINE_IOMAP_EVENT(name) \
-DEFINE_EVENT(xfs_iomap_class, name, \
+DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int flags, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, flags, irec))
-DEFINE_IOMAP_EVENT(xfs_iomap_enter);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+ int type, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
TP_ARGS(ip, offset, count))
DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
TP_PROTO(struct xfs_alloc_arg *args), \
TP_ARGS(args))
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a..d22aa31 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_freelist));
mutex_destroy(&dqp->q_qlock);
- sv_destroy(&dqp->q_pinwait);
kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a..58632cc 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
- rwlock_t pag_ici_lock; /* incore inode lock */
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */
struct mutex pag_ici_reclaim_lock; /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc4..fa8723f 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -577,61 +577,58 @@ xfs_alloc_ag_vextent_exact(
xfs_extlen_t rlen; /* length of returned extent */
ASSERT(args->alignment == 1);
+
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
+ args->agno, XFS_BTNUM_BNO);
+
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
* Look for the closest free block <= bno, it must contain bno
* if any free block does.
*/
- if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+ error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+ if (error)
goto error0;
- if (!i) {
- /*
- * Didn't find it, return null.
- */
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- args->agbno = NULLAGBLOCK;
- return 0;
- }
+ if (!i)
+ goto not_found;
+
/*
* Grab the freespace record.
*/
- if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+ error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+ if (error)
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
ASSERT(fbno <= args->agbno);
minend = args->agbno + args->minlen;
maxend = args->agbno + args->maxlen;
fend = fbno + flen;
+
/*
* Give up if the freespace isn't long enough for the minimum request.
*/
- if (fend < minend) {
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- args->agbno = NULLAGBLOCK;
- return 0;
- }
+ if (fend < minend)
+ goto not_found;
+
/*
* End of extent will be smaller of the freespace end and the
* maximal requested end.
- */
- end = XFS_AGBLOCK_MIN(fend, maxend);
- /*
+ *
* Fix the length according to mod and prod if given.
*/
+ end = XFS_AGBLOCK_MIN(fend, maxend);
args->len = end - args->agbno;
xfs_alloc_fix_len(args);
- if (!xfs_alloc_fix_minleft(args)) {
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- return 0;
- }
+ if (!xfs_alloc_fix_minleft(args))
+ goto not_found;
+
rlen = args->len;
ASSERT(args->agbno + rlen <= fend);
end = args->agbno + rlen;
+
/*
* We are allocating agbno for rlen [agbno .. end]
* Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +637,25 @@ xfs_alloc_ag_vextent_exact(
args->agno, XFS_BTNUM_CNT);
ASSERT(args->agbno + args->len <=
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
- args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+ error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+ args->len, XFSA_FIXUP_BNO_OK);
+ if (error) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
goto error0;
}
+
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_exact_done(args);
args->wasfromfl = 0;
+ trace_xfs_alloc_exact_done(args);
+ return 0;
+
+not_found:
+ /* Didn't find it, return null. */
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_exact_notfound(args);
return 0;
error0:
@@ -659,6 +665,95 @@ error0:
}
/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur **gcur, /* good cursor */
+ struct xfs_btree_cur **scur, /* searching cursor */
+ xfs_agblock_t gdiff, /* difference for search comparison */
+ xfs_agblock_t *sbno, /* extent found by search */
+ xfs_extlen_t *slen,
+ xfs_extlen_t *slena, /* aligned length */
+ int dir) /* 0 = search right, 1 = search left */
+{
+ xfs_agblock_t bno;
+ xfs_agblock_t new;
+ xfs_agblock_t sdiff;
+ int error;
+ int i;
+
+ /* The good extent is perfect, no need to search. */
+ if (!gdiff)
+ goto out_use_good;
+
+ /*
+ * Look until we find a better one, run out of space or run off the end.
+ */
+ do {
+ error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
+ args->minlen, &bno, slena);
+
+ /*
+ * The good extent is closer than this one.
+ */
+ if (!dir) {
+ if (bno >= args->agbno + gdiff)
+ goto out_use_good;
+ } else {
+ if (bno <= args->agbno - gdiff)
+ goto out_use_good;
+ }
+
+ /*
+ * Same distance, compare length and pick the best.
+ */
+ if (*slena >= args->minlen) {
+ args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+ xfs_alloc_fix_len(args);
+
+ sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, *sbno,
+ *slen, &new);
+
+ /*
+ * Choose closer size and invalidate other cursor.
+ */
+ if (sdiff < gdiff)
+ goto out_use_search;
+ goto out_use_good;
+ }
+
+ if (!dir)
+ error = xfs_btree_increment(*scur, 0, &i);
+ else
+ error = xfs_btree_decrement(*scur, 0, &i);
+ if (error)
+ goto error0;
+ } while (i);
+
+out_use_good:
+ xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+ *scur = NULL;
+ return 0;
+
+out_use_search:
+ xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+ *gcur = NULL;
+ return 0;
+
+error0:
+ /* caller invalidates cursors */
+ return error;
+}
+
+/*
* Allocate a variable extent near bno in the allocation group agno.
* Extent's length (returned in len) will be between minlen and maxlen,
* and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1020,45 @@ xfs_alloc_ag_vextent_near(
}
}
} while (bno_cur_lt || bno_cur_gt);
+
/*
* Got both cursors still active, need to find better entry.
*/
if (bno_cur_lt && bno_cur_gt) {
- /*
- * Left side is long enough, look for a right side entry.
- */
if (ltlena >= args->minlen) {
/*
- * Fix up the length.
+ * Left side is good, look for a right side entry.
*/
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
- rlen = args->len;
- ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+ ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, ltbno, ltlen, &ltnew);
+
+ error = xfs_alloc_find_best_extent(args,
+ &bno_cur_lt, &bno_cur_gt,
+ ltdiff, &gtbno, &gtlen, &gtlena,
+ 0 /* search right */);
+ } else {
+ ASSERT(gtlena >= args->minlen);
+
/*
- * Not perfect.
- */
- if (ltdiff) {
- /*
- * Look until we find a better one, run out of
- * space, or run off the end.
- */
- while (bno_cur_lt && bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(
- bno_cur_gt, &gtbno,
- &gtlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_alloc_compute_aligned(gtbno, gtlen,
- args->alignment, args->minlen,
- &gtbnoa, &gtlena);
- /*
- * The left one is clearly better.
- */
- if (gtbnoa >= args->agbno + ltdiff) {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- break;
- }
- /*
- * If we reach a big enough entry,
- * compare the two and pick the best.
- */
- if (gtlena >= args->minlen) {
- args->len =
- XFS_EXTLEN_MIN(gtlena,
- args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- gtdiff = xfs_alloc_compute_diff(
- args->agbno, rlen,
- args->alignment,
- gtbno, gtlen, &gtnew);
- /*
- * Right side is better.
- */
- if (gtdiff < ltdiff) {
- xfs_btree_del_cursor(
- bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- /*
- * Left side is better.
- */
- else {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- break;
- }
- /*
- * Fell off the right end.
- */
- if ((error = xfs_btree_increment(
- bno_cur_gt, 0, &i)))
- goto error0;
- if (!i) {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- break;
- }
- }
- }
- /*
- * The left side is perfect, trash the right side.
- */
- else {
- xfs_btree_del_cursor(bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- }
- /*
- * It's the right side that was found first, look left.
- */
- else {
- /*
- * Fix up the length.
+ * Right side is good, look for a left side entry.
*/
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
- rlen = args->len;
- gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+ gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, gtbno, gtlen, &gtnew);
- /*
- * Right side entry isn't perfect.
- */
- if (gtdiff) {
- /*
- * Look until we find a better one, run out of
- * space, or run off the end.
- */
- while (bno_cur_lt && bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(
- bno_cur_lt, &ltbno,
- &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_alloc_compute_aligned(ltbno, ltlen,
- args->alignment, args->minlen,
- &ltbnoa, &ltlena);
- /*
- * The right one is clearly better.
- */
- if (ltbnoa <= args->agbno - gtdiff) {
- xfs_btree_del_cursor(
- bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- break;
- }
- /*
- * If we reach a big enough entry,
- * compare the two and pick the best.
- */
- if (ltlena >= args->minlen) {
- args->len = XFS_EXTLEN_MIN(
- ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- ltdiff = xfs_alloc_compute_diff(
- args->agbno, rlen,
- args->alignment,
- ltbno, ltlen, &ltnew);
- /*
- * Left side is better.
- */
- if (ltdiff < gtdiff) {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- /*
- * Right side is better.
- */
- else {
- xfs_btree_del_cursor(
- bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- break;
- }
- /*
- * Fell off the left end.
- */
- if ((error = xfs_btree_decrement(
- bno_cur_lt, 0, &i)))
- goto error0;
- if (!i) {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- break;
- }
- }
- }
- /*
- * The right side is perfect, trash the left side.
- */
- else {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
+
+ error = xfs_alloc_find_best_extent(args,
+ &bno_cur_gt, &bno_cur_lt,
+ gtdiff, &ltbno, &ltlen, &ltlena,
+ 1 /* search left */);
}
+
+ if (error)
+ goto error0;
}
+
/*
* If we couldn't get anything, give up.
*/
@@ -1130,6 +1067,7 @@ xfs_alloc_ag_vextent_near(
args->agbno = NULLAGBLOCK;
return 0;
}
+
/*
* At this point we have selected a freespace entry, either to the
* left or to the right. If it's on the right, copy all the
@@ -1146,6 +1084,7 @@ xfs_alloc_ag_vextent_near(
j = 1;
} else
j = 0;
+
/*
* Fix up the length and compute the useful address.
*/
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8e..71e90dc2 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
* It didn't all fit, so we have to sort everything on hashval.
*/
sbsize = sf->hdr.count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+ sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
/*
* Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
args.dp = context->dp;
args.whichfork = XFS_ATTR_FORK;
args.valuelen = valuelen;
- args.value = kmem_alloc(valuelen, KM_SLEEP);
+ args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca..2f9e97c 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
return error;
}
ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (bp != NULL) {
+ if (bp)
XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
- }
*bpp = bp;
return 0;
}
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
switch (cur->bc_btnum) {
case XFS_BTNUM_BNO:
case XFS_BTNUM_CNT:
- XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
break;
case XFS_BTNUM_INO:
- XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
break;
case XFS_BTNUM_BMAP:
- XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
break;
default:
ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d..ed2b65f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -142,7 +142,7 @@ xfs_buf_item_log_check(
#endif
STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
-STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
+STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
/*
* This returns the number of log iovecs needed to log the
@@ -450,7 +450,7 @@ xfs_buf_item_unpin(
* xfs_trans_ail_delete() drops the AIL lock.
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
+ xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
} else {
@@ -918,15 +918,26 @@ xfs_buf_attach_iodone(
XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
}
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow for a single
+ * callback to be able to scan the remaining lip->li_bio_list for other items
+ * of the same type and callback to be processed in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. it removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
STATIC void
xfs_buf_do_callbacks(
- xfs_buf_t *bp,
- xfs_log_item_t *lip)
+ struct xfs_buf *bp)
{
- xfs_log_item_t *nlip;
+ struct xfs_log_item *lip;
- while (lip != NULL) {
- nlip = lip->li_bio_list;
+ while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
+ XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
ASSERT(lip->li_cb != NULL);
/*
* Clear the next pointer so we don't have any
@@ -936,7 +947,6 @@ xfs_buf_do_callbacks(
*/
lip->li_bio_list = NULL;
lip->li_cb(bp, lip);
- lip = nlip;
}
}
@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks(
ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
XFS_BUF_SUPER_STALE(bp);
trace_xfs_buf_item_iodone(bp, _RET_IP_);
- xfs_buf_do_callbacks(bp, lip);
+ xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
xfs_buf_ioend(bp, 0);
@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks(
return;
}
- xfs_buf_do_callbacks(bp, lip);
+ xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
xfs_buf_ioend(bp, 0);
@@ -1063,7 +1073,7 @@ xfs_buf_error_relse(
* We have to unpin the pinned buffers so do the
* callbacks.
*/
- xfs_buf_do_callbacks(bp, lip);
+ xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43..b6ecd20 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
xfs_buf_log_format_t bli_format; /* in-log header */
} xfs_buf_log_item_t;
-/*
- * This structure is used during recovery to record the buf log
- * items which have been canceled and should not be replayed.
- */
-typedef struct xfs_buf_cancel {
- xfs_daddr_t bc_blkno;
- uint bc_len;
- int bc_refcount;
- struct xfs_buf_cancel *bc_next;
-} xfs_buf_cancel_t;
-
void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687..75f2ef6 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
}
/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the
+ * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
+ * the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+ struct xfs_efi_log_item *efip)
+{
+ struct xfs_ail *ailp = efip->efi_item.li_ailp;
+
+ if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, &efip->efi_item);
+ xfs_efi_item_free(efip);
+ }
+}
+
+/*
* This returns the number of iovecs needed to log the given efi item.
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
uint size;
- ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+ ASSERT(atomic_read(&efip->efi_next_extent) ==
+ efip->efi_format.efi_nextents);
efip->efi_format.efi_type = XFS_LI_EFI;
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
}
/*
- * While EFIs cannot really be pinned, the unpin operation is the
- * last place at which the EFI is manipulated during a transaction.
- * Here we coordinate with xfs_efi_cancel() to determine who gets to
- * free the EFI.
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
+ * which the EFI is manipulated during a transaction. If we are being asked to
+ * remove the EFI it's because the transaction has been cancelled and by
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
+ * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
*/
STATIC void
xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
int remove)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_ail *ailp = lip->li_ailp;
-
- spin_lock(&ailp->xa_lock);
- if (efip->efi_flags & XFS_EFI_CANCELED) {
- if (remove)
- xfs_trans_del_item(lip);
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, lip);
+ if (remove) {
+ ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+ xfs_trans_del_item(lip);
xfs_efi_item_free(efip);
- } else {
- efip->efi_flags |= XFS_EFI_COMMITTED;
- spin_unlock(&ailp->xa_lock);
+ return;
}
+ __xfs_efi_release(efip);
}
/*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
}
/*
- * The EFI is logged only once and cannot be moved in the log, so
- * simply return the lsn at which it's been logged. The canceled
- * flag is not paid any attention here. Checking for that is delayed
- * until the EFI is unpinned.
+ * The EFI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged. For bulk transaction committed
+ * processing, the EFI may be processed but not yet unpinned prior to the EFD
+ * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
+ * when processing the EFD.
*/
STATIC xfs_lsn_t
xfs_efi_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
+ set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
return lsn;
}
@@ -230,6 +253,7 @@ xfs_efi_init(
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
+ atomic_set(&efip->efi_next_extent, 0);
return efip;
}
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
/*
- * This is called by the efd item code below to release references to
- * the given efi item. Each efd calls this with the number of
- * extents that it has logged, and when the sum of these reaches
- * the total number of extents logged by this efi item we can free
- * the efi item.
- *
- * Freeing the efi item requires that we remove it from the AIL.
- * We'll use the AIL lock to protect our counters as well as
- * the removal from the AIL.
+ * This is called by the efd item code below to release references to the given
+ * efi item. Each efd calls this with the number of extents that it has
+ * logged, and when the sum of these reaches the total number of extents logged
+ * by this efi item we can free the efi item.
*/
void
xfs_efi_release(xfs_efi_log_item_t *efip,
uint nextents)
{
- struct xfs_ail *ailp = efip->efi_item.li_ailp;
- int extents_left;
-
- ASSERT(efip->efi_next_extent > 0);
- ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
-
- spin_lock(&ailp->xa_lock);
- ASSERT(efip->efi_next_extent >= nextents);
- efip->efi_next_extent -= nextents;
- extents_left = efip->efi_next_extent;
- if (extents_left == 0) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
- xfs_efi_item_free(efip);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
+ ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
+ if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+ __xfs_efi_release(efip);
}
static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56..375f68e 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
#define XFS_EFI_MAX_FAST_EXTENTS 16
/*
- * Define EFI flags.
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
*/
-#define XFS_EFI_RECOVERED 0x1
-#define XFS_EFI_COMMITTED 0x2
-#define XFS_EFI_CANCELED 0x4
+#define XFS_EFI_RECOVERED 1
+#define XFS_EFI_COMMITTED 2
/*
* This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
- uint efi_flags; /* misc flags */
- uint efi_next_extent;
+ atomic_t efi_next_extent;
+ unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
} xfs_efi_log_item_t;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e..f56d30e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
} else
mp->m_maxicount = 0;
+ xfs_set_low_space_thresholds(mp);
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d7de5a3..cb9b6d1 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
/*
+ * Define xfs inode iolock lockdep classes. We need to ensure that all active
+ * inodes are considered the same for lockdep purposes, including inodes that
+ * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to
+ * guarantee the locks are considered the same when there are multiple lock
+ * initialisation siteѕ. Also, define a reclaimable inode class so it is
+ * obvious in lockdep reports which class the report is against.
+ */
+static struct lock_class_key xfs_iolock_active;
+struct lock_class_key xfs_iolock_reclaimable;
+
+/*
* Allocate and initialise an xfs_inode.
*/
STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
+ ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+ &xfs_iolock_active, "xfs_iolock_active");
/* initialise the xfs inode */
ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
ip->i_size = 0;
ip->i_new_size = 0;
- /* prevent anyone from using this yet */
- VFS_I(ip)->i_state = I_NEW;
-
return ip;
}
@@ -145,7 +156,18 @@ xfs_inode_free(
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
- call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback);
+ /*
+ * Because we use RCU freeing we need to ensure the inode always
+ * appears to be reclaimed with an invalid inode number when in the
+ * free state. The ip->i_flags_lock provides the barrier against lookup
+ * races.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
/*
@@ -155,14 +177,29 @@ static int
xfs_iget_cache_hit(
struct xfs_perag *pag,
struct xfs_inode *ip,
+ xfs_ino_t ino,
int flags,
- int lock_flags) __releases(pag->pag_ici_lock)
+ int lock_flags) __releases(RCU)
{
struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
int error;
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ * If the inode has been reallocated and set up, then the inode number
+ * will not match, so check for that, too.
+ */
spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
/*
* If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
ip->i_flags |= XFS_IRECLAIM;
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
error = -inode_init_always(mp->m_super, inode);
if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
goto out_error;
}
- write_lock(&pag->pag_ici_lock);
+ spin_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
ip->i_flags |= XFS_INEW;
__xfs_inode_clear_reclaim_tag(mp, pag, ip);
inode->i_state = I_NEW;
+
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+ &xfs_iolock_active, "xfs_iolock_active");
+
spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
+ spin_unlock(&pag->pag_ici_lock);
} else {
/* If the VFS inode is being torn down, pause and try again. */
if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
/* We've got a live one. */
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
trace_xfs_iget_hit(ip);
}
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
out_error:
spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
return error;
}
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
BUG();
}
- write_lock(&pag->pag_ici_lock);
+ spin_lock(&pag->pag_ici_lock);
/* insert the new inode */
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
ip->i_udquot = ip->i_gdquot = NULL;
xfs_iflags_set(ip, XFS_INEW);
- write_unlock(&pag->pag_ici_lock);
+ spin_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
*ipp = ip;
return 0;
out_preload_end:
- write_unlock(&pag->pag_ici_lock);
+ spin_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
if (lock_flags)
xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
xfs_agino_t agino;
/* reject inode numbers outside existing AGs */
- if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return EINVAL;
/* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
again:
error = 0;
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (ip) {
- error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+ error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
if (error)
goto out_error_or_again;
} else {
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
XFS_STATS_INC(xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a0..be7cf62 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
* around for a while. This helps to keep recently accessed
* meta-data in-core longer.
*/
- XFS_BUF_SET_REF(bp, XFS_INO_REF);
+ xfs_buf_set_ref(bp, XFS_INO_REF);
/*
* Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
*/
for (i = 0; i < ninodes; i++) {
retry:
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
- /* Inode not in memory or stale, nothing to do */
- if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
- read_unlock(&pag->pag_ici_lock);
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
continue;
}
/*
+ * because this is an RCU protected lookup, we could
+ * find a recently freed or even reallocated inode
+ * during the lookup. We need to check under the
+ * i_flags_lock for a valid inode here. Skip it if it
+ * is not valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum + i ||
+ __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
* Don't try to lock/unlock the current inode, but we
* _cannot_ skip the other inodes that we did not find
* in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
*/
if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
delay(1);
goto retry;
}
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
- read_lock(&pag->pag_ici_lock);
+ rcu_read_lock();
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
iq = ilist[i];
if (iq == ip)
continue;
- /* if the inode lies outside this cluster, we're done. */
- if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
- break;
+
+ /*
+ * because this is an RCU protected lookup, we could find a
+ * recently freed or even reallocated inode during the lookup.
+ * We need to check under the i_flags_lock for a valid inode
+ * here. Skip it if it is not valid or the wrong inode.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino ||
+ (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
}
out_free:
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
kmem_free(ilist);
out_put:
xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
* Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem.
*/
- read_unlock(&pag->pag_ici_lock);
+ rcu_read_unlock();
/*
* Clean up the buffer. If it was B_DELWRI, just release it --
* brelse can handle it with no problems. If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e..5c95fa8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
/*
* In-core inode flags.
*/
-#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
-#define XFS_ISTALE 0x0002 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW 0x0008 /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
+#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
+#define XFS_ISTALE 0x0002 /* inode has been staled */
+#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_INEW 0x0008 /* inode has just been allocated */
+#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
+#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
/*
* Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+extern struct lock_class_key xfs_iolock_reclaimable;
+
/*
* Flags for xfs_itruncate_start().
*/
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c..fd4f398 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
* flushed to disk. It is responsible for removing the inode item
* from the AIL if it has not been re-logged, and unlocking the inode's
* flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
*/
void
xfs_iflush_done(
struct xfs_buf *bp,
struct xfs_log_item *lip)
{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
- xfs_inode_t *ip = iip->ili_inode;
+ struct xfs_inode_log_item *iip;
+ struct xfs_log_item *blip;
+ struct xfs_log_item *next;
+ struct xfs_log_item *prev;
struct xfs_ail *ailp = lip->li_ailp;
+ int need_ail = 0;
+
+ /*
+ * Scan the buffer IO completions for other inodes being completed and
+ * attach them to the current inode log item.
+ */
+ blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ prev = NULL;
+ while (blip != NULL) {
+ if (lip->li_cb != xfs_iflush_done) {
+ prev = blip;
+ blip = blip->li_bio_list;
+ continue;
+ }
+
+ /* remove from list */
+ next = blip->li_bio_list;
+ if (!prev) {
+ XFS_BUF_SET_FSPRIVATE(bp, next);
+ } else {
+ prev->li_bio_list = next;
+ }
+
+ /* add to current list */
+ blip->li_bio_list = lip->li_bio_list;
+ lip->li_bio_list = blip;
+
+ /*
+ * while we have the item, do the unlocked check for needing
+ * the AIL lock.
+ */
+ iip = INODE_ITEM(blip);
+ if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+ need_ail++;
+
+ blip = next;
+ }
+
+ /* make sure we capture the state of the initial inode. */
+ iip = INODE_ITEM(lip);
+ if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+ need_ail++;
/*
* We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
* the lock since it's cheaper, and then we recheck while
* holding the lock before removing the inode from the AIL.
*/
- if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
+ if (need_ail) {
+ struct xfs_log_item *log_items[need_ail];
+ int i = 0;
spin_lock(&ailp->xa_lock);
- if (lip->li_lsn == iip->ili_flush_lsn) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, lip);
- } else {
- spin_unlock(&ailp->xa_lock);
+ for (blip = lip; blip; blip = blip->li_bio_list) {
+ iip = INODE_ITEM(blip);
+ if (iip->ili_logged &&
+ blip->li_lsn == iip->ili_flush_lsn) {
+ log_items[i++] = blip;
+ }
+ ASSERT(i <= need_ail);
}
+ /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+ xfs_trans_ail_delete_bulk(ailp, log_items, i);
}
- iip->ili_logged = 0;
/*
- * Clear the ili_last_fields bits now that we know that the
- * data corresponding to them is safely on disk.
+ * clean up and unlock the flush lock now we are done. We can clear the
+ * ili_last_fields bits now that we know that the data corresponding to
+ * them is safely on disk.
*/
- iip->ili_last_fields = 0;
+ for (blip = lip; blip; blip = next) {
+ next = blip->li_bio_list;
+ blip->li_bio_list = NULL;
- /*
- * Release the inode's flush lock since we're done with it.
- */
- xfs_ifunlock(ip);
+ iip = INODE_ITEM(blip);
+ iip->ili_logged = 0;
+ iip->ili_last_fields = 0;
+ xfs_ifunlock(iip->ili_inode);
+ }
}
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2057614..55582bd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS 2
#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
-STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
- int, struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
- struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
- struct xfs_bmbt_irec *, int *);
-
-int
-xfs_iomap(
- struct xfs_inode *ip,
- xfs_off_t offset,
- ssize_t count,
- int flags,
- struct xfs_bmbt_irec *imap,
- int *nimaps,
- int *new)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb, end_fsb;
- int error = 0;
- int lockmode = 0;
- int bmapi_flags = 0;
-
- ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-
- *new = 0;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
-
- switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
- case BMAPI_READ:
- lockmode = xfs_ilock_map_shared(ip);
- bmapi_flags = XFS_BMAPI_ENTIRE;
- break;
- case BMAPI_WRITE:
- lockmode = XFS_ILOCK_EXCL;
- if (flags & BMAPI_IGNSTATE)
- bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
- xfs_ilock(ip, lockmode);
- break;
- case BMAPI_ALLOCATE:
- lockmode = XFS_ILOCK_SHARED;
- bmapi_flags = XFS_BMAPI_ENTIRE;
-
- /* Attempt non-blocking lock */
- if (flags & BMAPI_TRYLOCK) {
- if (!xfs_ilock_nowait(ip, lockmode))
- return XFS_ERROR(EAGAIN);
- } else {
- xfs_ilock(ip, lockmode);
- }
- break;
- default:
- BUG();
- }
-
- ASSERT(offset <= mp->m_maxioffset);
- if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
- count = mp->m_maxioffset - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
- error = xfs_bmapi(NULL, ip, offset_fsb,
- (xfs_filblks_t)(end_fsb - offset_fsb),
- bmapi_flags, NULL, 0, imap,
- nimaps, NULL);
-
- if (error)
- goto out;
-
- switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
- case BMAPI_WRITE:
- /* If we found an extent, return it */
- if (*nimaps &&
- (imap->br_startblock != HOLESTARTBLOCK) &&
- (imap->br_startblock != DELAYSTARTBLOCK)) {
- trace_xfs_iomap_found(ip, offset, count, flags, imap);
- break;
- }
-
- if (flags & BMAPI_DIRECT) {
- error = xfs_iomap_write_direct(ip, offset, count, flags,
- imap, nimaps);
- } else {
- error = xfs_iomap_write_delay(ip, offset, count, flags,
- imap, nimaps);
- }
- if (!error) {
- trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
- }
- *new = 1;
- break;
- case BMAPI_ALLOCATE:
- /* If we found an extent, return it */
- xfs_iunlock(ip, lockmode);
- lockmode = 0;
-
- if (*nimaps && !isnullstartblock(imap->br_startblock)) {
- trace_xfs_iomap_found(ip, offset, count, flags, imap);
- break;
- }
-
- error = xfs_iomap_write_allocate(ip, offset, count,
- imap, nimaps);
- break;
- }
-
- ASSERT(*nimaps <= 1);
-
-out:
- if (lockmode)
- xfs_iunlock(ip, lockmode);
- return XFS_ERROR(error);
-}
-
STATIC int
xfs_iomap_eof_align_last_fsb(
xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
return EFSCORRUPTED;
}
-STATIC int
+int
xfs_iomap_write_direct(
xfs_inode_t *ip,
xfs_off_t offset,
size_t count,
- int flags,
xfs_bmbt_irec_t *imap,
- int *nmaps)
+ int nmaps)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
if (error)
goto error_out;
} else {
- if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+ if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
imap->br_blockcount +
imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip);
bmapi_flag = XFS_BMAPI_WRITE;
- if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
+ if (offset < ip->i_size || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
goto error_out;
}
- *nmaps = 1;
return 0;
error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
error1: /* Just cancel transaction */
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- *nmaps = 0; /* nothing set-up here */
error_out:
return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
* If the caller is doing a write at the end of the file, then extend the
* allocation out to the file system's write iosize. We clean up any extra
* space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it it not needed.
*/
STATIC int
xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
xfs_inode_t *ip,
xfs_off_t offset,
size_t count,
- int ioflag,
xfs_bmbt_irec_t *imap,
int nimaps,
int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
xfs_filblks_t count_fsb;
xfs_fsblock_t firstblock;
int n, error, imaps;
+ int found_delalloc = 0;
*prealloc = 0;
if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
return 0;
start_fsb += imap[n].br_blockcount;
count_fsb -= imap[n].br_blockcount;
+
+ if (imap[n].br_startblock == DELAYSTARTBLOCK)
+ found_delalloc = 1;
}
}
- *prealloc = 1;
+ if (!found_delalloc)
+ *prealloc = 1;
return 0;
}
-STATIC int
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum prealocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip)
+{
+ xfs_fsblock_t alloc_blocks = 0;
+
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+ int shift = 0;
+ int64_t freesp;
+
+ alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+ alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+ rounddown_pow_of_two(alloc_blocks));
+
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ freesp = mp->m_sb.sb_fdblocks;
+ if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+ shift++;
+ }
+ if (shift)
+ alloc_blocks >>= shift;
+ }
+
+ if (alloc_blocks < mp->m_writeio_blocks)
+ alloc_blocks = mp->m_writeio_blocks;
+
+ return alloc_blocks;
+}
+
+int
xfs_iomap_write_delay(
xfs_inode_t *ip,
xfs_off_t offset,
size_t count,
- int ioflag,
- xfs_bmbt_irec_t *ret_imap,
- int *nmaps)
+ xfs_bmbt_irec_t *ret_imap)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
extsz = xfs_get_extsz_hint(ip);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
- ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
+ imap, XFS_WRITE_IMAPS, &prealloc);
if (error)
return error;
retry:
if (prealloc) {
+ xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+
aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
- last_fsb = ioalign + mp->m_writeio_blocks;
+ last_fsb = ioalign + alloc_blocks;
} else {
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
}
@@ -496,22 +426,31 @@ retry:
XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
&nimaps, NULL);
- if (error && (error != ENOSPC))
+ switch (error) {
+ case 0:
+ case ENOSPC:
+ case EDQUOT:
+ break;
+ default:
return XFS_ERROR(error);
+ }
/*
- * If bmapi returned us nothing, and if we didn't get back EDQUOT,
- * then we must have run out of space - flush all other inodes with
- * delalloc blocks and retry without EOF preallocation.
+ * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
+ * ENOSPC, * flush all other inodes with delalloc blocks to free up
+ * some of the excess reserved metadata space. For both cases, retry
+ * without EOF preallocation.
*/
if (nimaps == 0) {
trace_xfs_delalloc_enospc(ip, offset, count);
if (flushed)
- return XFS_ERROR(ENOSPC);
+ return XFS_ERROR(error ? error : ENOSPC);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_flush_inodes(ip);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (error == ENOSPC) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_flush_inodes(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ }
flushed = 1;
error = 0;
@@ -523,8 +462,6 @@ retry:
return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
*ret_imap = imap[0];
- *nmaps = 1;
-
return 0;
}
@@ -538,13 +475,12 @@ retry:
* We no longer bother to look at the incoming map - all we have to
* guarantee is that whatever we allocate fills the required range.
*/
-STATIC int
+int
xfs_iomap_write_allocate(
xfs_inode_t *ip,
xfs_off_t offset,
size_t count,
- xfs_bmbt_irec_t *imap,
- int *retmap)
+ xfs_bmbt_irec_t *imap)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
int error = 0;
int nres;
- *retmap = 0;
-
/*
* Make sure that the dquots are there.
*/
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff +
imap->br_blockcount))) {
- *retmap = 1;
XFS_STATS_INC(xs_xstrat_quick);
return 0;
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a43..8061576 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
-/* base extent manipulation calls */
-#define BMAPI_READ (1 << 0) /* read extents */
-#define BMAPI_WRITE (1 << 1) /* create extents */
-#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
-
-/* modifiers */
-#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
-#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
-#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
-#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
-
-#define BMAPI_FLAGS \
- { BMAPI_READ, "READ" }, \
- { BMAPI_WRITE, "WRITE" }, \
- { BMAPI_ALLOCATE, "ALLOCATE" }, \
- { BMAPI_IGNSTATE, "IGNSTATE" }, \
- { BMAPI_DIRECT, "DIRECT" }, \
- { BMAPI_TRYLOCK, "TRYLOCK" }
-
struct xfs_inode;
struct xfs_bmbt_irec;
-extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
- struct xfs_bmbt_irec *, int *, int *);
+extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *, int);
+extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *);
extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9..0bf24b1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
xfs_buftarg_t *log_target,
xfs_daddr_t blk_offset,
int num_bblks);
-STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
+STATIC int xlog_space_left(struct log *log, atomic64_t *head);
STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
STATIC void xlog_dealloc_log(xlog_t *log);
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
/* local functions to manipulate grant head */
STATIC int xlog_grant_log_space(xlog_t *log,
xlog_ticket_t *xtic);
-STATIC void xlog_grant_push_ail(xfs_mount_t *mp,
+STATIC void xlog_grant_push_ail(struct log *log,
int need_bytes);
STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
#if defined(DEBUG)
STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
-STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
+STATIC void xlog_verify_grant_tail(struct log *log);
STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
int count, boolean_t syncing);
STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
xfs_lsn_t tail_lsn);
#else
#define xlog_verify_dest_ptr(a,b)
-#define xlog_verify_grant_head(a,b)
+#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c,d)
#define xlog_verify_tail_lsn(a,b,c)
#endif
STATIC int xlog_iclogs_empty(xlog_t *log);
-
static void
-xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
+xlog_grant_sub_space(
+ struct log *log,
+ atomic64_t *head,
+ int bytes)
{
- if (*qp) {
- tic->t_next = (*qp);
- tic->t_prev = (*qp)->t_prev;
- (*qp)->t_prev->t_next = tic;
- (*qp)->t_prev = tic;
- } else {
- tic->t_prev = tic->t_next = tic;
- *qp = tic;
- }
+ int64_t head_val = atomic64_read(head);
+ int64_t new, old;
- tic->t_flags |= XLOG_TIC_IN_Q;
-}
+ do {
+ int cycle, space;
-static void
-xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
-{
- if (tic == tic->t_next) {
- *qp = NULL;
- } else {
- *qp = tic->t_next;
- tic->t_next->t_prev = tic->t_prev;
- tic->t_prev->t_next = tic->t_next;
- }
+ xlog_crack_grant_head_val(head_val, &cycle, &space);
- tic->t_next = tic->t_prev = NULL;
- tic->t_flags &= ~XLOG_TIC_IN_Q;
+ space -= bytes;
+ if (space < 0) {
+ space += log->l_logsize;
+ cycle--;
+ }
+
+ old = head_val;
+ new = xlog_assign_grant_head_val(cycle, space);
+ head_val = atomic64_cmpxchg(head, old, new);
+ } while (head_val != old);
}
static void
-xlog_grant_sub_space(struct log *log, int bytes)
+xlog_grant_add_space(
+ struct log *log,
+ atomic64_t *head,
+ int bytes)
{
- log->l_grant_write_bytes -= bytes;
- if (log->l_grant_write_bytes < 0) {
- log->l_grant_write_bytes += log->l_logsize;
- log->l_grant_write_cycle--;
- }
-
- log->l_grant_reserve_bytes -= bytes;
- if ((log)->l_grant_reserve_bytes < 0) {
- log->l_grant_reserve_bytes += log->l_logsize;
- log->l_grant_reserve_cycle--;
- }
+ int64_t head_val = atomic64_read(head);
+ int64_t new, old;
-}
+ do {
+ int tmp;
+ int cycle, space;
-static void
-xlog_grant_add_space_write(struct log *log, int bytes)
-{
- int tmp = log->l_logsize - log->l_grant_write_bytes;
- if (tmp > bytes)
- log->l_grant_write_bytes += bytes;
- else {
- log->l_grant_write_cycle++;
- log->l_grant_write_bytes = bytes - tmp;
- }
-}
+ xlog_crack_grant_head_val(head_val, &cycle, &space);
-static void
-xlog_grant_add_space_reserve(struct log *log, int bytes)
-{
- int tmp = log->l_logsize - log->l_grant_reserve_bytes;
- if (tmp > bytes)
- log->l_grant_reserve_bytes += bytes;
- else {
- log->l_grant_reserve_cycle++;
- log->l_grant_reserve_bytes = bytes - tmp;
- }
-}
+ tmp = log->l_logsize - space;
+ if (tmp > bytes)
+ space += bytes;
+ else {
+ space = bytes - tmp;
+ cycle++;
+ }
-static inline void
-xlog_grant_add_space(struct log *log, int bytes)
-{
- xlog_grant_add_space_write(log, bytes);
- xlog_grant_add_space_reserve(log, bytes);
+ old = head_val;
+ new = xlog_assign_grant_head_val(cycle, space);
+ head_val = atomic64_cmpxchg(head, old, new);
+ } while (head_val != old);
}
static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
trace_xfs_log_reserve(log, internal_ticket);
- xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
+ xlog_grant_push_ail(log, internal_ticket->t_unit_res);
retval = xlog_regrant_write_log_space(log, internal_ticket);
} else {
/* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
trace_xfs_log_reserve(log, internal_ticket);
- xlog_grant_push_ail(mp,
+ xlog_grant_push_ail(log,
(internal_ticket->t_unit_res *
internal_ticket->t_cnt));
retval = xlog_grant_log_space(log, internal_ticket);
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
iclog->ic_state == XLOG_STATE_DIRTY)) {
if (!XLOG_FORCED_SHUTDOWN(log)) {
- sv_wait(&iclog->ic_force_wait, PMEM,
- &log->l_icloglock, s);
+ xlog_wait(&iclog->ic_force_wait,
+ &log->l_icloglock);
} else {
spin_unlock(&log->l_icloglock);
}
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
|| iclog->ic_state == XLOG_STATE_DIRTY
|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
- sv_wait(&iclog->ic_force_wait, PMEM,
- &log->l_icloglock, s);
+ xlog_wait(&iclog->ic_force_wait,
+ &log->l_icloglock);
} else {
spin_unlock(&log->l_icloglock);
}
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
{
xlog_ticket_t *tic;
xlog_t *log = mp->m_log;
- int need_bytes, free_bytes, cycle, bytes;
+ int need_bytes, free_bytes;
if (XLOG_FORCED_SHUTDOWN(log))
return;
- if (tail_lsn == 0) {
- /* needed since sync_lsn is 64 bits */
- spin_lock(&log->l_icloglock);
- tail_lsn = log->l_last_sync_lsn;
- spin_unlock(&log->l_icloglock);
- }
-
- spin_lock(&log->l_grant_lock);
+ if (tail_lsn == 0)
+ tail_lsn = atomic64_read(&log->l_last_sync_lsn);
- /* Also an invalid lsn. 1 implies that we aren't passing in a valid
- * tail_lsn.
- */
- if (tail_lsn != 1) {
- log->l_tail_lsn = tail_lsn;
- }
+ /* tail_lsn == 1 implies that we weren't passed a valid value. */
+ if (tail_lsn != 1)
+ atomic64_set(&log->l_tail_lsn, tail_lsn);
- if ((tic = log->l_write_headq)) {
+ if (!list_empty_careful(&log->l_writeq)) {
#ifdef DEBUG
if (log->l_flags & XLOG_ACTIVE_RECOVERY)
panic("Recovery problem");
#endif
- cycle = log->l_grant_write_cycle;
- bytes = log->l_grant_write_bytes;
- free_bytes = xlog_space_left(log, cycle, bytes);
- do {
+ spin_lock(&log->l_grant_write_lock);
+ free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+ list_for_each_entry(tic, &log->l_writeq, t_queue) {
ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
if (free_bytes < tic->t_unit_res && tail_lsn != 1)
break;
tail_lsn = 0;
free_bytes -= tic->t_unit_res;
- sv_signal(&tic->t_wait);
- tic = tic->t_next;
- } while (tic != log->l_write_headq);
+ trace_xfs_log_regrant_write_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+ spin_unlock(&log->l_grant_write_lock);
}
- if ((tic = log->l_reserve_headq)) {
+
+ if (!list_empty_careful(&log->l_reserveq)) {
#ifdef DEBUG
if (log->l_flags & XLOG_ACTIVE_RECOVERY)
panic("Recovery problem");
#endif
- cycle = log->l_grant_reserve_cycle;
- bytes = log->l_grant_reserve_bytes;
- free_bytes = xlog_space_left(log, cycle, bytes);
- do {
+ spin_lock(&log->l_grant_reserve_lock);
+ free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ list_for_each_entry(tic, &log->l_reserveq, t_queue) {
if (tic->t_flags & XLOG_TIC_PERM_RESERV)
need_bytes = tic->t_unit_res*tic->t_cnt;
else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
break;
tail_lsn = 0;
free_bytes -= need_bytes;
- sv_signal(&tic->t_wait);
- tic = tic->t_next;
- } while (tic != log->l_reserve_headq);
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+ spin_unlock(&log->l_grant_reserve_lock);
}
- spin_unlock(&log->l_grant_lock);
-} /* xfs_log_move_tail */
+}
/*
* Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
-xlog_assign_tail_lsn(xfs_mount_t *mp)
+xlog_assign_tail_lsn(
+ struct xfs_mount *mp)
{
- xfs_lsn_t tail_lsn;
- xlog_t *log = mp->m_log;
+ xfs_lsn_t tail_lsn;
+ struct log *log = mp->m_log;
tail_lsn = xfs_trans_ail_tail(mp->m_ail);
- spin_lock(&log->l_grant_lock);
- if (tail_lsn != 0) {
- log->l_tail_lsn = tail_lsn;
- } else {
- tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
- }
- spin_unlock(&log->l_grant_lock);
+ if (!tail_lsn)
+ tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
-} /* xlog_assign_tail_lsn */
-
+}
/*
* Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
* result is that we return the size of the log as the amount of space left.
*/
STATIC int
-xlog_space_left(xlog_t *log, int cycle, int bytes)
-{
- int free_bytes;
- int tail_bytes;
- int tail_cycle;
-
- tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
- tail_cycle = CYCLE_LSN(log->l_tail_lsn);
- if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
- free_bytes = log->l_logsize - (bytes - tail_bytes);
- } else if ((tail_cycle + 1) < cycle) {
+xlog_space_left(
+ struct log *log,
+ atomic64_t *head)
+{
+ int free_bytes;
+ int tail_bytes;
+ int tail_cycle;
+ int head_cycle;
+ int head_bytes;
+
+ xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+ tail_bytes = BBTOB(tail_bytes);
+ if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+ free_bytes = log->l_logsize - (head_bytes - tail_bytes);
+ else if (tail_cycle + 1 < head_cycle)
return 0;
- } else if (tail_cycle < cycle) {
- ASSERT(tail_cycle == (cycle - 1));
- free_bytes = tail_bytes - bytes;
+ else if (tail_cycle < head_cycle) {
+ ASSERT(tail_cycle == (head_cycle - 1));
+ free_bytes = tail_bytes - head_bytes;
} else {
/*
* The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
"xlog_space_left: head behind tail\n"
" tail_cycle = %d, tail_bytes = %d\n"
" GH cycle = %d, GH bytes = %d",
- tail_cycle, tail_bytes, cycle, bytes);
+ tail_cycle, tail_bytes, head_cycle, head_bytes);
ASSERT(0);
free_bytes = log->l_logsize;
}
return free_bytes;
-} /* xlog_space_left */
+}
/*
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
log->l_flags |= XLOG_ACTIVE_RECOVERY;
log->l_prev_block = -1;
- log->l_tail_lsn = xlog_assign_lsn(1, 0);
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
- log->l_last_sync_lsn = log->l_tail_lsn;
+ xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
- log->l_grant_reserve_cycle = 1;
- log->l_grant_write_cycle = 1;
+ xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
+ xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
+ INIT_LIST_HEAD(&log->l_reserveq);
+ INIT_LIST_HEAD(&log->l_writeq);
+ spin_lock_init(&log->l_grant_reserve_lock);
+ spin_lock_init(&log->l_grant_write_lock);
error = EFSCORRUPTED;
if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
log->l_xbuf = bp;
spin_lock_init(&log->l_icloglock);
- spin_lock_init(&log->l_grant_lock);
- sv_init(&log->l_flush_wait, 0, "flush_wait");
+ init_waitqueue_head(&log->l_flush_wait);
/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
- sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
- sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+ init_waitqueue_head(&iclog->ic_force_wait);
+ init_waitqueue_head(&iclog->ic_write_wait);
iclogp = &iclog->ic_next;
}
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- if (iclog->ic_bp) {
- sv_destroy(&iclog->ic_force_wait);
- sv_destroy(&iclog->ic_write_wait);
+ if (iclog->ic_bp)
xfs_buf_free(iclog->ic_bp);
- }
kmem_free(iclog);
}
spinlock_destroy(&log->l_icloglock);
- spinlock_destroy(&log->l_grant_lock);
xfs_buf_free(log->l_xbuf);
out_free_log:
kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
* water mark. In this manner, we would be creating a low water mark.
*/
STATIC void
-xlog_grant_push_ail(xfs_mount_t *mp,
- int need_bytes)
+xlog_grant_push_ail(
+ struct log *log,
+ int need_bytes)
{
- xlog_t *log = mp->m_log; /* pointer to the log */
- xfs_lsn_t tail_lsn; /* lsn of the log tail */
- xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */
- int free_blocks; /* free blocks left to write to */
- int free_bytes; /* free bytes left to write to */
- int threshold_block; /* block in lsn we'd like to be at */
- int threshold_cycle; /* lsn cycle we'd like to be at */
- int free_threshold;
-
- ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
-
- spin_lock(&log->l_grant_lock);
- free_bytes = xlog_space_left(log,
- log->l_grant_reserve_cycle,
- log->l_grant_reserve_bytes);
- tail_lsn = log->l_tail_lsn;
- free_blocks = BTOBBT(free_bytes);
-
- /*
- * Set the threshold for the minimum number of free blocks in the
- * log to the maximum of what the caller needs, one quarter of the
- * log, and 256 blocks.
- */
- free_threshold = BTOBB(need_bytes);
- free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
- free_threshold = MAX(free_threshold, 256);
- if (free_blocks < free_threshold) {
- threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
- threshold_cycle = CYCLE_LSN(tail_lsn);
+ xfs_lsn_t threshold_lsn = 0;
+ xfs_lsn_t last_sync_lsn;
+ int free_blocks;
+ int free_bytes;
+ int threshold_block;
+ int threshold_cycle;
+ int free_threshold;
+
+ ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+ free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ free_blocks = BTOBBT(free_bytes);
+
+ /*
+ * Set the threshold for the minimum number of free blocks in the
+ * log to the maximum of what the caller needs, one quarter of the
+ * log, and 256 blocks.
+ */
+ free_threshold = BTOBB(need_bytes);
+ free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+ free_threshold = MAX(free_threshold, 256);
+ if (free_blocks >= free_threshold)
+ return;
+
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
+ &threshold_block);
+ threshold_block += free_threshold;
if (threshold_block >= log->l_logBBsize) {
- threshold_block -= log->l_logBBsize;
- threshold_cycle += 1;
+ threshold_block -= log->l_logBBsize;
+ threshold_cycle += 1;
}
- threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block);
+ threshold_lsn = xlog_assign_lsn(threshold_cycle,
+ threshold_block);
+ /*
+ * Don't pass in an lsn greater than the lsn of the last
+ * log record known to be on disk. Use a snapshot of the last sync lsn
+ * so that it doesn't change between the compare and the set.
+ */
+ last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+ threshold_lsn = last_sync_lsn;
- /* Don't pass in an lsn greater than the lsn of the last
- * log record known to be on disk.
+ /*
+ * Get the transaction layer to kick the dirty buffers out to
+ * disk asynchronously. No point in trying to do this if
+ * the filesystem is shutting down.
*/
- if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
- threshold_lsn = log->l_last_sync_lsn;
- }
- spin_unlock(&log->l_grant_lock);
-
- /*
- * Get the transaction layer to kick the dirty buffers out to
- * disk asynchronously. No point in trying to do this if
- * the filesystem is shutting down.
- */
- if (threshold_lsn &&
- !XLOG_FORCED_SHUTDOWN(log))
- xfs_trans_ail_push(log->l_ailp, threshold_lsn);
-} /* xlog_grant_push_ail */
+ if (!XLOG_FORCED_SHUTDOWN(log))
+ xfs_trans_ail_push(log->l_ailp, threshold_lsn);
+}
/*
* The bdstrat callback function for log bufs. This gives us a central
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log,
roundoff < BBTOB(1)));
/* move grant heads by roundoff in sync */
- spin_lock(&log->l_grant_lock);
- xlog_grant_add_space(log, roundoff);
- spin_unlock(&log->l_grant_lock);
+ xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
+ xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
/* put cycle number in every block */
xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
iclog = log->l_iclog;
for (i=0; i<log->l_iclog_bufs; i++) {
- sv_destroy(&iclog->ic_force_wait);
- sv_destroy(&iclog->ic_write_wait);
xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
kmem_free(iclog);
iclog = next_iclog;
}
spinlock_destroy(&log->l_icloglock);
- spinlock_destroy(&log->l_grant_lock);
xfs_buf_free(log->l_xbuf);
log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
lowest_lsn = xlog_get_lowest_lsn(log);
if (lowest_lsn &&
XFS_LSN_CMP(lowest_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
+ be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
iclog = iclog->ic_next;
continue; /* Leave this iclog for
* another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
iclog->ic_state = XLOG_STATE_CALLBACK;
- spin_unlock(&log->l_icloglock);
- /* l_last_sync_lsn field protected by
- * l_grant_lock. Don't worry about iclog's lsn.
- * No one else can be here except us.
+ /*
+ * update the last_sync_lsn before we drop the
+ * icloglock to ensure we are the only one that
+ * can update it.
*/
- spin_lock(&log->l_grant_lock);
- ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- log->l_last_sync_lsn =
- be64_to_cpu(iclog->ic_header.h_lsn);
- spin_unlock(&log->l_grant_lock);
+ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+ be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
+ atomic64_set(&log->l_last_sync_lsn,
+ be64_to_cpu(iclog->ic_header.h_lsn));
- } else {
- spin_unlock(&log->l_icloglock);
+ } else
ioerrors++;
- }
+
+ spin_unlock(&log->l_icloglock);
/*
* Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
xlog_state_clean_log(log);
/* wake up threads waiting in xfs_log_force() */
- sv_broadcast(&iclog->ic_force_wait);
+ wake_up_all(&iclog->ic_force_wait);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
spin_unlock(&log->l_icloglock);
if (wake)
- sv_broadcast(&log->l_flush_wait);
+ wake_up_all(&log->l_flush_wait);
}
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
* iclog buffer, we wake them all, one will get to do the
* I/O, the others get to wait for the result.
*/
- sv_broadcast(&iclog->ic_write_wait);
+ wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
} /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
XFS_STATS_INC(xs_log_noiclogs);
/* Wait for log writes to have flushed */
- sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
+ xlog_wait(&log->l_flush_wait, &log->l_icloglock);
goto restart;
}
@@ -2527,6 +2486,18 @@ restart:
*
* Once a ticket gets put onto the reserveq, it will only return after
* the needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off the reserveq under the
+ * l_grant_reserve_lock, we only need to take that lock if we are going
+ * to add the ticket to the queue and sleep. We can avoid taking the lock if the
+ * ticket was never added to the reserveq because the t_queue list head will be
+ * empty and we hold the only reference to it so it can safely be checked
+ * unlocked.
*/
STATIC int
xlog_grant_log_space(xlog_t *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
{
int free_bytes;
int need_bytes;
-#ifdef DEBUG
- xfs_lsn_t tail_lsn;
-#endif
-
#ifdef DEBUG
if (log->l_flags & XLOG_ACTIVE_RECOVERY)
panic("grant Recovery problem");
#endif
- /* Is there space or do we need to sleep? */
- spin_lock(&log->l_grant_lock);
-
trace_xfs_log_grant_enter(log, tic);
+ need_bytes = tic->t_unit_res;
+ if (tic->t_flags & XFS_LOG_PERM_RESERV)
+ need_bytes *= tic->t_ocnt;
+
/* something is already sleeping; insert new transaction at end */
- if (log->l_reserve_headq) {
- xlog_ins_ticketq(&log->l_reserve_headq, tic);
+ if (!list_empty_careful(&log->l_reserveq)) {
+ spin_lock(&log->l_grant_reserve_lock);
+ /* recheck the queue now we are locked */
+ if (list_empty(&log->l_reserveq)) {
+ spin_unlock(&log->l_grant_reserve_lock);
+ goto redo;
+ }
+ list_add_tail(&tic->t_queue, &log->l_reserveq);
trace_xfs_log_grant_sleep1(log, tic);
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
goto error_return;
XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
+ xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+
/*
* If we got an error, and the filesystem is shutting down,
* we'll catch it down below. So just continue...
*/
trace_xfs_log_grant_wake1(log, tic);
- spin_lock(&log->l_grant_lock);
}
- if (tic->t_flags & XFS_LOG_PERM_RESERV)
- need_bytes = tic->t_unit_res*tic->t_ocnt;
- else
- need_bytes = tic->t_unit_res;
redo:
if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
+ goto error_return_unlocked;
- free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
- log->l_grant_reserve_bytes);
+ free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
if (free_bytes < need_bytes) {
- if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
- xlog_ins_ticketq(&log->l_reserve_headq, tic);
+ spin_lock(&log->l_grant_reserve_lock);
+ if (list_empty(&tic->t_queue))
+ list_add_tail(&tic->t_queue, &log->l_reserveq);
trace_xfs_log_grant_sleep2(log, tic);
- spin_unlock(&log->l_grant_lock);
- xlog_grant_push_ail(log->l_mp, need_bytes);
- spin_lock(&log->l_grant_lock);
-
- XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-
- spin_lock(&log->l_grant_lock);
if (XLOG_FORCED_SHUTDOWN(log))
goto error_return;
- trace_xfs_log_grant_wake2(log, tic);
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+ trace_xfs_log_grant_wake2(log, tic);
goto redo;
- } else if (tic->t_flags & XLOG_TIC_IN_Q)
- xlog_del_ticketq(&log->l_reserve_headq, tic);
+ }
- /* we've got enough space */
- xlog_grant_add_space(log, need_bytes);
-#ifdef DEBUG
- tail_lsn = log->l_tail_lsn;
- /*
- * Check to make sure the grant write head didn't just over lap the
- * tail. If the cycles are the same, we can't be overlapping.
- * Otherwise, make sure that the cycles differ by exactly one and
- * check the byte count.
- */
- if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
- ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
- ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
+ if (!list_empty(&tic->t_queue)) {
+ spin_lock(&log->l_grant_reserve_lock);
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_reserve_lock);
}
-#endif
+
+ /* we've got enough space */
+ xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
+ xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
trace_xfs_log_grant_exit(log, tic);
- xlog_verify_grant_head(log, 1);
- spin_unlock(&log->l_grant_lock);
+ xlog_verify_grant_tail(log);
return 0;
- error_return:
- if (tic->t_flags & XLOG_TIC_IN_Q)
- xlog_del_ticketq(&log->l_reserve_headq, tic);
-
+error_return_unlocked:
+ spin_lock(&log->l_grant_reserve_lock);
+error_return:
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_reserve_lock);
trace_xfs_log_grant_error(log, tic);
/*
@@ -2638,7 +2597,6 @@ redo:
*/
tic->t_curr_res = 0;
tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
- spin_unlock(&log->l_grant_lock);
return XFS_ERROR(EIO);
} /* xlog_grant_log_space */
@@ -2646,17 +2604,14 @@ redo:
/*
* Replenish the byte reservation required by moving the grant write head.
*
- *
+ * Similar to xlog_grant_log_space, the function is structured to have a lock
+ * free fast path.
*/
STATIC int
xlog_regrant_write_log_space(xlog_t *log,
xlog_ticket_t *tic)
{
int free_bytes, need_bytes;
- xlog_ticket_t *ntic;
-#ifdef DEBUG
- xfs_lsn_t tail_lsn;
-#endif
tic->t_curr_res = tic->t_unit_res;
xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
panic("regrant Recovery problem");
#endif
- spin_lock(&log->l_grant_lock);
-
trace_xfs_log_regrant_write_enter(log, tic);
-
if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
+ goto error_return_unlocked;
/* If there are other waiters on the queue then give them a
* chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
* this transaction.
*/
need_bytes = tic->t_unit_res;
- if ((ntic = log->l_write_headq)) {
- free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
- log->l_grant_write_bytes);
- do {
+ if (!list_empty_careful(&log->l_writeq)) {
+ struct xlog_ticket *ntic;
+
+ spin_lock(&log->l_grant_write_lock);
+ free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+ list_for_each_entry(ntic, &log->l_writeq, t_queue) {
ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
if (free_bytes < ntic->t_unit_res)
break;
free_bytes -= ntic->t_unit_res;
- sv_signal(&ntic->t_wait);
- ntic = ntic->t_next;
- } while (ntic != log->l_write_headq);
-
- if (ntic != log->l_write_headq) {
- if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
- xlog_ins_ticketq(&log->l_write_headq, tic);
+ wake_up(&ntic->t_wait);
+ }
+ if (ntic != list_first_entry(&log->l_writeq,
+ struct xlog_ticket, t_queue)) {
+ if (list_empty(&tic->t_queue))
+ list_add_tail(&tic->t_queue, &log->l_writeq);
trace_xfs_log_regrant_write_sleep1(log, tic);
- spin_unlock(&log->l_grant_lock);
- xlog_grant_push_ail(log->l_mp, need_bytes);
- spin_lock(&log->l_grant_lock);
+ xlog_grant_push_ail(log, need_bytes);
XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_wait, PINOD|PLTWAIT,
- &log->l_grant_lock, s);
-
- /* If we're shutting down, this tic is already
- * off the queue */
- spin_lock(&log->l_grant_lock);
- if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
-
+ xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
trace_xfs_log_regrant_write_wake1(log, tic);
- }
+ } else
+ spin_unlock(&log->l_grant_write_lock);
}
redo:
if (XLOG_FORCED_SHUTDOWN(log))
- goto error_return;
+ goto error_return_unlocked;
- free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
- log->l_grant_write_bytes);
+ free_bytes = xlog_space_left(log, &log->l_grant_write_head);
if (free_bytes < need_bytes) {
- if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
- xlog_ins_ticketq(&log->l_write_headq, tic);
- spin_unlock(&log->l_grant_lock);
- xlog_grant_push_ail(log->l_mp, need_bytes);
- spin_lock(&log->l_grant_lock);
-
- XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_regrant_write_sleep2(log, tic);
-
- sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
+ spin_lock(&log->l_grant_write_lock);
+ if (list_empty(&tic->t_queue))
+ list_add_tail(&tic->t_queue, &log->l_writeq);
- /* If we're shutting down, this tic is already off the queue */
- spin_lock(&log->l_grant_lock);
if (XLOG_FORCED_SHUTDOWN(log))
goto error_return;
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ trace_xfs_log_regrant_write_sleep2(log, tic);
+ xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+
trace_xfs_log_regrant_write_wake2(log, tic);
goto redo;
- } else if (tic->t_flags & XLOG_TIC_IN_Q)
- xlog_del_ticketq(&log->l_write_headq, tic);
+ }
- /* we've got enough space */
- xlog_grant_add_space_write(log, need_bytes);
-#ifdef DEBUG
- tail_lsn = log->l_tail_lsn;
- if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
- ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
- ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
+ if (!list_empty(&tic->t_queue)) {
+ spin_lock(&log->l_grant_write_lock);
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_write_lock);
}
-#endif
+ /* we've got enough space */
+ xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
trace_xfs_log_regrant_write_exit(log, tic);
-
- xlog_verify_grant_head(log, 1);
- spin_unlock(&log->l_grant_lock);
+ xlog_verify_grant_tail(log);
return 0;
+ error_return_unlocked:
+ spin_lock(&log->l_grant_write_lock);
error_return:
- if (tic->t_flags & XLOG_TIC_IN_Q)
- xlog_del_ticketq(&log->l_reserve_headq, tic);
-
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_write_lock);
trace_xfs_log_regrant_write_error(log, tic);
/*
@@ -2778,7 +2714,6 @@ redo:
*/
tic->t_curr_res = 0;
tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
- spin_unlock(&log->l_grant_lock);
return XFS_ERROR(EIO);
} /* xlog_regrant_write_log_space */
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- spin_lock(&log->l_grant_lock);
- xlog_grant_sub_space(log, ticket->t_curr_res);
+ xlog_grant_sub_space(log, &log->l_grant_reserve_head,
+ ticket->t_curr_res);
+ xlog_grant_sub_space(log, &log->l_grant_write_head,
+ ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
trace_xfs_log_regrant_reserve_sub(log, ticket);
- xlog_verify_grant_head(log, 1);
-
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0) {
- spin_unlock(&log->l_grant_lock);
+ if (ticket->t_cnt > 0)
return;
- }
- xlog_grant_add_space_reserve(log, ticket->t_unit_res);
+ xlog_grant_add_space(log, &log->l_grant_reserve_head,
+ ticket->t_unit_res);
trace_xfs_log_regrant_reserve_exit(log, ticket);
- xlog_verify_grant_head(log, 0);
- spin_unlock(&log->l_grant_lock);
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
xlog_ungrant_log_space(xlog_t *log,
xlog_ticket_t *ticket)
{
+ int bytes;
+
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- spin_lock(&log->l_grant_lock);
trace_xfs_log_ungrant_enter(log, ticket);
-
- xlog_grant_sub_space(log, ticket->t_curr_res);
-
trace_xfs_log_ungrant_sub(log, ticket);
- /* If this is a permanent reservation ticket, we may be able to free
+ /*
+ * If this is a permanent reservation ticket, we may be able to free
* up more space based on the remaining count.
*/
+ bytes = ticket->t_curr_res;
if (ticket->t_cnt > 0) {
ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
- xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
+ bytes += ticket->t_unit_res*ticket->t_cnt;
}
+ xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
+ xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
+
trace_xfs_log_ungrant_exit(log, ticket);
- xlog_verify_grant_head(log, 1);
- spin_unlock(&log->l_grant_lock);
xfs_log_move_tail(log->l_mp, 1);
} /* xlog_ungrant_log_space */
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
/* update tail before writing to iclog */
- xlog_assign_tail_lsn(log->l_mp);
+ xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
sync++;
iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
- xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog, tail_lsn);
/* cycle incremented when incrementing curr_block */
}
spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
return XFS_ERROR(EIO);
}
XFS_STATS_INC(xs_log_force_sleep);
- sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
* only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
XFS_STATS_INC(xs_log_force_sleep);
- sv_wait(&iclog->ic_prev->ic_write_wait,
- PSWP, &log->l_icloglock, s);
+ xlog_wait(&iclog->ic_prev->ic_write_wait,
+ &log->l_icloglock);
if (log_flushed)
*log_flushed = 1;
already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
return XFS_ERROR(EIO);
}
XFS_STATS_INC(xs_log_force_sleep);
- sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
* only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
xlog_ticket_t *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
- if (atomic_dec_and_test(&ticket->t_ref)) {
- sv_destroy(&ticket->t_wait);
+ if (atomic_dec_and_test(&ticket->t_ref))
kmem_zone_free(xfs_log_ticket_zone, ticket);
- }
}
xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
}
atomic_set(&tic->t_ref, 1);
+ INIT_LIST_HEAD(&tic->t_queue);
tic->t_unit_res = unit_bytes;
tic->t_curr_res = unit_bytes;
tic->t_cnt = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
tic->t_trans_type = 0;
if (xflags & XFS_LOG_PERM_RESERV)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
+ init_waitqueue_head(&tic->t_wait);
xlog_tic_reset_res(tic);
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
}
STATIC void
-xlog_verify_grant_head(xlog_t *log, int equals)
+xlog_verify_grant_tail(
+ struct log *log)
{
- if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
- if (equals)
- ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
- else
- ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
- } else {
- ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
- ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
- }
-} /* xlog_verify_grant_head */
+ int tail_cycle, tail_blocks;
+ int cycle, space;
+
+ /*
+ * Check to make sure the grant write head didn't just over lap the
+ * tail. If the cycles are the same, we can't be overlapping.
+ * Otherwise, make sure that the cycles differ by exactly one and
+ * check the byte count.
+ */
+ xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+ if (tail_cycle != cycle) {
+ ASSERT(cycle - 1 == tail_cycle);
+ ASSERT(space <= BBTOB(tail_blocks));
+ }
+}
/* check if it will fit */
STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
xlog_cil_force(log);
/*
- * We must hold both the GRANT lock and the LOG lock,
- * before we mark the filesystem SHUTDOWN and wake
- * everybody up to tell the bad news.
+ * mark the filesystem and the as in a shutdown state and wake
+ * everybody up to tell them the bad news.
*/
spin_lock(&log->l_icloglock);
- spin_lock(&log->l_grant_lock);
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
if (mp->m_sb_bp)
XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
spin_unlock(&log->l_icloglock);
/*
- * We don't want anybody waiting for log reservations
- * after this. That means we have to wake up everybody
- * queued up on reserve_headq as well as write_headq.
- * In addition, we make sure in xlog_{re}grant_log_space
- * that we don't enqueue anything once the SHUTDOWN flag
- * is set, and this action is protected by the GRANTLOCK.
+ * We don't want anybody waiting for log reservations after this. That
+ * means we have to wake up everybody queued up on reserveq as well as
+ * writeq. In addition, we make sure in xlog_{re}grant_log_space that
+ * we don't enqueue anything once the SHUTDOWN flag is set, and this
+ * action is protected by the grant locks.
*/
- if ((tic = log->l_reserve_headq)) {
- do {
- sv_signal(&tic->t_wait);
- tic = tic->t_next;
- } while (tic != log->l_reserve_headq);
- }
-
- if ((tic = log->l_write_headq)) {
- do {
- sv_signal(&tic->t_wait);
- tic = tic->t_next;
- } while (tic != log->l_write_headq);
- }
- spin_unlock(&log->l_grant_lock);
+ spin_lock(&log->l_grant_reserve_lock);
+ list_for_each_entry(tic, &log->l_reserveq, t_queue)
+ wake_up(&tic->t_wait);
+ spin_unlock(&log->l_grant_reserve_lock);
+
+ spin_lock(&log->l_grant_write_lock);
+ list_for_each_entry(tic, &log->l_writeq, t_queue)
+ wake_up(&tic->t_wait);
+ spin_unlock(&log->l_grant_write_lock);
if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb..9dc8125 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
init_rwsem(&cil->xc_ctx_lock);
- sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+ init_waitqueue_head(&cil->xc_commit_wait);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
int abort)
{
struct xfs_cil_ctx *ctx = args;
- struct xfs_log_vec *lv;
- int abortflag = abort ? XFS_LI_ABORTED : 0;
struct xfs_busy_extent *busyp, *n;
- /* unpin all the log items */
- for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
- xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
- abortflag);
- }
+ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+ ctx->start_lsn, abort);
list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
* It is still being pushed! Wait for the push to
* complete, then start again from the beginning.
*/
- sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
goto restart;
}
}
@@ -592,7 +587,7 @@ restart:
*/
spin_lock(&cil->xc_cil_lock);
ctx->commit_lsn = commit_lsn;
- sv_broadcast(&cil->xc_commit_wait);
+ wake_up_all(&cil->xc_commit_wait);
spin_unlock(&cil->xc_cil_lock);
/* release the hounds! */
@@ -757,7 +752,7 @@ restart:
* It is still being pushed! Wait for the push to
* complete, then start again from the beginning.
*/
- sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
goto restart;
}
if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe0..d5f8be8 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
struct xfs_buf;
struct log;
struct xlog_ticket;
-struct xfs_buf_cancel;
struct xfs_mount;
/*
@@ -54,7 +53,6 @@ struct xfs_mount;
BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-
static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
{
return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
*/
#define XLOG_TIC_INITED 0x1 /* has been initialized */
#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
-#define XLOG_TIC_IN_Q 0x4
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
- { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \
- { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
+ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
#endif /* __KERNEL__ */
@@ -244,9 +240,8 @@ typedef struct xlog_res {
} xlog_res_t;
typedef struct xlog_ticket {
- sv_t t_wait; /* ticket wait queue : 20 */
- struct xlog_ticket *t_next; /* :4|8 */
- struct xlog_ticket *t_prev; /* :4|8 */
+ wait_queue_head_t t_wait; /* ticket wait queue */
+ struct list_head t_queue; /* reserve/write queue */
xlog_tid_t t_tid; /* transaction identifier : 4 */
atomic_t t_ref; /* ticket reference count : 4 */
int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
* and move everything else out to subsequent cachelines.
*/
typedef struct xlog_in_core {
- sv_t ic_force_wait;
- sv_t ic_write_wait;
+ wait_queue_head_t ic_force_wait;
+ wait_queue_head_t ic_write_wait;
struct xlog_in_core *ic_next;
struct xlog_in_core *ic_prev;
struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
struct xfs_cil_ctx *xc_ctx;
struct rw_semaphore xc_ctx_lock;
struct list_head xc_committing;
- sv_t xc_commit_wait;
+ wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
};
@@ -491,7 +486,7 @@ typedef struct log {
struct xfs_buftarg *l_targ; /* buftarg of log */
uint l_flags;
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
- struct xfs_buf_cancel **l_buf_cancel_table;
+ struct list_head *l_buf_cancel_table;
int l_iclog_hsize; /* size of iclog header */
int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
int l_logBBsize; /* size of log in BB chunks */
/* The following block of fields are changed while holding icloglock */
- sv_t l_flush_wait ____cacheline_aligned_in_smp;
+ wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
/* waiting for iclog flush */
int l_covered_state;/* state of "covering disk
* log entries" */
xlog_in_core_t *l_iclog; /* head log queue */
spinlock_t l_icloglock; /* grab to change iclog state */
- xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
- * buffers */
- xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
int l_curr_cycle; /* Cycle number of log writes */
int l_prev_cycle; /* Cycle number before last
* block increment */
int l_curr_block; /* current logical log block */
int l_prev_block; /* previous logical log block */
- /* The following block of fields are changed while holding grant_lock */
- spinlock_t l_grant_lock ____cacheline_aligned_in_smp;
- xlog_ticket_t *l_reserve_headq;
- xlog_ticket_t *l_write_headq;
- int l_grant_reserve_cycle;
- int l_grant_reserve_bytes;
- int l_grant_write_cycle;
- int l_grant_write_bytes;
+ /*
+ * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
+ * read without needing to hold specific locks. To avoid operations
+ * contending with other hot objects, place each of them on a separate
+ * cacheline.
+ */
+ /* lsn of last LR on disk */
+ atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
+ /* lsn of 1st LR with unflushed * buffers */
+ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
+
+ /*
+ * ticket grant locks, queues and accounting have their own cachlines
+ * as these are quite hot and can be operated on concurrently.
+ */
+ spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
+ struct list_head l_reserveq;
+ atomic64_t l_grant_reserve_head;
+
+ spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
+ struct list_head l_writeq;
+ atomic64_t l_grant_write_head;
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
} xlog_t;
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+ ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
xlog_in_core_t **commit_iclog, uint flags);
/*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to smaple and crack LSNs taht are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+ xfs_lsn_t val = atomic64_read(lsn);
+
+ *cycle = CYCLE_LSN(val);
+ *block = BLOCK_LSN(val);
+}
+
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+ atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+ *cycle = val >> 32;
+ *space = val & 0xffffffff;
+}
+
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+ xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+ return ((int64_t)cycle << 32) | space;
+}
+
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+ atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
+
+/*
* Committed Item List interfaces
*/
int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
*/
#define XLOG_UNMOUNT_REC_TYPE (-1U)
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(wq, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+}
#endif /* __KERNEL__ */
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f9..204d8e5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
#endif
/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+ xfs_daddr_t bc_blkno;
+ uint bc_len;
+ int bc_refcount;
+ struct list_head bc_list;
+};
+
+/*
* Sector aligned buffer routines for buffer create/read/write/access
*/
@@ -925,12 +936,12 @@ xlog_find_tail(
log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
if (found == 2)
log->l_curr_cycle++;
- log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
- log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
- log->l_grant_reserve_cycle = log->l_curr_cycle;
- log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
- log->l_grant_write_cycle = log->l_curr_cycle;
- log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
/*
* Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
}
after_umount_blk = (i + hblks + (int)
BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
- tail_lsn = log->l_tail_lsn;
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
* log records will point recovery to after the
* current unmount record.
*/
- log->l_tail_lsn =
- xlog_assign_lsn(log->l_curr_cycle,
- after_umount_blk);
- log->l_last_sync_lsn =
- xlog_assign_lsn(log->l_curr_cycle,
- after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_tail_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+ log->l_curr_cycle, after_umount_blk);
*tail_blk = after_umount_blk;
/*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
* record in the table to tell us how many times we expect to see this
* record during the second pass.
*/
-STATIC void
-xlog_recover_do_buffer_pass1(
- xlog_t *log,
- xfs_buf_log_format_t *buf_f)
+STATIC int
+xlog_recover_buffer_pass1(
+ struct log *log,
+ xlog_recover_item_t *item)
{
- xfs_buf_cancel_t *bcp;
- xfs_buf_cancel_t *nextp;
- xfs_buf_cancel_t *prevp;
- xfs_buf_cancel_t **bucket;
- xfs_daddr_t blkno = 0;
- uint len = 0;
- ushort flags = 0;
-
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- blkno = buf_f->blf_blkno;
- len = buf_f->blf_len;
- flags = buf_f->blf_flags;
- break;
- }
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
/*
* If this isn't a cancel buffer item, then just return.
*/
- if (!(flags & XFS_BLF_CANCEL)) {
+ if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
trace_xfs_log_recover_buf_not_cancel(log, buf_f);
- return;
- }
-
- /*
- * Insert an xfs_buf_cancel record into the hash table of
- * them. If there is already an identical record, bump
- * its reference count.
- */
- bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
- XLOG_BC_TABLE_SIZE];
- /*
- * If the hash bucket is empty then just insert a new record into
- * the bucket.
- */
- if (*bucket == NULL) {
- bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
- KM_SLEEP);
- bcp->bc_blkno = blkno;
- bcp->bc_len = len;
- bcp->bc_refcount = 1;
- bcp->bc_next = NULL;
- *bucket = bcp;
- return;
+ return 0;
}
/*
- * The hash bucket is not empty, so search for duplicates of our
- * record. If we find one them just bump its refcount. If not
- * then add us at the end of the list.
+ * Insert an xfs_buf_cancel record into the hash table of them.
+ * If there is already an identical record, bump its reference count.
*/
- prevp = NULL;
- nextp = *bucket;
- while (nextp != NULL) {
- if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
- nextp->bc_refcount++;
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == buf_f->blf_blkno &&
+ bcp->bc_len == buf_f->blf_len) {
+ bcp->bc_refcount++;
trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
- return;
+ return 0;
}
- prevp = nextp;
- nextp = nextp->bc_next;
- }
- ASSERT(prevp != NULL);
- bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
- KM_SLEEP);
- bcp->bc_blkno = blkno;
- bcp->bc_len = len;
+ }
+
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+ bcp->bc_blkno = buf_f->blf_blkno;
+ bcp->bc_len = buf_f->blf_len;
bcp->bc_refcount = 1;
- bcp->bc_next = NULL;
- prevp->bc_next = bcp;
+ list_add_tail(&bcp->bc_list, bucket);
+
trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+ return 0;
}
/*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
*/
STATIC int
xlog_check_buffer_cancelled(
- xlog_t *log,
+ struct log *log,
xfs_daddr_t blkno,
uint len,
ushort flags)
{
- xfs_buf_cancel_t *bcp;
- xfs_buf_cancel_t *prevp;
- xfs_buf_cancel_t **bucket;
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
if (log->l_buf_cancel_table == NULL) {
/*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
return 0;
}
- bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
- XLOG_BC_TABLE_SIZE];
- bcp = *bucket;
- if (bcp == NULL) {
- /*
- * There is no corresponding entry in the table built
- * in pass one, so this buffer has not been cancelled.
- */
- ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
- }
-
/*
- * Search for an entry in the buffer cancel table that
- * matches our buffer.
+ * Search for an entry in the cancel table that matches our buffer.
*/
- prevp = NULL;
- while (bcp != NULL) {
- if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
- /*
- * We've go a match, so return 1 so that the
- * recovery of this buffer is cancelled.
- * If this buffer is actually a buffer cancel
- * log item, then decrement the refcount on the
- * one in the table and remove it if this is the
- * last reference.
- */
- if (flags & XFS_BLF_CANCEL) {
- bcp->bc_refcount--;
- if (bcp->bc_refcount == 0) {
- if (prevp == NULL) {
- *bucket = bcp->bc_next;
- } else {
- prevp->bc_next = bcp->bc_next;
- }
- kmem_free(bcp);
- }
- }
- return 1;
- }
- prevp = bcp;
- bcp = bcp->bc_next;
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+ goto found;
}
+
/*
- * We didn't find a corresponding entry in the table, so
- * return 0 so that the buffer is NOT cancelled.
+ * We didn't find a corresponding entry in the table, so return 0 so
+ * that the buffer is NOT cancelled.
*/
ASSERT(!(flags & XFS_BLF_CANCEL));
return 0;
-}
-STATIC int
-xlog_recover_do_buffer_pass2(
- xlog_t *log,
- xfs_buf_log_format_t *buf_f)
-{
- xfs_daddr_t blkno = 0;
- ushort flags = 0;
- uint len = 0;
-
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- blkno = buf_f->blf_blkno;
- flags = buf_f->blf_flags;
- len = buf_f->blf_len;
- break;
+found:
+ /*
+ * We've go a match, so return 1 so that the recovery of this buffer
+ * is cancelled. If this buffer is actually a buffer cancel log
+ * item, then decrement the refcount on the one in the table and
+ * remove it if this is the last reference.
+ */
+ if (flags & XFS_BLF_CANCEL) {
+ if (--bcp->bc_refcount == 0) {
+ list_del(&bcp->bc_list);
+ kmem_free(bcp);
+ }
}
-
- return xlog_check_buffer_cancelled(log, blkno, len, flags);
+ return 1;
}
/*
- * Perform recovery for a buffer full of inodes. In these buffers,
- * the only data which should be recovered is that which corresponds
- * to the di_next_unlinked pointers in the on disk inode structures.
- * The rest of the data for the inodes is always logged through the
- * inodes themselves rather than the inode buffer and is recovered
- * in xlog_recover_do_inode_trans().
+ * Perform recovery for a buffer full of inodes. In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures. The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
*
- * The only time when buffers full of inodes are fully recovered is
- * when the buffer is full of newly allocated inodes. In this case
- * the buffer will not be marked as an inode buffer and so will be
- * sent to xlog_recover_do_reg_buffer() below during recovery.
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes. In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
*/
STATIC int
xlog_recover_do_inode_buffer(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
xlog_recover_item_t *item,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_buf_log_format_t *buf_f)
{
int i;
- int item_index;
- int bit;
- int nbits;
- int reg_buf_offset;
- int reg_buf_bytes;
+ int item_index = 0;
+ int bit = 0;
+ int nbits = 0;
+ int reg_buf_offset = 0;
+ int reg_buf_bytes = 0;
int next_unlinked_offset;
int inodes_per_buf;
xfs_agino_t *logged_nextp;
xfs_agino_t *buffer_nextp;
- unsigned int *data_map = NULL;
- unsigned int map_size = 0;
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- data_map = buf_f->blf_data_map;
- map_size = buf_f->blf_map_size;
- break;
- }
- /*
- * Set the variables corresponding to the current region to
- * 0 so that we'll initialize them on the first pass through
- * the loop.
- */
- reg_buf_offset = 0;
- reg_buf_bytes = 0;
- bit = 0;
- nbits = 0;
- item_index = 0;
inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
* the current di_next_unlinked field.
*/
bit += nbits;
- bit = xfs_next_bit(data_map, map_size, bit);
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
/*
* If there are no more logged regions in the
* buffer, then we're done.
*/
- if (bit == -1) {
+ if (bit == -1)
return 0;
- }
- nbits = xfs_contig_bits(data_map, map_size,
- bit);
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
reg_buf_offset = bit << XFS_BLF_SHIFT;
reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
* di_next_unlinked field, then move on to the next
* di_next_unlinked field.
*/
- if (next_unlinked_offset < reg_buf_offset) {
+ if (next_unlinked_offset < reg_buf_offset)
continue;
- }
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
* given buffer. The bitmap in the buf log format structure indicates
* where to place the logged data.
*/
-/*ARGSUSED*/
STATIC void
xlog_recover_do_reg_buffer(
struct xfs_mount *mp,
xlog_recover_item_t *item,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_buf_log_format_t *buf_f)
{
int i;
int bit;
int nbits;
- unsigned int *data_map = NULL;
- unsigned int map_size = 0;
int error;
trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- data_map = buf_f->blf_data_map;
- map_size = buf_f->blf_map_size;
- break;
- }
bit = 0;
i = 1; /* 0 is the buf format structure */
while (1) {
- bit = xfs_next_bit(data_map, map_size, bit);
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
if (bit == -1)
break;
- nbits = xfs_contig_bits(data_map, map_size, bit);
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
* for more details on the implementation of the table of cancel records.
*/
STATIC int
-xlog_recover_do_buffer_trans(
+xlog_recover_buffer_pass2(
xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+ xlog_recover_item_t *item)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
int error;
- int cancel;
- xfs_daddr_t blkno;
- int len;
- ushort flags;
uint buf_flags;
- if (pass == XLOG_RECOVER_PASS1) {
- /*
- * In this pass we're only looking for buf items
- * with the XFS_BLF_CANCEL bit set.
- */
- xlog_recover_do_buffer_pass1(log, buf_f);
+ /*
+ * In this pass we only want to recover all the buffers which have
+ * not been cancelled and are not cancellation buffers themselves.
+ */
+ if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len, buf_f->blf_flags)) {
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
return 0;
- } else {
- /*
- * In this pass we want to recover all the buffers
- * which have not been cancelled and are not
- * cancellation buffers themselves. The routine
- * we call here will tell us whether or not to
- * continue with the replay of this buffer.
- */
- cancel = xlog_recover_do_buffer_pass2(log, buf_f);
- if (cancel) {
- trace_xfs_log_recover_buf_cancel(log, buf_f);
- return 0;
- }
}
+
trace_xfs_log_recover_buf_recover(log, buf_f);
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- blkno = buf_f->blf_blkno;
- len = buf_f->blf_len;
- flags = buf_f->blf_flags;
- break;
- default:
- xfs_fs_cmn_err(CE_ALERT, log->l_mp,
- "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
- buf_f->blf_type, log->l_mp->m_logname ?
- log->l_mp->m_logname : "internal");
- XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
- XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- mp = log->l_mp;
buf_flags = XBF_LOCK;
- if (!(flags & XFS_BLF_INODE_BUF))
+ if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
buf_flags |= XBF_MAPPED;
- bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
+ bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags);
if (XFS_BUF_ISERROR(bp)) {
- xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
- bp, blkno);
+ xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
+ bp, buf_f->blf_blkno);
error = XFS_BUF_GETERROR(bp);
xfs_buf_relse(bp);
return error;
}
error = 0;
- if (flags & XFS_BLF_INODE_BUF) {
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
- } else if (flags &
+ } else if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
} else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
}
STATIC int
-xlog_recover_do_inode_trans(
+xlog_recover_inode_pass2(
xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+ xlog_recover_item_t *item)
{
xfs_inode_log_format_t *in_f;
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
xfs_dinode_t *dip;
- xfs_ino_t ino;
int len;
xfs_caddr_t src;
xfs_caddr_t dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
xfs_icdinode_t *dicp;
int need_free = 0;
- if (pass == XLOG_RECOVER_PASS1) {
- return 0;
- }
-
if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
in_f = item->ri_buf[0].i_addr;
} else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
if (error)
goto error;
}
- ino = in_f->ilf_ino;
- mp = log->l_mp;
/*
* Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
- dip, bp, ino);
- XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
+ dip, bp, in_f->ilf_ino);
+ XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
- item, ino);
- XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
+ item, in_f->ilf_ino);
+ XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
(dicp->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
- item, dip, bp, ino);
+ item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
goto error;
}
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
(dicp->di_format != XFS_DINODE_FMT_BTREE) &&
(dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
- item, dip, bp, ino);
+ item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
goto error;
}
}
if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
- item, dip, bp, ino,
+ item, dip, bp, in_f->ilf_ino,
dicp->di_nextents + dicp->di_anextents,
dicp->di_nblocks);
error = EFSCORRUPTED;
goto error;
}
if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
- item, dip, bp, ino, dicp->di_forkoff);
+ item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
error = EFSCORRUPTED;
goto error;
}
if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
break;
default:
- xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
+ xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
ASSERT(0);
xfs_buf_relse(bp);
error = EIO;
@@ -2556,18 +2422,11 @@ error:
* of that type.
*/
STATIC int
-xlog_recover_do_quotaoff_trans(
+xlog_recover_quotaoff_pass1(
xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+ xlog_recover_item_t *item)
{
- xfs_qoff_logformat_t *qoff_f;
-
- if (pass == XLOG_RECOVER_PASS2) {
- return (0);
- }
-
- qoff_f = item->ri_buf[0].i_addr;
+ xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
ASSERT(qoff_f);
/*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
* Recover a dquot record
*/
STATIC int
-xlog_recover_do_dquot_trans(
+xlog_recover_dquot_pass2(
xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+ xlog_recover_item_t *item)
{
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
struct xfs_disk_dquot *ddq, *recddq;
int error;
xfs_dq_logformat_t *dq_f;
uint type;
- if (pass == XLOG_RECOVER_PASS1) {
- return 0;
- }
- mp = log->l_mp;
/*
* Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
if ((error = xfs_qm_dqcheck(recddq,
dq_f->qlf_id,
0, XFS_QMOPT_DOWARN,
- "xlog_recover_do_dquot_trans (log copy)"))) {
+ "xlog_recover_dquot_pass2 (log copy)"))) {
return XFS_ERROR(EIO);
}
ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
* minimal initialization then.
*/
if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
- "xlog_recover_do_dquot_trans")) {
+ "xlog_recover_dquot_pass2")) {
xfs_buf_relse(bp);
return XFS_ERROR(EIO);
}
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
* LSN.
*/
STATIC int
-xlog_recover_do_efi_trans(
+xlog_recover_efi_pass2(
xlog_t *log,
xlog_recover_item_t *item,
- xfs_lsn_t lsn,
- int pass)
+ xfs_lsn_t lsn)
{
int error;
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_efi_log_item_t *efip;
xfs_efi_log_format_t *efi_formatp;
- if (pass == XLOG_RECOVER_PASS1) {
- return 0;
- }
-
efi_formatp = item->ri_buf[0].i_addr;
- mp = log->l_mp;
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
&(efip->efi_format)))) {
xfs_efi_item_free(efip);
return error;
}
- efip->efi_next_extent = efi_formatp->efi_nextents;
- efip->efi_flags |= XFS_EFI_COMMITTED;
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
spin_lock(&log->l_ailp->xa_lock);
/*
* xfs_trans_ail_update() drops the AIL lock.
*/
- xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
+ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
return 0;
}
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
* efd format structure. If we find it, we remove the efi from the
* AIL and free it.
*/
-STATIC void
-xlog_recover_do_efd_trans(
+STATIC int
+xlog_recover_efd_pass2(
xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+ xlog_recover_item_t *item)
{
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
- if (pass == XLOG_RECOVER_PASS1) {
- return;
- }
-
efd_formatp = item->ri_buf[0].i_addr;
ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
}
xfs_trans_ail_cursor_done(ailp, &cur);
spin_unlock(&ailp->xa_lock);
-}
-
-/*
- * Perform the transaction
- *
- * If the transaction modifies a buffer or inode, do it now. Otherwise,
- * EFIs and EFDs get queued up by adding entries into the AIL for them.
- */
-STATIC int
-xlog_recover_do_trans(
- xlog_t *log,
- xlog_recover_t *trans,
- int pass)
-{
- int error = 0;
- xlog_recover_item_t *item;
-
- error = xlog_recover_reorder_trans(log, trans, pass);
- if (error)
- return error;
-
- list_for_each_entry(item, &trans->r_itemq, ri_list) {
- trace_xfs_log_recover_item_recover(log, trans, item, pass);
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- error = xlog_recover_do_buffer_trans(log, item, pass);
- break;
- case XFS_LI_INODE:
- error = xlog_recover_do_inode_trans(log, item, pass);
- break;
- case XFS_LI_EFI:
- error = xlog_recover_do_efi_trans(log, item,
- trans->r_lsn, pass);
- break;
- case XFS_LI_EFD:
- xlog_recover_do_efd_trans(log, item, pass);
- error = 0;
- break;
- case XFS_LI_DQUOT:
- error = xlog_recover_do_dquot_trans(log, item, pass);
- break;
- case XFS_LI_QUOTAOFF:
- error = xlog_recover_do_quotaoff_trans(log, item,
- pass);
- break;
- default:
- xlog_warn(
- "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
- ASSERT(0);
- error = XFS_ERROR(EIO);
- break;
- }
-
- if (error)
- return error;
- }
return 0;
}
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
*/
STATIC void
xlog_recover_free_trans(
- xlog_recover_t *trans)
+ struct xlog_recover *trans)
{
xlog_recover_item_t *item, *n;
int i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
}
STATIC int
+xlog_recover_commit_pass1(
+ struct log *log,
+ struct xlog_recover *trans,
+ xlog_recover_item_t *item)
+{
+ trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ return xlog_recover_buffer_pass1(log, item);
+ case XFS_LI_QUOTAOFF:
+ return xlog_recover_quotaoff_pass1(log, item);
+ case XFS_LI_INODE:
+ case XFS_LI_EFI:
+ case XFS_LI_EFD:
+ case XFS_LI_DQUOT:
+ /* nothing to do in pass 1 */
+ return 0;
+ default:
+ xlog_warn(
+ "XFS: invalid item type (%d) xlog_recover_commit_pass1",
+ ITEM_TYPE(item));
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+}
+
+STATIC int
+xlog_recover_commit_pass2(
+ struct log *log,
+ struct xlog_recover *trans,
+ xlog_recover_item_t *item)
+{
+ trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ return xlog_recover_buffer_pass2(log, item);
+ case XFS_LI_INODE:
+ return xlog_recover_inode_pass2(log, item);
+ case XFS_LI_EFI:
+ return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+ case XFS_LI_EFD:
+ return xlog_recover_efd_pass2(log, item);
+ case XFS_LI_DQUOT:
+ return xlog_recover_dquot_pass2(log, item);
+ case XFS_LI_QUOTAOFF:
+ /* nothing to do in pass2 */
+ return 0;
+ default:
+ xlog_warn(
+ "XFS: invalid item type (%d) xlog_recover_commit_pass2",
+ ITEM_TYPE(item));
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+}
+
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now. Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
xlog_recover_commit_trans(
- xlog_t *log,
- xlog_recover_t *trans,
+ struct log *log,
+ struct xlog_recover *trans,
int pass)
{
- int error;
+ int error = 0;
+ xlog_recover_item_t *item;
hlist_del(&trans->r_list);
- if ((error = xlog_recover_do_trans(log, trans, pass)))
+
+ error = xlog_recover_reorder_trans(log, trans, pass);
+ if (error)
return error;
- xlog_recover_free_trans(trans); /* no error */
+
+ list_for_each_entry(item, &trans->r_itemq, ri_list) {
+ if (pass == XLOG_RECOVER_PASS1)
+ error = xlog_recover_commit_pass1(log, trans, item);
+ else
+ error = xlog_recover_commit_pass2(log, trans, item);
+ if (error)
+ return error;
+ }
+
+ xlog_recover_free_trans(trans);
return 0;
}
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
xfs_extent_t *extp;
xfs_fsblock_t startblock_fsb;
- ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+ ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
/*
* First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
extp->ext_len);
}
- efip->efi_flags |= XFS_EFI_RECOVERED;
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
error = xfs_trans_commit(tp, 0);
return error;
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
* Skip EFIs that we've already processed.
*/
efip = (xfs_efi_log_item_t *)lip;
- if (efip->efi_flags & XFS_EFI_RECOVERED) {
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
lip = xfs_trans_ail_cursor_next(ailp, &cur);
continue;
}
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
- int error;
+ int error, i;
ASSERT(head_blk != tail_blk);
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
* First do a pass to find all of the cancelled buf log items.
* Store them in the buf_cancel_table for use in the second pass.
*/
- log->l_buf_cancel_table =
- (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
- sizeof(xfs_buf_cancel_t*),
+ log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+ sizeof(struct list_head),
KM_SLEEP);
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS1);
if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
int i;
for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- ASSERT(log->l_buf_cancel_table[i] == NULL);
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
}
#endif /* DEBUG */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa..d447aef 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -472,7 +472,7 @@ xfs_initialize_perag(
goto out_unwind;
pag->pag_agno = index;
pag->pag_mount = mp;
- rwlock_init(&pag->pag_ici_lock);
+ spin_lock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
spin_lock_init(&pag->pag_buf_lock);
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
}
/*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+ struct xfs_mount *mp)
+{
+ int i;
+
+ for (i = 0; i < XFS_LOWSP_MAX; i++) {
+ __uint64_t space = mp->m_sb.sb_dblocks;
+
+ do_div(space, 100);
+ mp->m_low_space[i] = space * (i + 1);
+ }
+}
+
+
+/*
* Set whether we're using inode alignment.
*/
STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
*/
xfs_set_rw_sizes(mp);
+ /* set the low space thresholds for dynamic preallocation */
+ xfs_set_low_space_thresholds(mp);
+
/*
* Set the inode cluster size.
* This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b49..a62e897 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
xfs_mod_incore_sb(mp, field, delta, rsvd)
#endif
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+ XFS_LOWSP_1_PCNT = 0,
+ XFS_LOWSP_2_PCNT,
+ XFS_LOWSP_3_PCNT,
+ XFS_LOWSP_4_PCNT,
+ XFS_LOWSP_5_PCNT,
+ XFS_LOWSP_MAX,
+};
+
typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
struct shrinker m_inode_shrink; /* inode reclaim shrinker */
+ int64_t m_low_space[XFS_LOWSP_MAX];
+ /* low free space thresholds */
} xfs_mount_t;
/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
+extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+
#endif /* __KERNEL__ */
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b..f80a067 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
* they could be immediately flushed and we'd have to race with the flusher
* trying to pull the item from the AIL as we add it.
*/
-void
+static void
xfs_trans_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
xfs_trans_free(tp);
}
+static inline void
+xfs_log_item_batch_insert(
+ struct xfs_ail *ailp,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t commit_lsn)
+{
+ int i;
+
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+ xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
+
+ for (i = 0; i < nr_items; i++)
+ IOP_UNPIN(log_items[i], 0);
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ */
+void
+xfs_trans_committed_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t commit_lsn,
+ int aborted)
+{
+#define LOG_ITEM_BATCH_SIZE 32
+ struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
+ struct xfs_log_vec *lv;
+ int i = 0;
+
+ /* unpin all the log items */
+ for (lv = log_vector; lv; lv = lv->lv_next ) {
+ struct xfs_log_item *lip = lv->lv_item;
+ xfs_lsn_t item_lsn;
+
+ if (aborted)
+ lip->li_flags |= XFS_LI_ABORTED;
+ item_lsn = IOP_COMMITTED(lip, commit_lsn);
+
+ /* item_lsn of -1 means the item was freed */
+ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+ continue;
+
+ if (item_lsn != commit_lsn) {
+
+ /*
+ * Not a bulk update option due to unusual item_lsn.
+ * Push into AIL immediately, rechecking the lsn once
+ * we have the ail lock. Then unpin the item.
+ */
+ spin_lock(&ailp->xa_lock);
+ if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+ xfs_trans_ail_update(ailp, lip, item_lsn);
+ else
+ spin_unlock(&ailp->xa_lock);
+ IOP_UNPIN(lip, 0);
+ continue;
+ }
+
+ /* Item is a candidate for bulk AIL insert. */
+ log_items[i++] = lv->lv_item;
+ if (i >= LOG_ITEM_BATCH_SIZE) {
+ xfs_log_item_batch_insert(ailp, log_items,
+ LOG_ITEM_BATCH_SIZE, commit_lsn);
+ i = 0;
+ }
+ }
+
+ /* make sure we insert the remainder! */
+ if (i)
+ xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
+}
+
/*
* Called from the trans_commit code when we notice that
* the filesystem is in the middle of a forced shutdown.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b..c2042b7 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
#define XFS_ALLOC_BTREE_REF 2
#define XFS_BMAP_BTREE_REF 2
#define XFS_DIR_BTREE_REF 2
+#define XFS_INO_REF 2
#define XFS_ATTR_BTREE_REF 1
-#define XFS_INO_REF 1
#define XFS_DQUOT_REF 1
#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc90695..c5bbbc4 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
#include "xfs_trans_priv.h"
#include "xfs_error.h"
-STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
+STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
xfs_log_move_tail(ailp->xa_mount, 1);
} /* xfs_trans_unlocked_item */
-
/*
- * Update the position of the item in the AIL with the new
- * lsn. If it is not yet in the AIL, add it. Otherwise, move
- * it to its new position by removing it and re-adding it.
+ * xfs_trans_ail_update - bulk AIL insertion operation.
+ *
+ * @xfs_trans_ail_update takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added. Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
*
- * Wakeup anyone with an lsn less than the item's lsn. If the item
- * we move in the AIL is the minimum one, update the tail lsn in the
- * log manager.
+ * This function takes the AIL lock once to execute the update operations on
+ * all the items in the array, and as such should not be called with the AIL
+ * lock held. As a result, once we have the AIL lock, we need to check each log
+ * item LSN to confirm it needs to be moved forward in the AIL.
*
- * This function must be called with the AIL lock held. The lock
- * is dropped before returning.
+ * To optimise the insert operation, we delete all the items from the AIL in
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held. The lock is dropped
+ * before returning.
*/
void
-xfs_trans_ail_update(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip,
- xfs_lsn_t lsn) __releases(ailp->xa_lock)
+xfs_trans_ail_update_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t lsn) __releases(ailp->xa_lock)
{
- xfs_log_item_t *dlip = NULL;
- xfs_log_item_t *mlip; /* ptr to minimum lip */
+ xfs_log_item_t *mlip;
xfs_lsn_t tail_lsn;
+ int mlip_changed = 0;
+ int i;
+ LIST_HEAD(tmp);
mlip = xfs_ail_min(ailp);
- if (lip->li_flags & XFS_LI_IN_AIL) {
- dlip = xfs_ail_delete(ailp, lip);
- ASSERT(dlip == lip);
- xfs_trans_ail_cursor_clear(ailp, dlip);
- } else {
- lip->li_flags |= XFS_LI_IN_AIL;
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ /* check if we really need to move the item */
+ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
+ continue;
+
+ xfs_ail_delete(ailp, lip);
+ if (mlip == lip)
+ mlip_changed = 1;
+ } else {
+ lip->li_flags |= XFS_LI_IN_AIL;
+ }
+ lip->li_lsn = lsn;
+ list_add(&lip->li_ail, &tmp);
}
- lip->li_lsn = lsn;
- xfs_ail_insert(ailp, lip);
+ xfs_ail_splice(ailp, &tmp, lsn);
- if (mlip == dlip) {
- mlip = xfs_ail_min(ailp);
- /*
- * It is not safe to access mlip after the AIL lock is
- * dropped, so we must get a copy of li_lsn before we do
- * so. This is especially important on 32-bit platforms
- * where accessing and updating 64-bit values like li_lsn
- * is not atomic.
- */
- tail_lsn = mlip->li_lsn;
- spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount, tail_lsn);
- } else {
+ if (!mlip_changed) {
spin_unlock(&ailp->xa_lock);
+ return;
}
-
-} /* xfs_trans_update_ail */
+ /*
+ * It is not safe to access mlip after the AIL lock is dropped, so we
+ * must get a copy of li_lsn before we do so. This is especially
+ * important on 32-bit platforms where accessing and updating 64-bit
+ * values like li_lsn is not atomic.
+ */
+ mlip = xfs_ail_min(ailp);
+ tail_lsn = mlip->li_lsn;
+ spin_unlock(&ailp->xa_lock);
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
/*
- * Delete the given item from the AIL. It must already be in
- * the AIL.
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
*
- * Wakeup anyone with an lsn less than item's lsn. If the item
- * we delete in the AIL is the minimum one, update the tail lsn in the
- * log manager.
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
+ * removed from the AIL. The caller is already holding the AIL lock, and done
+ * all the checks necessary to ensure the items passed in via @log_items are
+ * ready for deletion. This includes checking that the items are in the AIL.
*
- * Clear the IN_AIL flag from the item, reset its lsn to 0, and
- * bump the AIL's generation count to indicate that the tree
- * has changed.
+ * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
+ * flag from the item and reset the item's lsn to 0. If we remove the first
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
*
- * This function must be called with the AIL lock held. The lock
- * is dropped before returning.
+ * This function will not drop the AIL lock until all items are removed from
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held. The lock is dropped
+ * before returning.
*/
void
-xfs_trans_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip) __releases(ailp->xa_lock)
+xfs_trans_ail_delete_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_log_item **log_items,
+ int nr_items) __releases(ailp->xa_lock)
{
- xfs_log_item_t *dlip;
xfs_log_item_t *mlip;
xfs_lsn_t tail_lsn;
+ int mlip_changed = 0;
+ int i;
- if (lip->li_flags & XFS_LI_IN_AIL) {
- mlip = xfs_ail_min(ailp);
- dlip = xfs_ail_delete(ailp, lip);
- ASSERT(dlip == lip);
- xfs_trans_ail_cursor_clear(ailp, dlip);
-
+ mlip = xfs_ail_min(ailp);
- lip->li_flags &= ~XFS_LI_IN_AIL;
- lip->li_lsn = 0;
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ struct xfs_mount *mp = ailp->xa_mount;
- if (mlip == dlip) {
- mlip = xfs_ail_min(ailp);
- /*
- * It is not safe to access mlip after the AIL lock
- * is dropped, so we must get a copy of li_lsn
- * before we do so. This is especially important
- * on 32-bit platforms where accessing and updating
- * 64-bit values like li_lsn is not atomic.
- */
- tail_lsn = mlip ? mlip->li_lsn : 0;
- spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount, tail_lsn);
- } else {
spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ return;
}
+
+ xfs_ail_delete(ailp, lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+ if (mlip == lip)
+ mlip_changed = 1;
}
- else {
- /*
- * If the file system is not being shutdown, we are in
- * serious trouble if we get to this stage.
- */
- struct xfs_mount *mp = ailp->xa_mount;
+ if (!mlip_changed) {
spin_unlock(&ailp->xa_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
- "%s: attempting to delete a log item that is not in the AIL",
- __func__);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- }
+ return;
}
-}
-
+ /*
+ * It is not safe to access mlip after the AIL lock is dropped, so we
+ * must get a copy of li_lsn before we do so. This is especially
+ * important on 32-bit platforms where accessing and updating 64-bit
+ * values like li_lsn is not atomic. It is possible we've emptied the
+ * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
+ */
+ mlip = xfs_ail_min(ailp);
+ tail_lsn = mlip ? mlip->li_lsn : 0;
+ spin_unlock(&ailp->xa_lock);
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
/*
* The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
}
/*
- * Insert the given log item into the AIL.
- * We almost always insert at the end of the list, so on inserts
- * we search from the end of the list to find where the
- * new item belongs.
+ * splice the log item list into the AIL at the given LSN.
*/
STATIC void
-xfs_ail_insert(
+xfs_ail_splice(
struct xfs_ail *ailp,
- xfs_log_item_t *lip)
-/* ARGSUSED */
+ struct list_head *list,
+ xfs_lsn_t lsn)
{
xfs_log_item_t *next_lip;
@@ -640,39 +660,33 @@ xfs_ail_insert(
* If the list is empty, just insert the item.
*/
if (list_empty(&ailp->xa_ail)) {
- list_add(&lip->li_ail, &ailp->xa_ail);
+ list_splice(list, &ailp->xa_ail);
return;
}
list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
- if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+ if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
break;
}
ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
- (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
-
- list_add(&lip->li_ail, &next_lip->li_ail);
+ (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
- xfs_ail_check(ailp, lip);
+ list_splice_init(list, &next_lip->li_ail);
return;
}
/*
* Delete the given item from the AIL. Return a pointer to the item.
*/
-/*ARGSUSED*/
-STATIC xfs_log_item_t *
+STATIC void
xfs_ail_delete(
struct xfs_ail *ailp,
xfs_log_item_t *lip)
-/* ARGSUSED */
{
xfs_ail_check(ailp, lip);
-
list_del(&lip->li_ail);
-
- return lip;
+ xfs_trans_ail_cursor_clear(ailp, lip);
}
/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
STATIC xfs_log_item_t *
xfs_ail_min(
struct xfs_ail *ailp)
-/* ARGSUSED */
{
if (list_empty(&ailp->xa_ail))
return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
xfs_ail_next(
struct xfs_ail *ailp,
xfs_log_item_t *lip)
-/* ARGSUSED */
{
if (lip->li_ail.next == &ailp->xa_ail)
return NULL;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e..f7590f5 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY;
efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
- next_extent = efip->efi_next_extent;
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
ASSERT(next_extent < efip->efi_format.efi_nextents);
extp = &(efip->efi_format.efi_extents[next_extent]);
extp->ext_start = start_block;
extp->ext_len = ext_len;
- efip->efi_next_extent++;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c..35162c2 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
int flags);
-void xfs_trans_item_committed(struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn, int aborted);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+ xfs_lsn_t commit_lsn, int aborted);
/*
* AIL traversal cursor.
*
@@ -73,12 +75,29 @@ struct xfs_ail {
/*
* From xfs_trans_ail.c
*/
-void xfs_trans_ail_update(struct xfs_ail *ailp,
- struct xfs_log_item *lip, xfs_lsn_t lsn)
- __releases(ailp->xa_lock);
-void xfs_trans_ail_delete(struct xfs_ail *ailp,
- struct xfs_log_item *lip)
- __releases(ailp->xa_lock);
+void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+ struct xfs_log_item **log_items, int nr_items,
+ xfs_lsn_t lsn) __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_update(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn) __releases(ailp->xa_lock)
+{
+ xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
+}
+
+void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+ struct xfs_log_item **log_items, int nr_items)
+ __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip) __releases(ailp->xa_lock)
+{
+ xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+}
+
void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_trans_unlocked_item(struct xfs_ail *,
xfs_log_item_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4..d8e6f8c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
}
- if (ip->i_d.di_nlink != 0) {
- if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
- (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
- (!(ip->i_d.di_flags &
- (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+ if (ip->i_d.di_nlink == 0)
+ return 0;
- /*
- * If we can't get the iolock just skip truncating
- * the blocks past EOF because we could deadlock
- * with the mmap_sem otherwise. We'll get another
- * chance to drop them once the last reference to
- * the inode is dropped, so we'll never leak blocks
- * permanently.
- */
- error = xfs_free_eofblocks(mp, ip,
- XFS_FREE_EOF_TRYLOCK);
- if (error)
- return error;
- }
- }
+ if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+ ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+ ip->i_delayed_blks > 0)) &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
+ (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+ /*
+ * If we can't get the iolock just skip truncating the blocks
+ * past EOF because we could deadlock with the mmap_sem
+ * otherwise. We'll get another chance to drop them once the
+ * last reference to the inode is dropped, so we'll never leak
+ * blocks permanently.
+ *
+ * Further, check if the inode is being opened, written and
+ * closed frequently and we have delayed allocation blocks
+ * oustanding (e.g. streaming writes from the NFS server),
+ * truncating the blocks past EOF will cause fragmentation to
+ * occur.
+ *
+ * In this case don't do the truncation, either, but we have to
+ * be careful how we detect this case. Blocks beyond EOF show
+ * up as i_delayed_blks even when the inode is clean, so we
+ * need to truncate them away first before checking for a dirty
+ * release. Hence on the first dirty close we will still remove
+ * the speculative allocation, but after that we will leave it
+ * in place.
+ */
+ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+ return 0;
+
+ error = xfs_free_eofblocks(mp, ip,
+ XFS_FREE_EOF_TRYLOCK);
+ if (error)
+ return error;
+
+ /* delalloc blocks after truncation means it really is dirty */
+ if (ip->i_delayed_blks)
+ xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+ }
return 0;
}
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index a90b389..1c70028 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
extern int ddebug_remove_module(const char *mod_name);
#define dynamic_pr_debug(fmt, ...) do { \
- __label__ do_printk; \
- __label__ out; \
static struct _ddebug descriptor \
__used \
__attribute__((section("__verbose"), aligned(8))) = \
{ KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
_DPRINTK_FLAGS_DEFAULT }; \
- JUMP_LABEL(&descriptor.enabled, do_printk); \
- goto out; \
-do_printk: \
- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
-out: ; \
+ if (unlikely(descriptor.enabled)) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
} while (0)
#define dynamic_dev_dbg(dev, fmt, ...) do { \
- __label__ do_printk; \
- __label__ out; \
static struct _ddebug descriptor \
__used \
__attribute__((section("__verbose"), aligned(8))) = \
{ KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
_DPRINTK_FLAGS_DEFAULT }; \
- JUMP_LABEL(&descriptor.enabled, do_printk); \
- goto out; \
-do_printk: \
- dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
-out: ; \
+ if (unlikely(descriptor.enabled)) \
+ dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
} while (0)
#else
diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h
index 085f041..8e70310 100644
--- a/include/linux/mfd/tmio.h
+++ b/include/linux/mfd/tmio.h
@@ -57,6 +57,10 @@
* is configured in 4-bit mode.
*/
#define TMIO_MMC_BLKSZ_2BYTES (1 << 1)
+/*
+ * Some controllers can support SDIO IRQ signalling.
+ */
+#define TMIO_MMC_SDIO_IRQ (1 << 2)
int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base);
int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base);
@@ -66,6 +70,7 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state);
struct tmio_mmc_dma {
void *chan_priv_tx;
void *chan_priv_rx;
+ int alignment_shift;
};
/*
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
new file mode 100644
index 0000000..16b0261
--- /dev/null
+++ b/include/linux/mmc/dw_mmc.h
@@ -0,0 +1,217 @@
+/*
+ * Synopsys DesignWare Multimedia Card Interface driver
+ * (Based on NXP driver for lpc 31xx)
+ *
+ * Copyright (C) 2009 NXP Semiconductors
+ * Copyright (C) 2009, 2010 Imagination Technologies Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _LINUX_MMC_DW_MMC_H_
+#define _LINUX_MMC_DW_MMC_H_
+
+#define MAX_MCI_SLOTS 2
+
+enum dw_mci_state {
+ STATE_IDLE = 0,
+ STATE_SENDING_CMD,
+ STATE_SENDING_DATA,
+ STATE_DATA_BUSY,
+ STATE_SENDING_STOP,
+ STATE_DATA_ERROR,
+};
+
+enum {
+ EVENT_CMD_COMPLETE = 0,
+ EVENT_XFER_COMPLETE,
+ EVENT_DATA_COMPLETE,
+ EVENT_DATA_ERROR,
+ EVENT_XFER_ERROR
+};
+
+struct mmc_data;
+
+/**
+ * struct dw_mci - MMC controller state shared between all slots
+ * @lock: Spinlock protecting the queue and associated data.
+ * @regs: Pointer to MMIO registers.
+ * @sg: Scatterlist entry currently being processed by PIO code, if any.
+ * @pio_offset: Offset into the current scatterlist entry.
+ * @cur_slot: The slot which is currently using the controller.
+ * @mrq: The request currently being processed on @cur_slot,
+ * or NULL if the controller is idle.
+ * @cmd: The command currently being sent to the card, or NULL.
+ * @data: The data currently being transferred, or NULL if no data
+ * transfer is in progress.
+ * @use_dma: Whether DMA channel is initialized or not.
+ * @sg_dma: Bus address of DMA buffer.
+ * @sg_cpu: Virtual address of DMA buffer.
+ * @dma_ops: Pointer to platform-specific DMA callbacks.
+ * @cmd_status: Snapshot of SR taken upon completion of the current
+ * command. Only valid when EVENT_CMD_COMPLETE is pending.
+ * @data_status: Snapshot of SR taken upon completion of the current
+ * data transfer. Only valid when EVENT_DATA_COMPLETE or
+ * EVENT_DATA_ERROR is pending.
+ * @stop_cmdr: Value to be loaded into CMDR when the stop command is
+ * to be sent.
+ * @dir_status: Direction of current transfer.
+ * @tasklet: Tasklet running the request state machine.
+ * @card_tasklet: Tasklet handling card detect.
+ * @pending_events: Bitmask of events flagged by the interrupt handler
+ * to be processed by the tasklet.
+ * @completed_events: Bitmask of events which the state machine has
+ * processed.
+ * @state: Tasklet state.
+ * @queue: List of slots waiting for access to the controller.
+ * @bus_hz: The rate of @mck in Hz. This forms the basis for MMC bus
+ * rate and timeout calculations.
+ * @current_speed: Configured rate of the controller.
+ * @num_slots: Number of slots available.
+ * @pdev: Platform device associated with the MMC controller.
+ * @pdata: Platform data associated with the MMC controller.
+ * @slot: Slots sharing this MMC controller.
+ * @data_shift: log2 of FIFO item size.
+ * @push_data: Pointer to FIFO push function.
+ * @pull_data: Pointer to FIFO pull function.
+ * @quirks: Set of quirks that apply to specific versions of the IP.
+ *
+ * Locking
+ * =======
+ *
+ * @lock is a softirq-safe spinlock protecting @queue as well as
+ * @cur_slot, @mrq and @state. These must always be updated
+ * at the same time while holding @lock.
+ *
+ * The @mrq field of struct dw_mci_slot is also protected by @lock,
+ * and must always be written at the same time as the slot is added to
+ * @queue.
+ *
+ * @pending_events and @completed_events are accessed using atomic bit
+ * operations, so they don't need any locking.
+ *
+ * None of the fields touched by the interrupt handler need any
+ * locking. However, ordering is important: Before EVENT_DATA_ERROR or
+ * EVENT_DATA_COMPLETE is set in @pending_events, all data-related
+ * interrupts must be disabled and @data_status updated with a
+ * snapshot of SR. Similarly, before EVENT_CMD_COMPLETE is set, the
+ * CMDRDY interupt must be disabled and @cmd_status updated with a
+ * snapshot of SR, and before EVENT_XFER_COMPLETE can be set, the
+ * bytes_xfered field of @data must be written. This is ensured by
+ * using barriers.
+ */
+struct dw_mci {
+ spinlock_t lock;
+ void __iomem *regs;
+
+ struct scatterlist *sg;
+ unsigned int pio_offset;
+
+ struct dw_mci_slot *cur_slot;
+ struct mmc_request *mrq;
+ struct mmc_command *cmd;
+ struct mmc_data *data;
+
+ /* DMA interface members*/
+ int use_dma;
+
+ dma_addr_t sg_dma;
+ void *sg_cpu;
+ struct dw_mci_dma_ops *dma_ops;
+#ifdef CONFIG_MMC_DW_IDMAC
+ unsigned int ring_size;
+#else
+ struct dw_mci_dma_data *dma_data;
+#endif
+ u32 cmd_status;
+ u32 data_status;
+ u32 stop_cmdr;
+ u32 dir_status;
+ struct tasklet_struct tasklet;
+ struct tasklet_struct card_tasklet;
+ unsigned long pending_events;
+ unsigned long completed_events;
+ enum dw_mci_state state;
+ struct list_head queue;
+
+ u32 bus_hz;
+ u32 current_speed;
+ u32 num_slots;
+ struct platform_device *pdev;
+ struct dw_mci_board *pdata;
+ struct dw_mci_slot *slot[MAX_MCI_SLOTS];
+
+ /* FIFO push and pull */
+ int data_shift;
+ void (*push_data)(struct dw_mci *host, void *buf, int cnt);
+ void (*pull_data)(struct dw_mci *host, void *buf, int cnt);
+
+ /* Workaround flags */
+ u32 quirks;
+};
+
+/* DMA ops for Internal/External DMAC interface */
+struct dw_mci_dma_ops {
+ /* DMA Ops */
+ int (*init)(struct dw_mci *host);
+ void (*start)(struct dw_mci *host, unsigned int sg_len);
+ void (*complete)(struct dw_mci *host);
+ void (*stop)(struct dw_mci *host);
+ void (*cleanup)(struct dw_mci *host);
+ void (*exit)(struct dw_mci *host);
+};
+
+/* IP Quirks/flags. */
+/* No special quirks or flags to cater for */
+#define DW_MCI_QUIRK_NONE 0
+/* DTO fix for command transmission with IDMAC configured */
+#define DW_MCI_QUIRK_IDMAC_DTO 1
+/* delay needed between retries on some 2.11a implementations */
+#define DW_MCI_QUIRK_RETRY_DELAY 2
+/* High Speed Capable - Supports HS cards (upto 50MHz) */
+#define DW_MCI_QUIRK_HIGHSPEED 4
+
+
+struct dma_pdata;
+
+struct block_settings {
+ unsigned short max_segs; /* see blk_queue_max_segments */
+ unsigned int max_blk_size; /* maximum size of one mmc block */
+ unsigned int max_blk_count; /* maximum number of blocks in one req*/
+ unsigned int max_req_size; /* maximum number of bytes in one req*/
+ unsigned int max_seg_size; /* see blk_queue_max_segment_size */
+};
+
+/* Board platform data */
+struct dw_mci_board {
+ u32 num_slots;
+
+ u32 quirks; /* Workaround / Quirk flags */
+ unsigned int bus_hz; /* Bus speed */
+
+ /* delay in mS before detecting cards after interrupt */
+ u32 detect_delay_ms;
+
+ int (*init)(u32 slot_id, irq_handler_t , void *);
+ int (*get_ro)(u32 slot_id);
+ int (*get_cd)(u32 slot_id);
+ int (*get_ocr)(u32 slot_id);
+ int (*get_bus_wd)(u32 slot_id);
+ /*
+ * Enable power to selected slot and set voltage to desired level.
+ * Voltage levels are specified using MMC_VDD_xxx defines defined
+ * in linux/mmc/host.h file.
+ */
+ void (*setpower)(u32 slot_id, u32 volt);
+ void (*exit)(u32 slot_id);
+ void (*select_slot)(u32 slot_id);
+
+ struct dw_mci_dma_ops *dma_ops;
+ struct dma_pdata *data;
+ struct block_settings *blk_settings;
+};
+
+#endif /* _LINUX_MMC_DW_MMC_H_ */
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 30f6fad..bcb793e 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -131,6 +131,9 @@ struct mmc_host {
unsigned int f_max;
unsigned int f_init;
u32 ocr_avail;
+ u32 ocr_avail_sdio; /* SDIO-specific OCR */
+ u32 ocr_avail_sd; /* SD-specific OCR */
+ u32 ocr_avail_mmc; /* MMC-specific OCR */
struct notifier_block pm_notify;
#define MMC_VDD_165_195 0x00000080 /* VDD voltage 1.65 - 1.95 */
@@ -169,9 +172,20 @@ struct mmc_host {
#define MMC_CAP_1_2V_DDR (1 << 12) /* can support */
/* DDR mode at 1.2V */
#define MMC_CAP_POWER_OFF_CARD (1 << 13) /* Can power off after boot */
+#define MMC_CAP_BUS_WIDTH_TEST (1 << 14) /* CMD14/CMD19 bus width ok */
mmc_pm_flag_t pm_caps; /* supported pm features */
+#ifdef CONFIG_MMC_CLKGATE
+ int clk_requests; /* internal reference counter */
+ unsigned int clk_delay; /* number of MCI clk hold cycles */
+ bool clk_gated; /* clock gated */
+ struct work_struct clk_gate_work; /* delayed clock gate */
+ unsigned int clk_old; /* old clock value cache */
+ spinlock_t clk_lock; /* lock for clk fields */
+ struct mutex clk_gate_mutex; /* mutex for clock gating */
+#endif
+
/* host specific block data */
unsigned int max_seg_size; /* see blk_queue_max_segment_size */
unsigned short max_segs; /* see blk_queue_max_segments */
@@ -307,5 +321,10 @@ static inline int mmc_card_is_removable(struct mmc_host *host)
return !(host->caps & MMC_CAP_NONREMOVABLE) && mmc_assume_removable;
}
+static inline int mmc_card_is_powered_resumed(struct mmc_host *host)
+{
+ return host->pm_flags & MMC_PM_KEEP_POWER;
+}
+
#endif
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 956fbd87..612301f 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -40,7 +40,9 @@
#define MMC_READ_DAT_UNTIL_STOP 11 /* adtc [31:0] dadr R1 */
#define MMC_STOP_TRANSMISSION 12 /* ac R1b */
#define MMC_SEND_STATUS 13 /* ac [31:16] RCA R1 */
+#define MMC_BUS_TEST_R 14 /* adtc R1 */
#define MMC_GO_INACTIVE_STATE 15 /* ac [31:16] RCA */
+#define MMC_BUS_TEST_W 19 /* adtc R1 */
#define MMC_SPI_READ_OCR 58 /* spi spi_R3 */
#define MMC_SPI_CRC_ON_OFF 59 /* spi [0:0] flag spi_R1 */
diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h
index 1fdc673..83bd9f7 100644
--- a/include/linux/mmc/sdhci.h
+++ b/include/linux/mmc/sdhci.h
@@ -83,6 +83,8 @@ struct sdhci_host {
#define SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 (1<<28)
/* Controller doesn't have HISPD bit field in HI-SPEED SD card */
#define SDHCI_QUIRK_NO_HISPD_BIT (1<<29)
+/* Controller treats ADMA descriptors with length 0000h incorrectly */
+#define SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC (1<<30)
int irq; /* Device IRQ */
void __iomem *ioaddr; /* Mapped address */
@@ -139,6 +141,10 @@ struct sdhci_host {
unsigned int caps; /* Alternative capabilities */
+ unsigned int ocr_avail_sdio; /* OCR bit masks */
+ unsigned int ocr_avail_sd;
+ unsigned int ocr_avail_mmc;
+
unsigned long private[0] ____cacheline_aligned;
};
#endif /* __SDHCI_H */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index cb845c16..ab47732 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -518,6 +518,7 @@
#define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303
#define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304
#define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603
+#define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703
#define PCI_DEVICE_ID_AMD_LANCE 0x2000
#define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001
#define PCI_DEVICE_ID_AMD_SCSI 0x2020
@@ -1650,6 +1651,11 @@
#define PCI_DEVICE_ID_O2_6836 0x6836
#define PCI_DEVICE_ID_O2_6812 0x6872
#define PCI_DEVICE_ID_O2_6933 0x6933
+#define PCI_DEVICE_ID_O2_8120 0x8120
+#define PCI_DEVICE_ID_O2_8220 0x8220
+#define PCI_DEVICE_ID_O2_8221 0x8221
+#define PCI_DEVICE_ID_O2_8320 0x8320
+#define PCI_DEVICE_ID_O2_8321 0x8321
#define PCI_VENDOR_ID_3DFX 0x121a
#define PCI_DEVICE_ID_3DFX_VOODOO 0x0001
@@ -2363,6 +2369,8 @@
#define PCI_DEVICE_ID_JMICRON_JMB38X_SD 0x2381
#define PCI_DEVICE_ID_JMICRON_JMB38X_MMC 0x2382
#define PCI_DEVICE_ID_JMICRON_JMB38X_MS 0x2383
+#define PCI_DEVICE_ID_JMICRON_JMB388_SD 0x2391
+#define PCI_DEVICE_ID_JMICRON_JMB388_ESD 0x2392
#define PCI_VENDOR_ID_KORENIX 0x1982
#define PCI_DEVICE_ID_KORENIX_JETCARDF0 0x1600
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 14dbc83..3c995b4 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -107,12 +107,17 @@ extern int rtc_year_days(unsigned int day, unsigned int month, unsigned int year
extern int rtc_valid_tm(struct rtc_time *tm);
extern int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time);
extern void rtc_time_to_tm(unsigned long time, struct rtc_time *tm);
+ktime_t rtc_tm_to_ktime(struct rtc_time tm);
+struct rtc_time rtc_ktime_to_tm(ktime_t kt);
+
#include <linux/device.h>
#include <linux/seq_file.h>
#include <linux/cdev.h>
#include <linux/poll.h>
#include <linux/mutex.h>
+#include <linux/timerqueue.h>
+#include <linux/workqueue.h>
extern struct class *rtc_class;
@@ -151,7 +156,19 @@ struct rtc_class_ops {
};
#define RTC_DEVICE_NAME_SIZE 20
-struct rtc_task;
+typedef struct rtc_task {
+ void (*func)(void *private_data);
+ void *private_data;
+} rtc_task_t;
+
+
+struct rtc_timer {
+ struct rtc_task task;
+ struct timerqueue_node node;
+ ktime_t period;
+ int enabled;
+};
+
/* flags */
#define RTC_DEV_BUSY 0
@@ -179,16 +196,13 @@ struct rtc_device
spinlock_t irq_task_lock;
int irq_freq;
int max_user_freq;
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
- struct work_struct uie_task;
- struct timer_list uie_timer;
- /* Those fields are protected by rtc->irq_lock */
- unsigned int oldsecs;
- unsigned int uie_irq_active:1;
- unsigned int stop_uie_polling:1;
- unsigned int uie_task_active:1;
- unsigned int uie_timer_active:1;
-#endif
+
+ struct timerqueue_head timerqueue;
+ struct rtc_timer aie_timer;
+ struct rtc_timer uie_rtctimer;
+ struct hrtimer pie_timer; /* sub second exp, so needs hrtimer */
+ int pie_enabled;
+ struct work_struct irqwork;
};
#define to_rtc_device(d) container_of(d, struct rtc_device, dev)
@@ -224,15 +238,22 @@ extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled);
extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc,
unsigned int enabled);
-typedef struct rtc_task {
- void (*func)(void *private_data);
- void *private_data;
-} rtc_task_t;
+void rtc_aie_update_irq(void *private);
+void rtc_uie_update_irq(void *private);
+enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer);
int rtc_register(rtc_task_t *task);
int rtc_unregister(rtc_task_t *task);
int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
+void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer);
+void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer);
+void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data);
+int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer,
+ ktime_t expires, ktime_t period);
+int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer);
+void rtc_timer_do_work(struct work_struct *work);
+
static inline bool is_leap_year(unsigned int year)
{
return (!(year % 4) && (year % 100)) || !(year % 400);
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d3e4f87..c681461 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -32,7 +32,7 @@ struct tracepoint {
int state; /* State. */
void (*regfunc)(void);
void (*unregfunc)(void);
- struct tracepoint_func *funcs;
+ struct tracepoint_func __rcu *funcs;
} __attribute__((aligned(32))); /*
* Aligned on 32 bytes because it is
* globally visible and gcc happily
@@ -326,7 +326,7 @@ do_trace: \
* memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
* __entry->next_pid = next->pid;
* __entry->next_prio = next->prio;
- * )
+ * ),
*
* *
* * Formatted output of a trace record via TP_printk().
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index b0b4eb2..da39b22 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -21,6 +21,16 @@
#undef CREATE_TRACE_POINTS
#include <linux/stringify.h>
+/*
+ * module.h includes tracepoints, and because ftrace.h
+ * pulls in module.h:
+ * trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h ->
+ * linux/ftrace.h -> linux/module.h
+ * we must include module.h here before we play with any of
+ * the TRACE_EVENT() macros, otherwise the tracepoints included
+ * by module.h may break the build.
+ */
+#include <linux/module.h>
#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 75ce9d5..f10293c 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -25,9 +25,7 @@ TRACE_EVENT(kfree_skb,
TP_fast_assign(
__entry->skbaddr = skb;
- if (skb) {
- __entry->protocol = ntohs(skb->protocol);
- }
+ __entry->protocol = ntohs(skb->protocol);
__entry->location = location;
),
diff --git a/kernel/Makefile b/kernel/Makefile
index 33e0a39..5669f71 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 89c7486..f9a45eb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
exit_fs(tsk);
check_stack_usage();
exit_thread();
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ *
+ * because of cgroup mode, must be called before cgroup_exit()
+ */
+ perf_event_exit_task(tsk);
+
cgroup_exit(tsk, 1);
if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_event_exit_task(tsk);
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 11847bf..b782b7a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,6 +38,12 @@
#include <asm/irq_regs.h>
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x1,
+ EVENT_PINNED = 0x2,
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
atomic_t perf_task_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
static atomic64_t perf_event_id;
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
return "pmu";
}
+static inline u64 perf_clock(void)
+{
+ return local_clock();
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
put_ctx(ctx);
}
-static inline u64 perf_clock(void)
-{
- return local_clock();
-}
-
/*
* Update the record of the current time in a context.
*/
@@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
ctx->timestamp = now;
}
+static u64 perf_event_time(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ return ctx ? ctx->time : 0;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
@@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event)
return;
if (ctx->is_active)
- run_end = ctx->time;
+ run_end = perf_event_time(event);
else
run_end = event->tstamp_stopped;
@@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event)
if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
else
- run_end = ctx->time;
+ run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
}
@@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
u64 delta;
/*
* An event which could not be activated because of
@@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event,
&& !event_filter_match(event)) {
delta = ctx->time - event->tstamp_stopped;
event->tstamp_running += delta;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event,
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
@@ -768,6 +787,8 @@ event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -784,9 +805,9 @@ event_sched_in(struct perf_event *event,
return -EAGAIN;
}
- event->tstamp_running += ctx->time - event->tstamp_stopped;
+ event->tstamp_running += tstamp - event->tstamp_stopped;
- event->shadow_ctx_time = ctx->time - ctx->timestamp;
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -898,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = ctx->time;
- event->tstamp_running = ctx->time;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
}
/*
@@ -937,7 +960,7 @@ static void __perf_install_in_context(void *info)
add_event_to_ctx(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
@@ -1042,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled =
- ctx->time - sub->total_time_enabled;
- }
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
}
}
@@ -1082,7 +1104,7 @@ static void __perf_event_enable(void *info)
goto unlock;
__perf_event_mark_enabled(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
@@ -1193,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
return 0;
}
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
@@ -1435,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
if (group_can_go_on(event, cpuctx, 1))
@@ -1467,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1694,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
hwc = &event->hw;
@@ -3893,7 +3909,7 @@ static int perf_event_task_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm || event->attr.mmap ||
@@ -4030,7 +4046,7 @@ static int perf_event_comm_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm)
@@ -4178,7 +4194,7 @@ static int perf_event_mmap_match(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if ((!executable && event->attr.mmap_data) ||
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f3381..761c510 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959..dc53ecb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
-
-
event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
sizeof(*entry), flags, pc);
if (!event)
- return;
+ goto out_drop_count;
entry = ring_buffer_event_data(event);
entry->tgid = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
if (!filter_check_discard(call, entry, buffer, event))
ring_buffer_unlock_commit(buffer, event);
+ out_drop_count:
__this_cpu_dec(user_stack_count);
-
out:
preempt_enable();
}
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index 3094318..b335acb 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query,
else if (!dp->flags)
dt->num_enabled++;
dp->flags = newflags;
- if (newflags) {
- jump_label_enable(&dp->enabled);
- } else {
- jump_label_disable(&dp->enabled);
- }
+ if (newflags)
+ dp->enabled = 1;
+ else
+ dp->enabled = 0;
if (verbose)
printk(KERN_INFO
"ddebug: changed %s:%d [%s]%s %s\n",
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 1b9b13e..2b5387d 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -227,7 +227,7 @@ ifndef PERF_DEBUG
CFLAGS_OPTIMIZE = -O6
endif
-CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
+CFLAGS = -fno-omit-frame-pointer -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
EXTLIBS = -lpthread -lrt -lelf -lm
ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
ALL_LDFLAGS = $(LDFLAGS)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 7bc0490..7069bd3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -331,6 +331,9 @@ try_again:
else if (err == ENODEV && cpu_list) {
die("No such device - did you specify"
" an out-of-range profile CPU?\n");
+ } else if (err == ENOENT) {
+ die("%s event is not supported. ",
+ event_name(evsel));
} else if (err == EINVAL && sample_id_all_avail) {
/*
* Old kernel, no attr->sample_id_type_all field
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 7a4ebeb..abd4b84 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -489,7 +489,8 @@ static void create_tasks(void)
err = pthread_attr_init(&attr);
BUG_ON(err);
- err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
+ err = pthread_attr_setstacksize(&attr,
+ (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
BUG_ON(err);
err = pthread_mutex_lock(&start_work_mutex);
BUG_ON(err);
@@ -1861,7 +1862,7 @@ static int __cmd_record(int argc, const char **argv)
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
- if (rec_argv)
+ if (rec_argv == NULL)
return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(record_args); i++)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 02b2d80..c385a63 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -316,6 +316,8 @@ static int run_perf_stat(int argc __used, const char **argv)
"\t Consider tweaking"
" /proc/sys/kernel/perf_event_paranoid or running as root.",
system_wide ? "system-wide " : "");
+ } else if (errno == ENOENT) {
+ error("%s event is not supported. ", event_name(counter));
} else {
error("open_counter returned with %d (%s). "
"/bin/dmesg may provide additional information.\n",
@@ -683,8 +685,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
nr_counters = ARRAY_SIZE(default_attrs);
for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
- pos = perf_evsel__new(default_attrs[c].type,
- default_attrs[c].config,
+ pos = perf_evsel__new(&default_attrs[c],
nr_counters);
if (pos == NULL)
goto out;
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 1c98434..ed56961 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -234,6 +234,7 @@ out:
return err;
}
+#include "util/cpumap.h"
#include "util/evsel.h"
#include <sys/types.h>
@@ -264,6 +265,7 @@ static int test__open_syscall_event(void)
int err = -1, fd;
struct thread_map *threads;
struct perf_evsel *evsel;
+ struct perf_event_attr attr;
unsigned int nr_open_calls = 111, i;
int id = trace_event__id("sys_enter_open");
@@ -278,7 +280,10 @@ static int test__open_syscall_event(void)
return -1;
}
- evsel = perf_evsel__new(PERF_TYPE_TRACEPOINT, id, 0);
+ memset(&attr, 0, sizeof(attr));
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.config = id;
+ evsel = perf_evsel__new(&attr, 0);
if (evsel == NULL) {
pr_debug("perf_evsel__new\n");
goto out_thread_map_delete;
@@ -317,6 +322,111 @@ out_thread_map_delete:
return err;
}
+#include <sched.h>
+
+static int test__open_syscall_event_on_all_cpus(void)
+{
+ int err = -1, fd, cpu;
+ struct thread_map *threads;
+ struct cpu_map *cpus;
+ struct perf_evsel *evsel;
+ struct perf_event_attr attr;
+ unsigned int nr_open_calls = 111, i;
+ cpu_set_t *cpu_set;
+ size_t cpu_set_size;
+ int id = trace_event__id("sys_enter_open");
+
+ if (id < 0) {
+ pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+ return -1;
+ }
+
+ threads = thread_map__new(-1, getpid());
+ if (threads == NULL) {
+ pr_debug("thread_map__new\n");
+ return -1;
+ }
+
+ cpus = cpu_map__new(NULL);
+ if (threads == NULL) {
+ pr_debug("thread_map__new\n");
+ return -1;
+ }
+
+ cpu_set = CPU_ALLOC(cpus->nr);
+
+ if (cpu_set == NULL)
+ goto out_thread_map_delete;
+
+ cpu_set_size = CPU_ALLOC_SIZE(cpus->nr);
+ CPU_ZERO_S(cpu_set_size, cpu_set);
+
+ memset(&attr, 0, sizeof(attr));
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.config = id;
+ evsel = perf_evsel__new(&attr, 0);
+ if (evsel == NULL) {
+ pr_debug("perf_evsel__new\n");
+ goto out_cpu_free;
+ }
+
+ if (perf_evsel__open(evsel, cpus, threads) < 0) {
+ pr_debug("failed to open counter: %s, "
+ "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+ strerror(errno));
+ goto out_evsel_delete;
+ }
+
+ for (cpu = 0; cpu < cpus->nr; ++cpu) {
+ unsigned int ncalls = nr_open_calls + cpu;
+
+ CPU_SET(cpu, cpu_set);
+ sched_setaffinity(0, cpu_set_size, cpu_set);
+ for (i = 0; i < ncalls; ++i) {
+ fd = open("/etc/passwd", O_RDONLY);
+ close(fd);
+ }
+ CPU_CLR(cpu, cpu_set);
+ }
+
+ /*
+ * Here we need to explicitely preallocate the counts, as if
+ * we use the auto allocation it will allocate just for 1 cpu,
+ * as we start by cpu 0.
+ */
+ if (perf_evsel__alloc_counts(evsel, cpus->nr) < 0) {
+ pr_debug("perf_evsel__alloc_counts(ncpus=%d)\n", cpus->nr);
+ goto out_close_fd;
+ }
+
+ for (cpu = 0; cpu < cpus->nr; ++cpu) {
+ unsigned int expected;
+
+ if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
+ pr_debug("perf_evsel__open_read_on_cpu\n");
+ goto out_close_fd;
+ }
+
+ expected = nr_open_calls + cpu;
+ if (evsel->counts->cpu[cpu].val != expected) {
+ pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %Ld\n",
+ expected, cpu, evsel->counts->cpu[cpu].val);
+ goto out_close_fd;
+ }
+ }
+
+ err = 0;
+out_close_fd:
+ perf_evsel__close_fd(evsel, 1, threads->nr);
+out_evsel_delete:
+ perf_evsel__delete(evsel);
+out_cpu_free:
+ CPU_FREE(cpu_set);
+out_thread_map_delete:
+ thread_map__delete(threads);
+ return err;
+}
+
static struct test {
const char *desc;
int (*func)(void);
@@ -330,6 +440,10 @@ static struct test {
.func = test__open_syscall_event,
},
{
+ .desc = "detect open syscall event on all cpus",
+ .func = test__open_syscall_event_on_all_cpus,
+ },
+ {
.func = NULL,
},
};
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1e67ab9..6ce4042 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1247,6 +1247,8 @@ try_again:
die("Permission error - are you root?\n"
"\t Consider tweaking"
" /proc/sys/kernel/perf_event_paranoid.\n");
+ if (err == ENOENT)
+ die("%s event is not supported. ", event_name(evsel));
/*
* If it's cycles then fall back to hrtimer
* based cpu-clock-tick sw counter, which
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index c95267e..f5cfed6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -6,14 +6,13 @@
#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx)
+struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
{
struct perf_evsel *evsel = zalloc(sizeof(*evsel));
if (evsel != NULL) {
evsel->idx = idx;
- evsel->attr.type = type;
- evsel->attr.config = config;
+ evsel->attr = *attr;
INIT_LIST_HEAD(&evsel->node);
}
@@ -128,59 +127,75 @@ int __perf_evsel__read(struct perf_evsel *evsel,
return 0;
}
-int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
+static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
+ struct thread_map *threads)
{
- int cpu;
+ int cpu, thread;
- if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, cpus->nr, 1) < 0)
+ if (evsel->fd == NULL &&
+ perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
return -1;
for (cpu = 0; cpu < cpus->nr; cpu++) {
- FD(evsel, cpu, 0) = sys_perf_event_open(&evsel->attr, -1,
- cpus->map[cpu], -1, 0);
- if (FD(evsel, cpu, 0) < 0)
- goto out_close;
+ for (thread = 0; thread < threads->nr; thread++) {
+ FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
+ threads->map[thread],
+ cpus->map[cpu], -1, 0);
+ if (FD(evsel, cpu, thread) < 0)
+ goto out_close;
+ }
}
return 0;
out_close:
- while (--cpu >= 0) {
- close(FD(evsel, cpu, 0));
- FD(evsel, cpu, 0) = -1;
- }
+ do {
+ while (--thread >= 0) {
+ close(FD(evsel, cpu, thread));
+ FD(evsel, cpu, thread) = -1;
+ }
+ thread = threads->nr;
+ } while (--cpu >= 0);
return -1;
}
-int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
+static struct {
+ struct cpu_map map;
+ int cpus[1];
+} empty_cpu_map = {
+ .map.nr = 1,
+ .cpus = { -1, },
+};
+
+static struct {
+ struct thread_map map;
+ int threads[1];
+} empty_thread_map = {
+ .map.nr = 1,
+ .threads = { -1, },
+};
+
+int perf_evsel__open(struct perf_evsel *evsel,
+ struct cpu_map *cpus, struct thread_map *threads)
{
- int thread;
-
- if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, 1, threads->nr))
- return -1;
- for (thread = 0; thread < threads->nr; thread++) {
- FD(evsel, 0, thread) = sys_perf_event_open(&evsel->attr,
- threads->map[thread], -1, -1, 0);
- if (FD(evsel, 0, thread) < 0)
- goto out_close;
+ if (cpus == NULL) {
+ /* Work around old compiler warnings about strict aliasing */
+ cpus = &empty_cpu_map.map;
}
- return 0;
+ if (threads == NULL)
+ threads = &empty_thread_map.map;
-out_close:
- while (--thread >= 0) {
- close(FD(evsel, 0, thread));
- FD(evsel, 0, thread) = -1;
- }
- return -1;
+ return __perf_evsel__open(evsel, cpus, threads);
}
-int perf_evsel__open(struct perf_evsel *evsel,
- struct cpu_map *cpus, struct thread_map *threads)
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
{
- if (threads == NULL)
- return perf_evsel__open_per_cpu(evsel, cpus);
+ return __perf_evsel__open(evsel, cpus, &empty_thread_map.map);
+}
- return perf_evsel__open_per_thread(evsel, threads);
+int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
+{
+ return __perf_evsel__open(evsel, &empty_cpu_map.map, threads);
}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index a0ccd69..b2d755f 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -37,7 +37,7 @@ struct perf_evsel {
struct cpu_map;
struct thread_map;
-struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx);
+struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
void perf_evsel__delete(struct perf_evsel *evsel);
int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 649083f..5cb6f4b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -490,6 +490,31 @@ parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
return EVT_HANDLED_ALL;
}
+static int store_event_type(const char *orgname)
+{
+ char filename[PATH_MAX], *c;
+ FILE *file;
+ int id, n;
+
+ sprintf(filename, "%s/", debugfs_path);
+ strncat(filename, orgname, strlen(orgname));
+ strcat(filename, "/id");
+
+ c = strchr(filename, ':');
+ if (c)
+ *c = '/';
+
+ file = fopen(filename, "r");
+ if (!file)
+ return 0;
+ n = fscanf(file, "%i", &id);
+ fclose(file);
+ if (n < 1) {
+ pr_err("cannot store event ID\n");
+ return -EINVAL;
+ }
+ return perf_header__push_event(id, orgname);
+}
static enum event_result parse_tracepoint_event(const char **strp,
struct perf_event_attr *attr)
@@ -533,9 +558,13 @@ static enum event_result parse_tracepoint_event(const char **strp,
*strp += strlen(sys_name) + evt_length;
return parse_multiple_tracepoint_event(sys_name, evt_name,
flags);
- } else
+ } else {
+ if (store_event_type(evt_name) < 0)
+ return EVT_FAILED;
+
return parse_single_tracepoint_event(sys_name, evt_name,
evt_length, attr, strp);
+ }
}
static enum event_result
@@ -778,41 +807,11 @@ modifier:
return ret;
}
-static int store_event_type(const char *orgname)
-{
- char filename[PATH_MAX], *c;
- FILE *file;
- int id, n;
-
- sprintf(filename, "%s/", debugfs_path);
- strncat(filename, orgname, strlen(orgname));
- strcat(filename, "/id");
-
- c = strchr(filename, ':');
- if (c)
- *c = '/';
-
- file = fopen(filename, "r");
- if (!file)
- return 0;
- n = fscanf(file, "%i", &id);
- fclose(file);
- if (n < 1) {
- pr_err("cannot store event ID\n");
- return -EINVAL;
- }
- return perf_header__push_event(id, orgname);
-}
-
int parse_events(const struct option *opt __used, const char *str, int unset __used)
{
struct perf_event_attr attr;
enum event_result ret;
- if (strchr(str, ':'))
- if (store_event_type(str) < 0)
- return -1;
-
for (;;) {
memset(&attr, 0, sizeof(attr));
ret = parse_event_symbols(&str, &attr);
@@ -824,7 +823,7 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
if (ret != EVT_HANDLED_ALL) {
struct perf_evsel *evsel;
- evsel = perf_evsel__new(attr.type, attr.config,
+ evsel = perf_evsel__new(&attr,
nr_counters);
if (evsel == NULL)
return -1;
@@ -1014,8 +1013,15 @@ void print_events(void)
int perf_evsel_list__create_default(void)
{
- struct perf_evsel *evsel = perf_evsel__new(PERF_TYPE_HARDWARE,
- PERF_COUNT_HW_CPU_CYCLES, 0);
+ struct perf_evsel *evsel;
+ struct perf_event_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.type = PERF_TYPE_HARDWARE;
+ attr.config = PERF_COUNT_HW_CPU_CYCLES;
+
+ evsel = perf_evsel__new(&attr, 0);
+
if (evsel == NULL)
return -ENOMEM;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 6fb4694..313dac2 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1007,7 +1007,7 @@ more:
if (size == 0)
size = 8;
- if (head + event->header.size >= mmap_size) {
+ if (head + event->header.size > mmap_size) {
if (mmaps[map_idx]) {
munmap(mmaps[map_idx], mmap_size);
mmaps[map_idx] = NULL;
OpenPOWER on IntegriCloud